<a href="https://colab.research.google.com/github/Jorgecardetegit/NLP/blob/main/Lyrics_generator_with_HuggingFace%F0%9F%A4%97.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lyrics Generator

A lyrics generator is designed to produce text that mimics the style and structure of song lyrics. The goal is to create coherent and thematically consistent lyrics, often with rhyme, rhythm, and emotion akin to human-written songs.

# 1. Import libraries and install dependencies

In [1]:
import importlib.util

# Function to check if a library is installed
def is_library_installed(name):
    spec = importlib.util.find_spec(name)
    return spec is not None

# Check if both 'transformers' and 'datasets' are installed
if not is_library_installed('transformers') or not is_library_installed('datasets') or not is_library_installed('sacrebleu'): #or not is_library_installed('evaluate'):
    !pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import cv2
from PIL import Image

from numpy import random

In [14]:
from transformers import GPT2TokenizerFast,create_optimizer,DataCollatorForLanguageModeling,TFGPT2LMHeadModel

from datasets import load_dataset

import evaluate

In [4]:
import tensorflow_datasets as tfds
import tensorflow_probability as tfp
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Dense,Flatten,InputLayer,BatchNormalization,Dropout,Input,LayerNormalization
from tensorflow.keras.losses import BinaryCrossentropy,CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import Accuracy,TopKCategoricalAccuracy, CategoricalAccuracy, SparseCategoricalAccuracy
from tensorflow.keras.optimizers import Adam

In [5]:
import sklearn
from sklearn.metrics import confusion_matrix, roc_curve

In [6]:
import datetime
import pathlib
import io
import os
import re
import string
import time
import gensim.downloader as api

In [7]:
from google.colab import drive
from google.colab import files

drive.mount('/content/drive')

Mounted at /content/drive


# 2. Import dataset

In [8]:
!pip install -q kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json
!kaggle datasets download -d juicobowley/drake-lyrics
!unzip "/content/drake-lyrics.zip" -d "/content/dataset/"


Downloading drake-lyrics.zip to /content
100% 764k/764k [00:00<00:00, 1.61MB/s]
100% 764k/764k [00:00<00:00, 1.61MB/s]
Archive:  /content/drake-lyrics.zip
  inflating: /content/dataset/drake_data.csv  
  inflating: /content/dataset/drake_data.json  
  inflating: /content/dataset/drake_lyrics.txt  


In [30]:
filepath="/content/dataset/drake_data.csv"
dataset = load_dataset('csv', data_files=filepath)

# 3. Basic EDA

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['album', 'lyrics_title', 'lyrics_url', 'lyrics', 'track_views'],
        num_rows: 290
    })
})

In [12]:
dataset['train'][0]

{'album': 'Certified Lover Boy',
 'lyrics_title': 'Certified Lover Boy* Lyrics',
 'lyrics_url': 'https://genius.com/Drake-certified-lover-boy-lyrics',
 'lyrics': "[Verse]\nPut my feelings on ice\nAlways been a gem\nCertified lover boy, somehow still heartless\nHeart is only gettin' colder",
 'track_views': '8.7K'}

# 4. Preprocessing

In [31]:
model_id="gpt2-medium"
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

In [61]:
MAX_LENGTH=256
BATCH_SIZE = 6

def preprocess_function(example):
  try:
    outputs = tokenizer(
        example["lyrics"],
        truncation=True,
        max_length=MAX_LENGTH,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
      if length==MAX_LENGTH:
        input_batch.append(input_ids)
        valid_input_ids=input_ids
    if len(input_batch)!=0:
      for i in range(BATCH_SIZE-len(input_batch)):
        input_batch.append(valid_input_ids)
  except:
    print(example)
    input_batch=[]
  return {"input_ids": input_batch}

In [62]:
tokenized_dataset=dataset.map(
    preprocess_function,remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/290 [00:00<?, ? examples/s]

{'album': 'Thank Me Later', 'lyrics_title': 'Thank Me Later [Booklet] Lyrics', 'lyrics_url': 'https://genius.com/Drake-thank-me-later-booklet-annotated', 'lyrics': None, 'track_views': '6.2K'}
{'album': 'Unreleased Songs', 'lyrics_title': 'Untitled DaBaby Collaboration* (Ft. DaBaby) Lyrics', 'lyrics_url': 'https://genius.com/Drake-untitled-dababy-collaboration-lyrics', 'lyrics': None, 'track_views': '(Unreleased)'}


In [64]:
def filter_out(example):
  if len(example['input_ids'])>=1:
    return example

# Get rid of the samples which have a length of 0.

In [65]:
tokenized_full_dataset=tokenized_dataset.filter(filter_out)

Filter:   0%|          | 0/290 [00:00<?, ? examples/s]

In [66]:
for i in range(270):
  if len(tokenized_full_dataset['train'][i]['input_ids'])>max_batch_len:
    max_batch_len=len(tokenized_full_dataset['train'][i]['input_ids'])
   # print(i,len(tokenized_full_dataset['train'][i]['input_ids']))

In [67]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, return_tensors="tf")

In [68]:
tf_train_dataset = tokenized_full_dataset["train"].to_tf_dataset(
    columns=["input_ids","attention_mask", "labels"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=1,
)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [69]:
def adjust_attention_mask(input):
  return {'input_ids':input['input_ids'],
          'attention_mask':tf.ones([1,BATCH_SIZE,MAX_LENGTH]),
          'labels':input['labels']}

In [70]:
train_dataset=tf_train_dataset.map(adjust_attention_mask)

In [71]:
unbatched_dataset=train_dataset.unbatch()

# 5. Modelling

In [72]:
model = TFGPT2LMHeadModel.from_pretrained(model_id)
model.summary()

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Model: "tfgpt2lm_head_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLay  multiple                  354823168 
 er)                                                             
                                                                 
Total params: 354823168 (1.32 GB)
Trainable params: 354823168 (1.32 GB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# 6. Training

In [73]:
num_train_steps=len(unbatched_dataset)
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
)
model.compile(optimizer=optimizer)

In [74]:
history=model.fit(unbatched_dataset, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# 7. Testing

In [75]:
input_text="true love shouldn't be this complicated"

input_ids = tokenizer(input_text, return_tensors="tf")["input_ids"]

init_time=time.time()
output_greedy = model.generate(input_ids,max_length=256,do_sample=False)
print(tokenizer.decode(output_greedy[0]))
print(time.time()-init_time)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


true love shouldn't be this complicated
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl

[Verse 2]
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I'm just trying to find the right girl, I'm just trying to find the right girl
I

# 8. Save model

In [76]:
model.save("/content/drive/MyDrive/nlp/Lyrics Generator")