<a href="https://colab.research.google.com/github/LukasEder1/DeepLearning/blob/main/project/generate_lyrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
!unzip "/content/gdrive/MyDrive/music.zip"

Archive:  /content/gdrive/MyDrive/music.zip
  inflating: artists-data.csv        
  inflating: lyrics-data.csv         


In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.0


In [4]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import os

# GPT2 with Fine Tuning

### Prepare data

In [5]:
lyrics = pd.read_csv('/content/lyrics-data.csv')


In [6]:
lyrics = lyrics[lyrics['language'] == 'en']

In [7]:
artists = pd.read_csv('/content/artists-data.csv')


In [8]:
artists.dropna(inplace=True)

In [9]:
def contains_genre(series, genres):
  indices = []
  for i in series.index:
    contained_genres = series[i].split(";")
    
    for genre in contained_genres:
      if genre in genres:
        indices.append(i)
        break
  return indices  


In [10]:
indices = contains_genre(artists["Genres"].copy(), ["Rap", "Hip Hop"])

artists = artists.loc[indices]

In [11]:
artists = artists[(artists['Popularity']) > 6]

In [12]:
df = lyrics.merge(artists[['Artist', 'Genres', 'Link']], left_on='ALink', right_on='Link', how='inner')

In [13]:
df

Unnamed: 0,ALink,SName,SLink,Lyric,language,Artist,Genres,Link
0,/50-cent/,In da Club,/50-cent/in-da-club.html,"Go, go, go, go\nGo, go, go shawty\nIt's your b...",en,50 Cent,Hip Hop; Rap; Black Music,/50-cent/
1,/50-cent/,21 Questions,/50-cent/21-questions.html,(50 Cent)\nNew York City!\nYou are now rapping...,en,50 Cent,Hip Hop; Rap; Black Music,/50-cent/
2,/50-cent/,P.I.M.P.,/50-cent/p-i-m-p.html,[Chorus]\nI don't know what you heard about me...,en,50 Cent,Hip Hop; Rap; Black Music,/50-cent/
3,/50-cent/,Many Men (Wish Death),/50-cent/many-men-wish-death.html,[Lloyd Banks]\nMan we gotta go get something t...,en,50 Cent,Hip Hop; Rap; Black Music,/50-cent/
4,/50-cent/,Candy Shop,/50-cent/candy-shop.html,Yeah...\nUh huh\nSo seductive\n\nI'll take you...,en,50 Cent,Hip Hop; Rap; Black Music,/50-cent/
...,...,...,...,...,...,...,...,...
4894,/busta-rhymes/,You Can't Hold The Torch,/busta-rhymes/you-cant-hold-the-torch.html,"(feat. Chauncey Black, Q-Tip)\n\n[Busta Rhymes...",en,Busta Rhymes,Rap; Hip Hop,/busta-rhymes/
4895,/busta-rhymes/,You Will Never Find Another Me (Feat. Mary J. ...,/busta-rhymes/you-will-never-find-another-me-f...,"Ooh-ooh, yeah\nOh, yeah\n\nFeels like you tatt...",en,Busta Rhymes,Rap; Hip Hop,/busta-rhymes/
4896,/busta-rhymes/,"You Won't Tell, I Won't Tell",/busta-rhymes/you-wont-tell-i-wont-tell.html,"[Greg Nice] Hella hella hella 1997 style, hey ...",en,Busta Rhymes,Rap; Hip Hop,/busta-rhymes/
4897,/busta-rhymes/,"You're A Mean One, Mr. Grinch",/busta-rhymes/youre-a-mean-one-mr-grinch.html,"You're a mean one, Mr. Grinch\nYou really are ...",en,Busta Rhymes,Rap; Hip Hop,/busta-rhymes/


In [14]:
df = df.drop(columns=['ALink','SLink','Link', 'language'])

In [15]:
#Drop the songs with lyrics too long (after more than 1024 tokens, does not work)
df = df[df['Lyric'].apply(lambda x: len(x.split(' ')) < 350)]

In [16]:
df

Unnamed: 0,SName,Lyric,Artist,Genres
12,Power Powder Respect (Ft. Lil Durk & Jeremih),"Tougher than a Teflon, all that dope I stepped...",50 Cent,Hip Hop; Rap; Black Music
13,What Up Gangsta?,G-Unit (What)\nWe in here (What)\nWe can get t...,50 Cent,Hip Hop; Rap; Black Music
15,Die At An Early Age,Why do we die at an early age? Our street is f...,50 Cent,Hip Hop; Rap; Black Music
19,Crazy,[Chorus]\nIf you think you can fuck wit 50 you...,50 Cent,Hip Hop; Rap; Black Music
27,24s,[50 cent]\nYeah buck.. lets take it down south...,50 Cent,Hip Hop; Rap; Black Music
...,...,...,...,...
4857,True Indeed,"Busta Rhymes up in the place-uh, true indeed\n...",Busta Rhymes,Rap; Hip Hop
4860,Understanding the Inner Mind's Eye,Some people don't understand(x4)\n\nThe inner ...,Busta Rhymes,Rap; Hip Hop
4872,We Want In,[Chorus: Ron Browz]\nIf two dollars gettin mad...,Busta Rhymes,Rap; Hip Hop
4877,What My Niggas Want,"[Cam'ron]\nYou the type to say I rap, I rhyme,...",Busta Rhymes,Rap; Hip Hop


In [17]:
#Create a very small test set to compare generated text with the reality
test_set = df.sample(n = 500)
df = df.loc[~df.index.isin(test_set.index)]

#Reset the indexes
test_set = test_set.reset_index()
df = df.reset_index()

In [18]:
#For the test set only, keep last 20 words in a new column, then remove them from original column
test_set['True_end_lyrics'] = test_set['Lyric'].str.split().str[-20:].apply(' '.join)
test_set['Lyric'] = test_set['Lyric'].str.split().str[:-20].apply(' '.join)

In [26]:
test_set.head()

Unnamed: 0,index,SName,Lyric,Artist,Genres,True_end_lyrics
0,601,You're my light,You're still captivating even though it's been...,Justin Timberlake,Hip Hop; Dance; Pop,to just give up How am I supposed to carry on ...
1,4836,The Don & The Boss (with Vybz Kartel),"Wah gwan, baby? Wah gwan, baby? Wah gwan, baby...",Busta Rhymes,Rap; Hip Hop,Best playlist fi di don Bumpa so big Like Mega...
2,2255,Pac's Theme,I was raised in this society so there's no way...,Tupac Shakur,Hip Hop; Rap; Black Music,record) - Dan Quayle That's how I feel I'm a d...
3,4076,With You,[Chorus: Partynextdoor] It's about us right no...,Drake,Rap; Hip Hop,"us right now, girl, where you going? It's abou..."
4,3256,Too Little Too Late,How many times did I ask 'why are you with him...,Ne-yo,Hip Hop; Black Music; R&B,"take you from me? The choices we make, to righ..."


### Prepare the dataset

In [20]:
class SongLyrics(Dataset):
    
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        for row in df['Lyric']:
          self.lyrics.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
                
        if truncate:
            self.lyrics = self.lyrics[:20000]
        self.lyrics_count = len(self.lyrics)
        
    def __len__(self):
        return self.lyrics_count

    def __getitem__(self, item):
        return self.lyrics[item]

In [21]:
dataset = SongLyrics(df['Lyric'], truncate=True, gpt2_type="gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

### Prepare training

In [22]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [23]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

In [30]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2",
):

    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None

    return model

### Actual Training

In [31]:
#Train the model on the specific data we have
model = train(dataset, model, tokenizer, epochs=5)



Training epoch 0
0


239it [00:24,  9.86it/s]


KeyboardInterrupt: ignored

In [78]:
#Save the model to a pkl or something so it can be reused later on
torch.save(model, '/content/gdrive/MyDrive/models/model.pt')

### Text generation

In [28]:
#Load the model to use it
model = torch.load('/content/gdrive/MyDrive/models/model.pt')

In [29]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=1.,
):

    model.eval()

    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False

            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

In [76]:
#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data, n):
  generated_lyrics = []
  for i in range(n):
    x = generate(model.cpu(), tokenizer, test_data['Lyric'][i], entry_count=1)
    generated_lyrics.append(x)
  return generated_lyrics

In [87]:
def finish_lyrics(lyrics, entry_length=50, top_p=0.8, temperature=1):
  return generate(model.cpu(), tokenizer, lyrics, entry_count=1, entry_length=entry_length, top_p=top_p, temperature=temperature)

In [96]:
print(finish_lyrics("The greatest glory in living lies not in never falling, but in rising every time we fall")[0])

100%|██████████| 1/1 [00:14<00:00, 14.79s/it]

The greatest glory in living lies not in never falling, but in rising every time we fall, which is why I give it to you this time, that you might be blessed with the very things you have not been able to have.

Let me not just allow your life to be in vain; let me make it really dear to<|endoftext|>





In [77]:
generated_lyrics = text_generation(test_set, 50)

100%|██████████| 1/1 [00:30<00:00, 30.97s/it]
100%|██████████| 1/1 [01:09<00:00, 69.62s/it]
100%|██████████| 1/1 [00:30<00:00, 30.73s/it]
100%|██████████| 1/1 [00:44<00:00, 44.92s/it]
100%|██████████| 1/1 [00:55<00:00, 55.70s/it]
100%|██████████| 1/1 [00:48<00:00, 48.35s/it]
100%|██████████| 1/1 [01:18<00:00, 78.52s/it]
100%|██████████| 1/1 [01:30<00:00, 90.74s/it]
100%|██████████| 1/1 [00:54<00:00, 54.12s/it]
100%|██████████| 1/1 [01:07<00:00, 67.24s/it]
100%|██████████| 1/1 [00:23<00:00, 23.30s/it]
100%|██████████| 1/1 [01:02<00:00, 62.83s/it]
100%|██████████| 1/1 [01:02<00:00, 62.82s/it]
100%|██████████| 1/1 [01:20<00:00, 80.58s/it]
100%|██████████| 1/1 [00:12<00:00, 12.23s/it]
100%|██████████| 1/1 [00:25<00:00, 25.68s/it]
100%|██████████| 1/1 [01:14<00:00, 74.67s/it]
100%|██████████| 1/1 [00:55<00:00, 55.52s/it]
100%|██████████| 1/1 [00:08<00:00,  8.63s/it]
100%|██████████| 1/1 [01:00<00:00, 60.32s/it]
100%|██████████| 1/1 [01:07<00:00, 67.96s/it]
100%|██████████| 1/1 [01:18<00:00,

KeyboardInterrupt: ignored

In [44]:
test_set

Unnamed: 0,index,SName,Lyric,Artist,Genres,True_end_lyrics
0,4623,Stand By Me,When the night has come And the land is dark A...,Fugees,Rap,"by me Darling, darling stand by me Stand by me..."
1,3279,Without U,"Oh, I might've slept for about an hour Before ...",Ne-yo,Hip Hop; Black Music; R&B,Not another day without you Not another day wi...
2,3169,More,"Yeah, she says sometimes that I play to rough ...",Ne-yo,Hip Hop; Black Music; R&B,love youu And when were throughh Make me moore...
3,3707,Time Of My Life,Verse 1 When there come and left for dead I've...,Eminem,Hip Hop; Rap,"time of my life, with you baby i had the time ..."
4,4399,Up,"Run it [verso 1] I dont know whats going on, A...",Wiz Khalifa,Rap; Hip Hop,"up up up up up up up, we go up up up up up up ..."
...,...,...,...,...,...,...
495,4174,Aw Shit,It's about time right? Where's the 'dro? Drama...,Wiz Khalifa,Rap; Hip Hop,"awesome, yeah! Hehehe, 28 grams For ya'll that..."
496,1287,Snoop St. Ide's Promo,* mail any questions about this submission to ...,Snoop Dogg,Hip Hop; Rap; Black Music,deuce deuce I gets lose with my hands on my dr...
497,3812,A Night Off (feat. Lloyd),Spending every moment in the studio I never sa...,Drake,Rap; Hip Hop,wit it I know what to do wit it (Lloyd) Ohh It...
498,2581,Lost In The World,"I'm up in the woods, I'm down on my mind I'm b...",Kanye West,Hip Hop; R&B; Rap,"mind I'm new in the city, and I'm down for the..."


In [73]:
#Loop to keep only generated text and add it as a new column in the dataframe
my_generations=[]

for i in range(len(generated_lyrics)):
  a = test_set['Lyric'][i].split()[-30:] #Get the matching string we want (30 words)
  b = ' '.join(a)
  c = ' '.join(generated_lyrics[i]) #Get all that comes after the matching string
  my_generations.append(c.split(b)[-1])

test_set['Generated_lyrics'] = my_generations + [""] * (len(test_set) -1)
print(my_generations + [""] * (len(test_set) -1))
print(test_set.iloc[0]["Lyric"] + " GENERATED: \n"+ my_generations[0])
#test_set.iloc[0]["Lyric"] + " GENERATED: "+ test_set.iloc[0]["True_end_lyrics"]

[' by me So now, now, stand by me Oh, stand by me Stand by me, stand by me, stand by me\n\nGod bless<|endoftext|>', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 

In [74]:

test_set.head()

Unnamed: 0,index,SName,Lyric,Artist,Genres,True_end_lyrics,Generated_lyrics
0,4623,Stand By Me,When the night has come And the land is dark A...,Fugees,Rap,"by me Darling, darling stand by me Stand by me...","by me So now, now, stand by me Oh, stand by m..."
1,3279,Without U,"Oh, I might've slept for about an hour Before ...",Ne-yo,Hip Hop; Black Music; R&B,Not another day without you Not another day wi...,
2,3169,More,"Yeah, she says sometimes that I play to rough ...",Ne-yo,Hip Hop; Black Music; R&B,love youu And when were throughh Make me moore...,
3,3707,Time Of My Life,Verse 1 When there come and left for dead I've...,Eminem,Hip Hop; Rap,"time of my life, with you baby i had the time ...",
4,4399,Up,"Run it [verso 1] I dont know whats going on, A...",Wiz Khalifa,Rap; Hip Hop,"up up up up up up up, we go up up up up up up ...",


test_set['Generated_lyrics'][0]

In [None]:
test_set['True_end_lyrics'][7]

"the. Woman without pride x 5. You don't see things like I do. You don't see things. Like I do."

### Analyze performance

In [None]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for i in range(len(test_set)):
  reference = test_set['True_end_lyrics'][i]
  candidate = test_set['Generated_lyrics'][i]
  scores.append(sentence_bleu(reference, candidate))

statistics.mean(scores)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.6848624352005677

In [None]:
#Rouge score
from rouge import Rouge
rouge=Rouge()

rouge.get_scores(test_set['Generated_lyrics'], test_set['True_end_lyrics'], avg=True)

{'rouge-1': {'f': 0.33620873608456614,
  'p': 0.3805105543072668,
  'r': 0.33900000000000013},
 'rouge-2': {'f': 0.24573902727265526,
  'p': 0.280178576490597,
  'r': 0.252700228832952},
 'rouge-l': {'f': 0.3756182538370741,
  'p': 0.40754447860807824,
  'r': 0.39803790370276443}}

# GPT2 without any fine Tuning

In [None]:
import transformers
import torch

In [None]:
tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
model = transformers.GPT2LMHeadModel.from_pretrained('gpt2')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




In [None]:
## Making a function that will generate text for us ##
def gen_text(prompt_text, tokenizer, model, n_seqs=1, max_length=374):
  # n_seqs is the number of sequences to generate
  # max_length is the maximum length of the sequence
  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
  # We are encoding the text using the gpt tokenizer. The return tensors are of type "pt"
  # since we are using PyTorch, not tensorflow
  output_sequences = model.generate(
      input_ids=encoded_prompt,
      max_length=max_length+len(encoded_prompt), # The model has to generate something, 
      # so we add the length of the original sequence to max_length
      temperature=1.0,
      top_k=0,
      top_p=0.9,
      repetition_penalty=1.2, # To ensure that we dont get repeated phrases
      do_sample=True,
      num_return_sequences=n_seqs
  ) # We feed the encoded input into the model.
  ## Getting the output ##
  if len(output_sequences.shape) > 2:
    output_sequences.squeeze_() # the _ indicates that the operation will be done in-place
  generated_sequences = []
  for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
    generated_sequence = generated_sequence.tolist()
    text = tokenizer.decode(generated_sequence)
    total_sequence = (
        prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True, )) :]
    )
    generated_sequences.append(total_sequence)
  return generated_sequences

In [None]:
#Generate sequences
gen_text(df['Lyric'][0],tokenizer,model)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['I feel so unsure. As I take your hand and lead to the dance floor. As the music dies, something in your eyes. Calls to mind the silver screen. And all its sad good-byes. I\'m never gonna dance again. Guilty feet have got no rhythm. Though it\'s easy to pretend. I know you are not a fool. Should\'ve known better than to cheat a friend. And waste the chance that I\'ve been given. So I\'m never gonna dance again. The way I danced with you. Time can never mend. The careless whispers of a good friend. To the heart and mind. Ignorance is kind. There\'s no comfort in the truth. Pain is all you\'ll find. I\'m never gonna dance again. Guilty feet have got no rhythm. Though it\'s easy to pretend. I know you are not a fool. Should\'ve known better than to cheat a friend. And waste this chance that I\'ve been given. So I\'m never gonna dance again. The way I danced with you. Never without your love. Tonight the music seems so loud. I wish that we could lose this crowd. Maybe it\'s better this wa

In [None]:
#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
  generated_lyrics = []
  for i in range(len(test_data)):
    x = gen_text(test_data['Lyric'][i], tokenizer, model)
    generated_lyrics.append(x)
  return generated_lyrics

generated_lyrics = text_generation(test_set)

In [None]:
#Loop to keep only generated text and add it as a new column in the dataframe
my_generations=[]

for i in range(len(generated_lyrics)):
  a = test_set['Lyric'][i].split()[-30:] #Get the matching string we want (30 words)
  b = ' '.join(a)
  c = ' '.join(generated_lyrics[i]) #Get all that comes after the matching string
  my_generations.append(c.split(b)[-1])

test_set['Generated_lyrics'] = my_generations

In [None]:
#Finish the sentences when there is a point, remove after that
final=[]

for i in range(len(test_set)):
  to_remove = test_set['Generated_lyrics'][i].split('.')[-1]
  final.append(test_set['Generated_lyrics'][i].replace(to_remove,''))

test_set['Generated_lyrics'] = final
test_set.head()

Unnamed: 0,level_0,index,SName,Lyric,Artist,Genre,True_end_lyrics,Generated_lyrics
0,2946,3317,Do the Clam,(Words & music by Wayne - Weisman - Fuller). H...,Elvis Presley,Rock,Grab your barefoot baby by the hand. Turn and ...,
1,12130,13349,Elevation,"High, higher than the sun. You shoot me from a...",U2,Rock,in the sky. You make me feel like I can fly. S...,on earth.\nI start reading monographs about J...
2,596,640,Professional Torturer,Infatuation. Court well meant. 'Cause I'm the ...,Alanis Morissette,Rock,I renounce my name. Professional torturer. I d...,
3,3733,4116,I Am Yours,I am yours. However distant you may be. There ...,Eric Clapton,Rock,me. Each memory that has left its trace with m...,
4,11961,13175,Bombs Away,The general scratches his belly and thinks. Hi...,The Police,Rock,hard and sweet. A military man would love to m...,straight red hair.


In [None]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import sentence_bleu

scores=[]

for i in range(len(test_set)):
  reference = test_set['True_end_lyrics'][i]
  candidate = test_set['Generated_lyrics'][i]
  scores.append(sentence_bleu(reference, candidate))

statistics.mean(scores)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.4075527115657135

In [None]:
!pip install rouge

Collecting rouge
  Downloading https://files.pythonhosted.org/packages/43/cc/e18e33be20971ff73a056ebdb023476b5a545e744e3fc22acd8c758f1e0d/rouge-1.0.0-py3-none-any.whl
Installing collected packages: rouge
Successfully installed rouge-1.0.0


In [None]:
#Rouge score
from rouge import Rouge
rouge=Rouge()

rouge.get_scores(test_set['Generated_lyrics'], test_set['True_end_lyrics'], avg=True, ignore_empty=True)