In [None]:
# Install the Hugging Face `transformers` package, from which the GPT-2 model and tokenizer are imported below.
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 35.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Work from the project folder on the mounted Drive, so the relative paths used
# later (./lyrics_dataset, ./saved_models, ./sample_songs) resolve against it.
import os 
os.chdir('/content/drive/My Drive/NLP/Project/')

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

# Data

In [None]:
# Load every song, then keep only the ones whose language is English.
lyrics = pd.read_csv('./lyrics_dataset/lyrics-data.csv')
lyrics = lyrics.loc[lyrics['Idiom'] == 'ENGLISH']

# Load the artists and keep only popular rock acts (Genre == Rock, Popularity > 40).
artists = pd.read_csv('./lyrics_dataset/artists-data.csv')
is_rock = artists['Genre'].isin(['Rock'])
is_popular = artists['Popularity'] > 40
artists = artists[is_rock & is_popular]

# Attach artist name and genre to each song through the artist link,
# then discard the link/language columns that are no longer needed.
artist_columns = artists[['Artist', 'Genre', 'Link']]
data = pd.merge(lyrics, artist_columns, how='inner', left_on='ALink', right_on='Link')
data = data.drop(columns=['ALink', 'SLink', 'Idiom', 'Link'])

# Keep only songs with fewer than 350 space-separated words
# (lyrics longer than 1024 tokens do not work with GPT-2).
word_counts = data['Lyric'].apply(lambda text: len(text.split(' ')))
data = data[word_counts < 350]

In [None]:
# Sanity check: (number of songs, number of columns) remaining after the filters above.
print("Shape of the dataset: ", data.shape)

Shape of the dataset:  (893, 4)


### Create a test set

In [None]:
# Hold out 100 songs for evaluation. The fixed random_state makes the split
# reproducible: without it, every run evaluates on a different set of songs.
test_set = data.sample(n=100, random_state=42)
data = data.loc[~data.index.isin(test_set.index)]

#Reset the indexes (the previous row labels are kept in a new 'index' column)
test_set = test_set.reset_index()
data = data.reset_index()

#For the test set only, keep last 20 words in a new column, then remove them from original column
lyric_words = test_set['Lyric'].str.split()
test_set['True_end_lyrics'] = lyric_words.str[-20:].apply(' '.join)
test_set['Lyric'] = lyric_words.str[:-20].apply(' '.join)

### Prep data for GPT2

In [None]:
class SongLyrics(Dataset):
  """Tokenized song lyrics for fine-tuning GPT-2.

  Every song becomes one 1-D tensor of token ids encoding the text
  ``<|{control_code}|>{lyric}<|endoftext|>``, where the lyric is first cut to
  its leading ``max_length`` characters (characters, not tokens).

  Args:
    control_code: short text tag written at the start of every example.
      It must be a plain string: anything else is formatted with ``str()``,
      so passing a pandas Series here would paste the Series' printed
      representation into every training example.
    gpt2_type: name of the pretrained GPT-2 tokenizer to load.
    max_length: maximum number of lyric characters kept per song.
    lyrics: iterable of lyric strings to encode. When omitted, the notebook's
      global ``data['Lyric']`` column is used (the previous behaviour).
  """

  def __init__(self, control_code, gpt2_type="gpt2", max_length=1024, lyrics=None):
    self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
    if lyrics is None:
      lyrics = data['Lyric']

    self.lyrics = [
        torch.tensor(self.tokenizer.encode(
            f"<|{control_code}|>{row[:max_length]}<|endoftext|>"))
        for row in lyrics
    ]
    self.lyrics_count = len(self.lyrics)

  def __len__(self):
    return self.lyrics_count

  def __getitem__(self, item):
    return self.lyrics[item]

# The first argument is the text tag, not the data: the songs go in `lyrics`.
dataset = SongLyrics("lyrics", gpt2_type="gpt2", lyrics=data['Lyric'])

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
#Load the pretrained 'gpt2' tokenizer and language model that will be fine-tuned on the lyrics
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

#Sequence packing: several songs are merged into one longer input sequence
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
  """Try to merge `new_tensor` into the sequence accumulated so far.

  Both tensors are indexed as (batch, sequence); lengths are measured on dim 1.

  Returns a tuple (tensor, packed_ok, leftover):
    * nothing accumulated yet  -> (new_tensor, True, None)
    * combined length exceeds `max_seq_len`
                               -> (packed_tensor, False, new_tensor)
    * otherwise                -> (new_tensor followed by packed_tensor
                                   without its first token, True, None)
  """
  # First song of a pack: it simply becomes the pack.
  if packed_tensor is None:
    return new_tensor, True, None

  combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
  if combined_len <= max_seq_len:
    merged = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
    return merged, True, None

  # No room left: hand back the pack unchanged together with the song that did not fit.
  return packed_tensor, False, new_tensor

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [None]:
def train(dataset, model, tokenizer,batch_size=16, epochs=5, lr=2e-5,
  max_seq_len=400, warmup_steps=200,
  gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
  test_mode=False, device="cpu"):
  """Fine-tune `model` on `dataset` using sequence packing and gradient accumulation.

  Songs are drawn one at a time in shuffled order and merged with `pack_tensor`
  into sequences of at most 768 tokens. Each packed sequence is one
  forward/backward pass; the optimizer and LR scheduler step once every
  `batch_size` passes.

  Args:
    dataset: training dataset; each item is expected to be a 1-D tensor of
      token ids (as produced by `SongLyrics`).
    model: language model called as `model(input_ids, labels=input_ids)`,
      whose first output is the loss.
    batch_size: number of packed sequences accumulated per optimizer step.
    epochs: number of passes over `dataset`.
    lr: learning rate given to AdamW.
    warmup_steps: number of optimizer steps of linear learning-rate warm-up.
    device: device the model and the inputs are moved to.
    tokenizer, max_seq_len, gpt2_type, output_dir, output_prefix, test_mode:
      accepted for compatibility but not used here. In particular the packing
      limit is the fixed value 768, not `max_seq_len`.

  Returns:
    The fine-tuned model (on `device`, in training mode).
  """
  # Honour `device` instead of unconditionally moving the model to CUDA.
  model = model.to(device)
  model.train()

  train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

  optimizer = AdamW(model.parameters(), lr=lr)
  # The schedule needs the real number of optimizer steps: with -1 its linear
  # decay factor is clamped to 0 as soon as warm-up ends, which freezes training.
  # Packing makes the exact count data-dependent, so use the upper bound of one
  # forward pass per song.
  steps_per_epoch = (len(train_dataloader) + batch_size - 1) // batch_size
  total_steps = max(1, epochs * steps_per_epoch)
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

  loss=0
  accumulating_batch_count = 0
  input_tensor = None

  #training loop
  for epoch in range(epochs):
    print("Training Epoch %d ..."%(epoch))
    print("Loss: ", loss)
    for idx, entry in tqdm(enumerate(train_dataloader)):
      (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

      # Keep packing until the sequence is full or the epoch has no more songs.
      if carry_on and idx != len(train_dataloader) - 1:
        continue

      input_tensor = input_tensor.to(device)
      outputs = model(input_tensor, labels=input_tensor)
      loss = outputs[0]
      loss.backward()

      # Count the pass first so the optimizer steps after `batch_size`
      # accumulated passes rather than right after the very first one.
      accumulating_batch_count += 1
      if (accumulating_batch_count % batch_size) == 0:
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

      # Start the next pack with the song that did not fit (None when nothing
      # was left over), so that song is trained on instead of being dropped.
      # A song left over at the end of an epoch opens the next epoch's first pack.
      input_tensor = remainder

  return model

# Train

In [None]:
# Fine-tune GPT-2 on the training lyrics for 5 epochs on the GPU; `model` is rebound to the returned model.
model = train(dataset, model, tokenizer,epochs=5, device='cuda')

Training Epoch 0 ...
Loss:  0


793it [02:15,  5.87it/s]


Training Epoch 1 ...
Loss:  tensor(2.4563, device='cuda:0', grad_fn=<NllLossBackward0>)


793it [02:14,  5.87it/s]


Training Epoch 2 ...
Loss:  tensor(2.7987, device='cuda:0', grad_fn=<NllLossBackward0>)


793it [02:14,  5.91it/s]


Training Epoch 3 ...
Loss:  tensor(1.5270, device='cuda:0', grad_fn=<NllLossBackward0>)


793it [02:15,  5.87it/s]


Training Epoch 4 ...
Loss:  tensor(1.4854, device='cuda:0', grad_fn=<NllLossBackward0>)


793it [02:14,  5.88it/s]


In [None]:
# Serialize the whole fine-tuned model object (not only its weights) into the project folder.
# NOTE(review): this presumably requires ./saved_models to exist already — confirm.
torch.save(model, './saved_models/model_5_epochs.pt')

# Generate Text

In [None]:
def generate(model,tokenizer,prompt,entry_count=10,entry_length=30, top_p=0.8,temperature=1.):
  """Sample continuations of `prompt` using nucleus (top-p) sampling.

  Args:
    model: causal language model; `model(input_ids)[0]` must be the logits,
      indexed as (batch, position, vocabulary).
    tokenizer: object providing `encode(text) -> list of ids` and
      `decode(list of ids) -> text`.
    prompt: text that every generated entry starts from.
    entry_count: number of independent continuations to sample.
    entry_length: maximum number of tokens appended to the prompt per entry.
    top_p: cumulative-probability threshold of the nucleus; the most likely
      token is always kept.
    temperature: divisor applied to the logits (values <= 0 are treated as 1).

  Returns:
    A list of `entry_count` strings, each the decoded prompt plus continuation.
    Entries that did not produce "<|endoftext|>" within `entry_length` tokens
    get "<|endoftext|>" appended to their text.
  """
  model.eval()

  # Build the inputs on the model's own device so generation also works when
  # the model lives on a GPU.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  # Loop-invariant work: encode the prompt and the end-of-text marker once.
  prompt_ids = torch.tensor(tokenizer.encode(prompt), device=device).unsqueeze(0)
  eos_ids = tokenizer.encode("<|endoftext|>")

  generated_list = []
  filter_value = -float("Inf")

  with torch.no_grad():
    for entry_idx in trange(entry_count):
      entry_finished = False
      generated = prompt_ids  # never modified in place: torch.cat returns new tensors

      for i in range(entry_length):
        # Only the logits are needed, so no labels are passed (no loss is computed).
        logits = model(generated)[0]
        logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Mark every token after the point where the cumulative probability
        # exceeds top_p; shifting by one keeps the token that crosses the
        # threshold, and position 0 (the most likely token) is always kept.
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
            ..., :-1
        ].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[:, indices_to_remove] = filter_value

        next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
        generated = torch.cat((generated, next_token), dim=1)

        if next_token.item() in eos_ids:
          entry_finished = True
          break

      # .tolist() works for tensors on any device and for any sequence length.
      output_text = tokenizer.decode(generated[0].tolist())
      if not entry_finished:
        output_text = f"{output_text}<|endoftext|>"
      generated_list.append(output_text)

  return generated_list

In [None]:
#Function to generate multiple sentences
def text_generation(test_data, device='cpu'):
  """Generate one continuation for every prompt in `test_data['Lyric']`.

  Uses the notebook-level `model` and `tokenizer`, moving the model to
  `device` once before generating.

  Args:
    test_data: DataFrame with a 'Lyric' column holding the prompts.
    device: device the model is moved to.

  Returns:
    A list with one element per row, in row order; each element is the
    single-item list returned by `generate(..., entry_count=1)`.
  """
  model_on_device = model.to(device)
  # Iterate over the column values (row order) rather than looking rows up by
  # label, so the function also works when the index is not 0..n-1.
  return [
      generate(model_on_device, tokenizer, prompt, entry_count=1)
      for prompt in test_data['Lyric']
  ]

In [None]:
# Work on an independent copy of the first 20 test songs: adding a column to a
# plain slice of `test_set` triggers pandas' SettingWithCopyWarning.
test_data = test_set[:20].copy()
generated_lyrics = text_generation(test_data, device="cpu")

#Loop to keep only generated text and add it as a new column in the dataframe
my_generations=[]

for prompt, generated in zip(test_data['Lyric'], generated_lyrics):
  prompt_tail = ' '.join(prompt.split()[-30:]) #Get the matching string we want (30 words)
  full_text = ' '.join(generated)
  #Get all that comes after the matching string; an empty prompt has no tail
  #to split on (str.split rejects an empty separator), so keep the whole text
  my_generations.append(full_text.split(prompt_tail)[-1] if prompt_tail else full_text)

test_data['Generated_lyrics'] = my_generations

100%|██████████| 1/1 [00:35<00:00, 35.92s/it]
100%|██████████| 1/1 [00:23<00:00, 23.45s/it]
100%|██████████| 1/1 [00:36<00:00, 36.22s/it]
100%|██████████| 1/1 [00:03<00:00,  3.62s/it]
100%|██████████| 1/1 [00:31<00:00, 31.62s/it]
100%|██████████| 1/1 [01:04<00:00, 64.06s/it]
100%|██████████| 1/1 [00:49<00:00, 49.65s/it]
100%|██████████| 1/1 [00:59<00:00, 59.89s/it]
100%|██████████| 1/1 [00:44<00:00, 44.32s/it]
100%|██████████| 1/1 [00:27<00:00, 27.57s/it]
100%|██████████| 1/1 [00:30<00:00, 30.56s/it]
100%|██████████| 1/1 [00:44<00:00, 44.44s/it]
100%|██████████| 1/1 [00:20<00:00, 20.74s/it]
100%|██████████| 1/1 [01:00<00:00, 60.06s/it]
100%|██████████| 1/1 [00:42<00:00, 42.12s/it]
100%|██████████| 1/1 [00:12<00:00, 12.59s/it]
100%|██████████| 1/1 [00:41<00:00, 41.60s/it]
100%|██████████| 1/1 [00:58<00:00, 58.81s/it]
100%|██████████| 1/1 [00:30<00:00, 30.14s/it]
100%|██████████| 1/1 [00:51<00:00, 51.61s/it]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [None]:
#Finish the sentences when there is a point, remove after that
# Keep each text up to and including its last '.', cutting by position.
# (Removing the tail with str.replace would also delete every earlier
# occurrence of the same text.) A text without any '.' becomes empty.
final = [
    text[:text.rfind('.') + 1]
    for text in test_data['Generated_lyrics']
]

# assign() builds a new DataFrame, so nothing is written into a slice of another frame.
test_data = test_data.assign(Generated_lyrics=final)
test_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,index,SName,Lyric,Artist,Genre,True_end_lyrics,Generated_lyrics
0,946,Michelle,"Michelle, ma belle.. These are words that go t...",The Beatles,Rock,"vont très bien ensemble,. Très bien ensemble.....","vont très bien ensemble,. Très bien ensemble...."
1,990,Set Fire To That Lot!,Rodney Burke: I've got one card here and it's ...,The Beatles,Rock,"Rodney: Set fire to that lot.. Okay, Ringo, th...","""Me and my four girls.. kissing.. and you'll ..."
2,60,Gold,First comes the blessing. Of all that you drea...,Imagine Dragons,Rock,everything. Everything you touch turns to gold...,everything. Everything you touch turns to gol...
3,803,Come Together,Here come old flat top. He come groovin' up sl...,The Beatles,Rock,"together, yeah. Come together, yeah. Come toge...",together.
4,874,I Got to Find My Baby,I'm gonna search this town from door to door. ...,The Beatles,Rock,"gotta find my baby,. I deserve a little light....",'m gonna make her my friend. I'm gonna make he...


# Analyzing the Model's Performance

In [None]:
#Using BLEU score to compare the real sentences with the generated ones
import statistics
from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu

# sentence_bleu expects a list of tokenized references and a tokenized
# hypothesis. Given raw strings it iterates over characters, so the score
# would compare single characters instead of words.
# Smoothing avoids the score collapsing when a short text has no higher-order
# n-gram overlap with its reference.
smoothing = SmoothingFunction().method1

scores = [
    sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoothing)
    for reference, candidate in zip(test_data['True_end_lyrics'], test_data['Generated_lyrics'])
]

statistics.mean(scores)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.6938846833458928

# Save sample songs

In [None]:
# Write every prompt followed by its generated ending to its own text file.
os.makedirs("./sample_songs", exist_ok=True)

song_columns = zip(test_data['SName'], test_data['Artist'],
                   test_data['Lyric'], test_data['Generated_lyrics'])
for i, (song_name, artist, prompt, ending) in enumerate(song_columns):
  # A '/' in a title or artist name would be read as a directory separator.
  file_name = ("%d: %s by %s.txt"%(i, song_name, artist)).replace("/", "-")
  # `with` closes the file even if the write fails; UTF-8 keeps accented
  # characters intact regardless of the platform's default encoding.
  with open("./sample_songs/" + file_name, "w", encoding="utf-8") as song_file:
    song_file.write(prompt + ending)