In [1]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader
from torch.optim import AdamW, lr_scheduler
from torch.nn.utils import clip_grad_norm_
from torch import nn

from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import get_linear_schedule_with_warmup

import numpy as np
import random
from tqdm import tqdm
import requests
import os
import pandas as pd

**GPT2 Tokenizer**

In [2]:
# Load the pretrained GPT-2 tokenizer and register the special tokens used in this notebook.
# "<|startoftext|>" and "<|pad|>" are additions to the original vocabulary, which is why
# the model's embedding matrix is resized before training.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", 
                                          bos_token="<|startoftext|>",  # Prepended to every sequence; also the generation prompt
                                          eos_token="<|endoftext|>",    # Appended to every sequence; a regular prediction target
                                          pad_token="<|pad|>")          # Padding only; masked out of the loss via label -100
tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


PreTrainedTokenizer(name_or_path='gpt2', vocab_size=50257, model_max_len=1024, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|pad|>'})

In [3]:
# Create the output directory if it is missing. exist_ok=True replaces the separate
# existence check and removes the gap between checking and creating.
os.makedirs("./gpt2_vocab", exist_ok=True)
# Writes the vocabulary files and returns their paths (displayed as the cell output).
tokenizer.save_vocabulary("./gpt2_vocab")

('./gpt2_vocab/vocab.json', './gpt2_vocab/merges.txt')

In [4]:
# Sanity check: ids, then string forms, of the BOS / EOS / PAD tokens (in that order).
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)

50257 50256 50258
<|startoftext|> <|endoftext|> <|pad|>


In [5]:
# Wrap a sample sentence in the BOS/EOS markers, the same way the lyrics are wrapped for training.
example_sequence = "".join(
    [
        tokenizer.bos_token,
        "Hello. BERT provides contextualized word embeddings.",
        tokenizer.eos_token,
    ]
)
example_sequence

'<|startoftext|>Hello. BERT provides contextualized word embeddings.<|endoftext|>'

In [6]:
# Sequence: a b c d, max_seq_len=8
# Desired tokenization: <BOS> a b c d <EOS> <PAD> <PAD>
# Labels to provide:    <BOS> a b c d <EOS> -100  -100  (left-shifting will be done automatically)
#                                                       ("-100" means ignore training loss from those outputs)
#
# Note: <BOS>/<EOS> show up in the ids because they were concatenated into the string
# beforehand; each appears exactly once in the printed output, so add_special_tokens=True
# does not insert additional copies here.

ids = tokenizer.encode(example_sequence,
                       add_special_tokens=True,
                       padding="max_length",
                       truncation=True,
                       max_length=18)
# Print the raw ids, then the corresponding token strings for comparison.
print(ids, '\n')
print(tokenizer.convert_ids_to_tokens(ids))

[50257, 15496, 13, 347, 17395, 3769, 38356, 1143, 1573, 11525, 67, 654, 13, 50256, 50258, 50258, 50258, 50258] 

['<|startoftext|>', 'Hello', '.', 'ĠB', 'ERT', 'Ġprovides', 'Ġcontextual', 'ized', 'Ġword', 'Ġembed', 'd', 'ings', '.', '<|endoftext|>', '<|pad|>', '<|pad|>', '<|pad|>', '<|pad|>']


**Eminem Lyrics Generator**

In [7]:
# Load the lyrics table: a tab-separated file decoded as Windows-1252 (cp1252).
# The "Lyrics" column is the text used for fine-tuning further down.
df_train = pd.read_csv("./Eminem_Lyrics.csv", sep='\t', encoding="cp1252")
df_train

Unnamed: 0,Album_Name,Song_Name,Lyrics,Album_URL,Views,Release_date,Unnamed: 6
0,Music To Be Murdered By: Side B,Alfred (Intro),"[Intro: Alfred Hitchcock]\nThus far, this albu...",https://genius.com/albums/Eminem/Music-to-be-m...,24.3K,"December 18, 2020",
1,Music To Be Murdered By: Side B,Black Magic,"[Chorus: Skylar Grey & Eminem]\nBlack magic, n...",https://genius.com/albums/Eminem/Music-to-be-m...,180.6K,"December 18, 2020",
2,Music To Be Murdered By: Side B,Alfred’s Theme,"[Verse 1]\nBefore I check the mic (Check, chec...",https://genius.com/albums/Eminem/Music-to-be-m...,285.6K,"December 18, 2020",
3,Music To Be Murdered By: Side B,Tone Deaf,"[Intro]\nYeah, I'm sorry (Huh?)\nWhat did you ...",https://genius.com/albums/Eminem/Music-to-be-m...,210.9K,"December 18, 2020",
4,Music To Be Murdered By: Side B,Book of Rhymes,"[Intro]\nI don't smile, I don't frown, get too...",https://genius.com/albums/Eminem/Music-to-be-m...,193.3K,"December 18, 2020",
...,...,...,...,...,...,...,...
343,Unreleased Songs,Listen To Your Heart,[Chorus: Roxette]\nI know there's something in...,https://genius.com/albums/Eminem/Unreleased-songs,65.5K,,
344,Unreleased Songs,I Get Money (Remix),"[Intro]\nYeah, yeah, I get it\nI run this rap ...",https://genius.com/albums/Eminem/Unreleased-songs,28.5K,,
345,Unreleased Songs,Cut Back,[Verse]\nI cut back on the syllables just a li...,https://genius.com/albums/Eminem/Unreleased-songs,,2007,
346,Unreleased Songs,Hip Hop,[Intro]\nC'mon!\n\n[Verse]\nI still remember t...,https://genius.com/albums/Eminem/Unreleased-songs,,2007,


In [5]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

device

device(type='cuda', index=0)

In [9]:
# Surround every song with the BOS and EOS marker strings.
lyrics_with_markers = tokenizer.bos_token + df_train["Lyrics"] + tokenizer.eos_token
train_songs = lyrics_with_markers.values

# Token count of the longest song, to see how much a fixed max_length will truncate.
required_max = max(len(tokenizer.encode(song)) for song in train_songs)
required_max

Token indices sequence length is longer than the specified maximum sequence length for this model (2105 > 1024). Running this sequence through the model will result in indexing errors


2309

In [10]:
# Encode all songs into one id tensor with rows of exactly 512 tokens: shorter songs are
# padded with <PAD>, longer ones are truncated. Truncation also cuts off the trailing <EOS>
# of long songs (the printed tensor has rows that end in ordinary token ids).
train_input_ids = tokenizer.batch_encode_plus(train_songs,
                                              add_special_tokens=True,
                                              padding="max_length",
                                              truncation=True,
                                              max_length=512,  # To fit in GPU
                                              return_attention_mask=False,
                                              return_tensors="pt")["input_ids"]

# Labels start as an independent copy of the inputs. clone().detach() is the recommended
# way to copy an existing tensor; torch.tensor(existing_tensor) emits a UserWarning.
train_labels = train_input_ids.clone().detach()
train_labels[train_labels == tokenizer.pad_token_id] = -100  # Don't compute loss if predicting <PAD>

  if __name__ == '__main__':


In [11]:
# Show the first song's text (with its BOS/EOS markers) followed by the encoded id tensor.
print(train_songs[0], '\n')
print(train_input_ids)

<|startoftext|>[Intro: Alfred Hitchcock]
Thus far, this album has provided musical accompaniment to make your passing pleasant
Our next number is designed to drown out the sound of shovels
Music to be buried by<|endoftext|> 

tensor([[50257,    58,  5317,  ..., 50258, 50258, 50258],
        [50257,    58,  1925,  ...,   502,   588,   645],
        [50257,    58, 13414,  ...,     6,   329,   502],
        ...,
        [50257,    58, 13414,  ..., 50258, 50258, 50258],
        [50257,    58,  5317,  ..., 50258, 50258, 50258],
        [50257,    58,  5317,  ...,   460,   470,   345]])


In [12]:
# Keep the complete dataset under its own name so `train_dataset` always means
# "the training subset" and never the full data.
full_dataset = TensorDataset(train_input_ids, train_labels)

# 90% / 10% train / validation split.
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Use a dedicated, seeded generator: this split runs before the notebook sets its global
# seeds, so without it the train/validation membership changes on every run.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size],
                                          generator=split_generator)

batch_size = 1  # To fit in GPU

# Only the training loader shuffles; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=6, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=6, pin_memory=True)

**Provided Model**

In [13]:
# Load pretrained GPT-2 with its language-modeling head; attention maps and hidden states
# are switched off in the model outputs.
model = GPT2LMHeadModel.from_pretrained("gpt2",
                                        output_attentions=False,
                                        output_hidden_states=False,
                                        )
# Grow the embedding table to the tokenizer's current size (50259 rows in the printed module).
model.resize_token_embeddings(len(tokenizer))  # Since 2 new tokens for <BOS> and <PAD> were added to vocab

# Move the model to the selected device; as the last expression this also displays the module tree.
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50259, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [14]:
def count_trainable_params(model, print_shapes=False):
    """Return the number of trainable scalar parameters in ``model``.

    Only parameters with ``requires_grad=True`` are counted, so frozen
    layers do not inflate the total.

    Args:
        model: A ``torch.nn.Module``.
        print_shapes: If True, print the name and shape of every trainable
            parameter while counting.

    Returns:
        int: Total number of elements across all trainable parameters.
    """
    total_params = 0
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen parameter: receives no gradient updates
        if print_shapes:
            print(name, param.shape)
        total_params += param.numel()
    return total_params

# Parameter count of the GPT-2 model after the embedding resize (displayed as the cell output).
count_trainable_params(model)

124441344

In [15]:
def fit(model, train_loader, val_loader, alpha, num_epochs, grad_clip=None, num_warmup_steps=0):
    """Fine-tune ``model`` with AdamW and a linear learning-rate schedule with warmup.

    After every epoch, prints the mean training-batch loss and the validation
    loss computed by ``evaluate``.

    Args:
        model: Module whose forward accepts ``input_ids``, ``labels`` and
            ``return_dict=False`` and returns a tuple with the loss first.
        train_loader: Iterable of ``(input_ids, labels)`` training batches;
            must support ``len()`` (used to size the schedule).
        val_loader: Iterable of ``(input_ids, labels)`` batches passed to ``evaluate``.
        alpha: Learning rate handed to AdamW.
        num_epochs: Number of passes over ``train_loader``.
        grad_clip: Maximum gradient norm; clipping is skipped when falsy.
        num_warmup_steps: Number of warmup steps given to the scheduler.

    Returns:
        None. Progress is reported through ``tqdm`` and ``print``.
    """
    optimizer = AdamW(model.parameters(), lr=alpha)
    # The scheduler is stepped once per batch, so its horizon is epochs * batches.
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                                num_training_steps=num_epochs*len(train_loader))

    # Send batches to wherever the model lives, not to a notebook-level global.
    target_device = next(model.parameters()).device

    for e in range(num_epochs):
        model.train()  # evaluate() switches the model to eval mode at the end of each epoch

        batch_losses = []
        for batch_input_ids, batch_labels in tqdm(train_loader):
            batch_input_ids = batch_input_ids.to(target_device)
            batch_labels = batch_labels.to(target_device)

            # Index the loss; unpacking a fixed tuple length breaks when the
            # model returns more or fewer extra outputs.
            outputs = model(input_ids=batch_input_ids, labels=batch_labels, return_dict=False)
            loss = outputs[0]
            loss.backward()
            if grad_clip:
                clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            batch_losses.append(loss.item())

        epoch_loss = np.mean(batch_losses)
        val_loss = evaluate(model, val_loader)
        print(f"Epoch: {e+1}, Train Loss: {epoch_loss}, Val Loss: {val_loss}") 

def evaluate(model, val_loader):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is put in eval mode (and left there) and gradients are disabled
    for the whole pass.

    Args:
        model: Module whose forward accepts ``input_ids``, ``labels`` and
            ``return_dict=False`` and returns a tuple with the loss first.
        val_loader: Iterable of ``(input_ids, labels)`` batches.

    Returns:
        The unweighted mean of the batch losses, as computed by ``np.mean``.
    """
    model.eval()

    # Send batches to wherever the model lives, not to a notebook-level global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    batch_losses = []
    with torch.no_grad():
        for batch_input_ids, batch_labels in val_loader:
            batch_input_ids = batch_input_ids.to(target_device)
            batch_labels = batch_labels.to(target_device)

            # Index the loss; unpacking a fixed tuple length breaks when the
            # model returns more or fewer extra outputs.
            outputs = model(input_ids=batch_input_ids, labels=batch_labels, return_dict=False)
            batch_losses.append(outputs[0].item())

    return np.mean(batch_losses)

**Train**

In [16]:
# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed)

# Fine-tuning hyperparameters passed to fit().
alpha = 2e-5            # learning rate
num_epochs = 5
num_warmup_steps = 100
grad_clip = 1.0         # gradient-norm clipping threshold

In [17]:
%%time
# Run the fine-tuning loop; the progress bars and per-epoch train/validation losses print below.
fit(model, train_loader, val_loader, alpha, num_epochs, grad_clip=grad_clip, num_warmup_steps=num_warmup_steps)

100%|██████████| 313/313 [00:55<00:00,  5.67it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 11.900276656348865, Val Loss: 3.3759586538587296


100%|██████████| 313/313 [00:55<00:00,  5.63it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 2, Train Loss: 3.2542803835945007, Val Loss: 3.3808536938258578


100%|██████████| 313/313 [00:55<00:00,  5.61it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 3, Train Loss: 3.123900361335316, Val Loss: 3.2485815184456963


100%|██████████| 313/313 [00:55<00:00,  5.60it/s]
  0%|          | 0/313 [00:00<?, ?it/s]

Epoch: 4, Train Loss: 3.0424137237353825, Val Loss: 3.2012113434927802


100%|██████████| 313/313 [00:55<00:00,  5.59it/s]


Epoch: 5, Train Loss: 2.9984559711937706, Val Loss: 3.2359492574419293
CPU times: user 4min 25s, sys: 22.6 s, total: 4min 48s
Wall time: 4min 46s


**Save parameters**

In [9]:
# Persist the fine-tuned weights and the tokenizer files side by side in ./model so that
# both can be reloaded from the same directory with from_pretrained.
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.json',
 './model/merges.txt',
 './model/added_tokens.json')

**Test**

In [10]:
# Reload the saved tokenizer and fine-tuned weights from disk and switch to eval mode.
tokenizer = GPT2Tokenizer.from_pretrained("./model/")
model = GPT2LMHeadModel.from_pretrained("./model/").to(device)
model.eval()

# The generation prompt is the BOS token alone, encoded as a batch of one.
prompt = tokenizer.bos_token
encoded_prompt = tokenizer.batch_encode_plus([prompt], return_attention_mask=False, return_tensors="pt")
prompt_id = encoded_prompt["input_ids"].to(device)

In [54]:
# https://huggingface.co/blog/how-to-generate

# Sample one sequence from the BOS prompt (do_sample=True, max_length=512), using the
# top_k / top_p settings discussed in the post linked above.
# NOTE(review): no seed is set in this cell, so each run presumably produces different lyrics.
generated = model.generate(input_ids=prompt_id,
                           max_length=512,
                           do_sample=True,
                           top_k=50,
                           top_p=0.95,
                           pad_token_id=tokenizer.pad_token_id,
                           bos_token_id=tokenizer.bos_token_id,
                           eos_token_id=tokenizer.eos_token_id,
                           num_return_sequences=1
                       )

# Decode the generated ids back to text; special tokens are kept, so the printed song
# starts with the BOS marker.
for song in tokenizer.batch_decode(generated):
    print(song, "\n===========================\n")

<|startoftext|> [Intro]
You wanna have a little break? Come on, do that.
And if you can't, then that means I need to do a little math for ya.
[Verse 1]
I try to play on this notion that something is amiss
By taking my daughter into a new world and having her do what I normally do and pretend as though she's smarter than I am
For once she learns what I do and I can just shut her out.
And that's when I know I can turn this crazy thing on again with an end of my career
So I kick these old wounds away now and hope that their legacy is as strong as yours
And, 'cause if they ever do come back, I'm gonna be a fool to think they will
Not just throw a lot of cash at me, but start all over again
To try and give back to this country, this neighborhood, and every single person that's touched it
And see if they still cherish it, this past summer

So that I can go again with what I always thought would be a fun adventure
To try and build a new life in this old house
With this place that I made last 