## GPT2 

In [None]:
!pip install transformers datasets

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import torch.optim as optim

In [2]:
#let us start by creating a class for configuration
class Config:
    '''
    
    '''
    def __init__(self, vocab_size = 50257, max_seq_length = 128, embed_size = 768, 
                 num_layers = 12, num_heads = 12, dropout = 0.1):
        self.vocab_size = vocab_size
        self.max_seq_length = max_seq_length
        self.embed_size = embed_size
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout = dropout
        

In [33]:
class SelfAttention(nn.Module):
    '''
    this is the mos important part, the heart of the Transformer
    In this implementation of the Transformer, the embedding dimensiionality is kept through the entire
    layers of the architecture. Recall that embed_size is a multiple of num_heads
    '''
    def __init__(self, config):
        super().__init__()
        assert config.embed_size % config.num_heads == 0, 'Embed size not a multiple of num heads'
        self.num_heads = config.num_heads
        self.head_dim = config.embed_size // config.num_heads
        # Get the linear transformations queries, keys, valies, and output FFN
        self.W_q = nn.Linear(config.embed_size, config.embed_size)
        self.W_k = nn.Linear(config.embed_size, config.embed_size)
        self.W_v = nn.Linear(config.embed_size, config.embed_size)
        self.output = nn.Linear(config.embed_size, config.embed_size)
        self.dropout = nn.Dropout(config.dropout)
        # lower triangular mask for causal attention
        self.register_buffer(
            'mask',
            torch.tril(torch.ones(config.max_seq_length, config.max_seq_length)
                      ).view(-1, 1, config.max_seq_length, config.max_seq_length))

    def forward(self, x):
        batch, seq_length, embed_dim = x.size() # e.g. 16, 128, 768
        # recall multihead attention embed_dim % num_head = 0
        Q = self.W_q(x).view(batch, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.W_k(x).view(batch, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.W_v(x).view(batch, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        # After transpose V.shape is (batch, self.num_heads, seq_lengths, head_dim)
        # causal (masked) dot products
        attn = (Q@K.transpose(-2, -1)) / (self.head_dim**5)  
                                      # Q, K.shape is (batch, num_heads, seq_lengths, head_dim)
                                      # K.transpose(-2, -1).shape is batch, num_heads, head_dim, seq_length)
        attn = attn.masked_fill(self.mask[:, :, :seq_length, :seq_length] == 0, float('-inf'))
        attn = F.softmax(attn, dim = -1)
        attn = self.dropout(attn)


        scores = attn @ V #attn.shape is (batch, num_heads, seq_lengths, seq_lengths)
                        #scores will be (batch, num_heads, seq_lengths, head_dim)
        
        scores = scores.transpose(1, 2).contiguous().view(batch, seq_length, embed_dim)
        scores = self.output(scores)
        return self.dropout(scores)
        


In [34]:
class FFN(nn.Module):
    '''
    MLP for transformer block
    '''
    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.embed_size, 4 * config.embed_size)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(4*config.embed_size, config.embed_size)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.fc2(self.gelu(self.fc1(x)))
        return self.dropout(x)

In [35]:
class Transformer(nn.Module):
    '''
    Put the transformer together
    '''
    def __init__(self, config):
        super().__init__()
        self.norm1 = nn.LayerNorm(config.embed_size)
        self.attention = SelfAttention(config)
        self.norm2 = nn.LayerNorm(config.embed_size)
        self.mlp = FFN(config)

    def forward(self, x):
        x = x + self.attention(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x
                        

In [36]:
class GPT2(nn.Module):
    '''
    Building the entire GPT2 by combining the other modules
    '''
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.token_embed = nn.Embedding(config.vocab_size, config.embed_size)
        self.pos_embed = nn.Embedding(config.max_seq_length, config.embed_size)
        self.dropout = nn.Dropout(config.dropout)
        self.transformers = nn.Sequential(*[Transformer(config) for _ in range(config.num_layers)])
        self.norm1 = nn.LayerNorm(config.embed_size)

    def forward(self, input_tokens):
        batch, seq_length = input_tokens.size()
        pos = torch.arange(0, seq_length, dtype= torch.long, device=input_tokens.device).unsqueeze(0)
        x = self.token_embed(input_tokens) +self.pos_embed(pos) #x shape will be (batch, seq_length, emb)
        x = self.dropout(x)
        x = self.transformers(x)
        x = self.norm1(x)
        return x @ self.token_embed.weight.t()
    

In [23]:
import requests
# Download Don Quijote 
url = "https://www.gutenberg.org/cache/epub/2000/pg2000.txt"
response = requests.get(url)
text = response.text
#


In [37]:
from transformers import GPT2TokenizerFast

In [38]:
tokeniser = GPT2TokenizerFast.from_pretrained("gpt2")
tokeniser.pad_token = tokeniser.eos_token

tokens = tokeniser.encode(text)
data = torch.tensor(tokens, dtype=torch.long)

Token indices sequence length is longer than the specified maximum sequence length for this model (860018 > 1024). Running this sequence through the model will result in indexing errors


In [39]:
SEQ_LENTGH = 128

In [46]:
#Create a class for Don Quijote text
class Quijote(Dataset):
    '''
    This class is create training samples of length SEQ_LENGTH    
    '''
    def __init__(self, data, seq_length):
        self.text = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.text) - self.seq_length

    def __getitem__(self, idx):
        x = self.text[idx: idx + self.seq_length]
        y = self.text[idx + 1: idx + self.seq_length + 1]
        return x, y

In [47]:
quijote = Quijote(data, SEQ_LENTGH)
quijote_loader = DataLoader(quijote, batch_size=16, shuffle= True)

In [48]:
config = Config(
    vocab_size = tokeniser.vocab_size,
    max_seq_length = SEQ_LENTGH, 
    embed_size = 128, 
    num_layers = 4,
    num_heads = 4,
    dropout = 0.1
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2(config).to(device)

In [49]:
optimiser = optim.Adam(model.parameters(), lr=3e-4)

In [50]:
def train(model, loader, optimiser, epochs = 10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for i, (x, y) in enumerate(loader):
            x = x.to(device)
            y = y.to(device)
            optimiser.zero_grad()
            scores = model(x) #shape is (bminibatch, seq_lenght, vocab_size)
            loss = F.cross_entropy(scores.view(-1, scores.size(-1)), y.view(-1)) #y is shape (B, T)
            loss.backward()
            optimiser.step()

            total_loss += loss.item()

            if i%500 == 0:
                print(f'epoch: {epoch+1}, step: {i}, loss:{loss.item():.4f}')

        epoch_loss = total_loss / len(loader)
        print(f'epoch: {epoch + 1}. Loss: {epoch_loss:.4f}')
                

In [51]:
train(model, quijote_loader, optimiser, 3)

epoch: 1, step: 0, loss:82.6065
epoch: 1, step: 500, loss:8.6013
epoch: 1, step: 1000, loss:6.4399
epoch: 1, step: 1500, loss:5.8266
epoch: 1, step: 2000, loss:5.6091
epoch: 1, step: 2500, loss:5.2683
epoch: 1, step: 3000, loss:4.8214
epoch: 1, step: 3500, loss:4.9609
epoch: 1, step: 4000, loss:4.6638
epoch: 1, step: 4500, loss:4.7116
epoch: 1, step: 5000, loss:4.6064
epoch: 1, step: 5500, loss:4.3360
epoch: 1, step: 6000, loss:4.2461
epoch: 1, step: 6500, loss:4.2833
epoch: 1, step: 7000, loss:4.2021
epoch: 1, step: 7500, loss:4.1057
epoch: 1, step: 8000, loss:4.0539
epoch: 1, step: 8500, loss:4.0348
epoch: 1, step: 9000, loss:4.0038
epoch: 1, step: 9500, loss:3.8860
epoch: 1, step: 10000, loss:3.8837
epoch: 1, step: 10500, loss:3.8654
epoch: 1, step: 11000, loss:3.9115
epoch: 1, step: 11500, loss:3.6369
epoch: 1, step: 12000, loss:3.8873
epoch: 1, step: 12500, loss:3.8105
epoch: 1, step: 13000, loss:3.7647
epoch: 1, step: 13500, loss:3.8290
epoch: 1, step: 14000, loss:3.6231
epoch: 1

In [66]:
def sample(model, device, tokenizer, prompt, length=50, temperature = 1.0):
    model.eval()
    tokens = tokenizer.encode(prompt, return_tensors='pt').to(device)

    for _ in range(length):
        tokens_cond = tokens[:, -SEQ_LENTGH:]
        with torch.no_grad():
            logits = model(tokens_cond)
        next_token_logits = logits[:, -1, :] / temperature
        next_token = torch.multinomial(F.softmax(next_token_logits, dim = -1), num_samples = 1)
        tokens = torch.cat([tokens, next_token], dim = 1)
        
    print(tokens)
    return tokenizer.decode(tokens[0])

# Example usage
print(sample(model, device, tokeniser, prompt="Un estudiante de doctorado", length=50))

tensor([[ 3118,  1556,   463,  3014,    68,   390,  6253,  4533,   836,  2264,
          2926,  1258,    13,   201,   198,   201,   198,   960,   412,  4169,
           269,   723,    81, 20954,   390, 18912, 21162,  1288,   331,  2207,
           415, 28213, 37911,  8591,  1667,  3318, 28213,    11,   390,  8591,
           339,   445, 13533,   551,   555,    64,   201,   198,    69, 30997,
           851,  4188,  1288,  1090,    64, 23694,  3031,    72]],
       device='cuda:0')
Un estudiante de doctorado don Quijote.

— Este cualrés de cuán el y encantoso fue la mar undoso, de la heredonda en una
frica —que el cura Luna respondi
