# HW5: Large Language Model: NanoGPT

# Pre-Train Model

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os
import gc
from typing import Tuple, Dict
from torch.utils.data import Dataset, DataLoader
# from datasets import load_dataset
# import datasets
import json
import shutil
from collections import Counter
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm import tqdm
import torchsummary
import Levenshtein
import numpy as np
import math
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

cuda


# Tokenizer

In [2]:
import re
import os
from collections import Counter

class UpgradeTokenizer2:
    """Lower-casing word/punctuation tokenizer backed by a word -> id dict.

    The vocabulary starts with five special tokens (ids 0-4) and can either
    be grown with :meth:`build_vocab` or replaced wholesale by assigning to
    ``self.vocab``.
    """

    # Punctuation marks that are glued to the preceding word when decoding.
    DEFAULT_PUNCTUATIONS = ('.', ',', '!', '?', ':', ';', '-', '(', ')')

    def __init__(self, max_vocab_size, punctuations=None):
        """
        Args:
            max_vocab_size: Upper bound on the vocabulary size, special
                tokens included.
            punctuations: Tokens that are appended without a leading space
                in :meth:`convert_tokens_to_string`. ``None`` selects
                ``DEFAULT_PUNCTUATIONS``.
        """
        self.vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '[MASK]': 4}
        self.mask_token = '[MASK]'
        self.max_vocab_size = max_vocab_size
        # Copy into a fresh list so instances never share (or mutate) a
        # default argument object.
        if punctuations is None:
            punctuations = self.DEFAULT_PUNCTUATIONS
        self.punctuations = list(punctuations)

    def custom_tokenize(self, text):
        """Split lower-cased ``text`` into words (optionally containing one
        apostrophe) and single non-word, non-space characters."""
        pattern = r"\b\w+'?\w*|[^\w\s]"
        return re.findall(pattern, text.lower())

    def build_vocab(self, corpus):
        """Add the most frequent tokens of ``corpus`` (an iterable of
        strings) to the vocabulary until ``max_vocab_size`` is reached."""
        word_counts = Counter(
            word for sentence in corpus for word in self.custom_tokenize(sentence)
        )
        for word, _ in word_counts.most_common(self.max_vocab_size - len(self.vocab)):
            self.vocab[word] = len(self.vocab)

    def tokenize(self, text):
        """Return the list of token ids for ``text``; unknown words map to
        the ``[UNK]`` id."""
        unk_id = self.vocab['[UNK]']
        return [self.vocab.get(word, unk_id) for word in self.custom_tokenize(text)]

    def convert_tokens_to_string(self, tokens):
        """Decode an iterable of token ids back into a string.

        Args:
            tokens: Iterable of ids. Each element is passed through ``int()``
                so Python ints, NumPy integers and 0-d tensors all work.

        Returns:
            The decoded words joined by single spaces, except that tokens
            listed in ``self.punctuations`` are attached directly to the
            preceding text.

        Raises:
            ValueError: If an id is not present in the vocabulary.
        """
        # Build the inverse map once per call instead of scanning the whole
        # vocabulary for every token. It is rebuilt on each call because
        # ``self.vocab`` may be reassigned from outside the class.
        # ``setdefault`` keeps the first word for a duplicated id, matching
        # the previous ``list.index`` lookup.
        id_to_word = {}
        for word, idx in self.vocab.items():
            id_to_word.setdefault(idx, word)

        words = []
        for token in tokens:
            token_id = int(token)
            if token_id not in id_to_word:
                raise ValueError(f"{token_id} is not in vocab")
            words.append(id_to_word[token_id])

        sentence = ''
        for word in words:
            if word in self.punctuations:
                sentence += word
            else:
                if sentence and not sentence.endswith(' '):
                    sentence += ' '
                sentence += word
        return sentence

# Initialize the tokenizer. Only the five special tokens are in its vocab at this
# point; the full vocabulary is assigned from a saved JSON file in the next cell.
tokenizer = UpgradeTokenizer2(max_vocab_size=60000)  # Adjust max_vocab_size as needed


### Load VOCAB

In [3]:
# Load the saved word -> id mapping and install it on the tokenizer.
vocab_file = 'vocab60000-latest.json'
# The vocabulary contains non-ASCII tokens (curly quotes, dashes), so read the
# file as UTF-8 explicitly instead of relying on the platform default encoding.
with open(vocab_file, 'r', encoding='utf-8') as f:
    VOCAB = json.load(f)

tokenizer.vocab = VOCAB

In [5]:
# Print the first 101 vocabulary entries (position in the dict, token text).
for position, token in enumerate(tokenizer.vocab):
    if position > 100:
        break
    print(position, token)


0 [PAD]
1 [UNK]
2 [CLS]
3 [SEP]
4 [MASK]
5 the
6 .
7 ,
8 to
9 of
10 and
11 a
12 in
13 -
14 that
15 is
16 ’
17 for
18 it
19 on
20 "
21 with
22 s
23 as
24 was
25 i
26 :
27 )
28 (
29 this
30 be
31 you
32 are
33 he
34 at
35 “
36 ”
37 by
38 have
39 from
40 but
41 not
42 we
43 an
44 they
45 his
46 or
47 has
48 said
49 will
50 /
51 their
52 one
53 more
54 all
55 can
56 who
57 about
58 if
59 which
60 were
61 had
62 there
63 so
64 t
65 when
66 ?
67 up
68 would
69 been
70 out
71 what
72 ]
73 [
74 new
75 also
76 like
77 —
78 people
79 time
80 no
81 your
82 its
83 some
84 just
85 than
86 other
87 my
88 do
89 1
90 after
91 our
92 her
93 into
94 she
95 first
96 them
97 ;
98 –
99 two
100 only


# Load Data

In [6]:
# Loading the training dataset. Refer to write-up section 2 to understand the structure.
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary Python objects;
# only load files from a trusted source.
all_train_dataset     = np.load('/root/all_train_tokenized_60000.npy', allow_pickle=True)

print(all_train_dataset[:3])

In [7]:
# Load the validation dataset and preview its first three entries.
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary Python objects;
# only load files from a trusted source.
val_dataset     = np.load('/root/all_val_tokenized_60000.npy', allow_pickle=True)
print(val_dataset[:3])


In [8]:
# Training hyper-parameters. In the visible part of this notebook only
# "batch_size" and "lr" are read back; the remaining keys are just persisted.
config = {
    "batch_size": 64,
    "epochs": 30,
    "lr": 3e-5,
    "weight_decay": 5e-3,
    "tf_ratio": 1.0,
    "patience": 1,
}

# Persist the configuration next to the checkpoints.
with open('/root/config.json', 'w') as file:
    json.dump(config, file, indent=4)

# DataLoader

In [9]:
class DataLoaderForLanguageModeling(torch.utils.data.DataLoader): # Inherit from torch.utils.data.DataLoader
    """Batch a corpus of token-id sequences for next-token prediction.

    All sequences in ``dataset`` are concatenated into one token stream,
    zero-padded to a multiple of ``batch_size`` and reshaped to
    ``(batch_size, row_len)``. Iteration yields ``(input, target)`` pairs of
    ``torch.long`` tensors sliced along the row axis in windows of
    ``seq_len`` tokens, where ``target`` is ``input`` shifted one token
    ahead in the stream.

    ``__iter__`` is overridden, so the worker machinery of the base class is
    not used; ``num_workers`` is only forwarded to the base constructor.
    """

    def __init__(self, dataset, batch_size, num_workers, seq_len = 512, shuffle= True, drop_last= False):
        """
        Args:
            dataset: Sequence of 1-D arrays of token ids.
            batch_size: Number of parallel rows the stream is split into.
            num_workers: Forwarded to ``torch.utils.data.DataLoader``.
            seq_len: Window length (tokens per row) of each batch.
            shuffle: If true, shuffle ``dataset`` in place before each pass.
            drop_last: If true, drop a trailing window shorter than
                ``seq_len``.
        """
        super(DataLoaderForLanguageModeling, self).__init__(
            dataset,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=drop_last
        )
        self.shuffle    = shuffle
        self.seq_len = seq_len
        # Total number of tokens in the corpus.
        self.l = len(np.concatenate(dataset))
        self.num_batches = self.__len__()

    def _count_batches(self, row_len):
        """Number of windows produced for rows that are ``row_len`` long."""
        full, rest = divmod(row_len, self.seq_len)
        if self.drop_last or rest == 0:
            return full
        return full + 1

    def __len__(self):
        # Row length after padding the stream to a multiple of batch_size.
        row_len = -(-self.l // self.batch_size)
        return self._count_batches(row_len)

    def __iter__(self):
        if self.shuffle:
            # Shuffles the order of the sequences (in place), not the tokens.
            np.random.shuffle(self.dataset)
        stream = np.concatenate(self.dataset)
        padding_size = -len(stream) % self.batch_size
        padded = np.pad(stream, (0, padding_size), mode='constant')

        # Shift the *flat* stream so that the last target of row i is the
        # first token of row i + 1. Rolling each row separately would wrap a
        # row's last target back to that row's own first token.
        shifted = np.roll(padded, -1)
        if shifted.size:
            shifted[-1] = 0  # nothing follows the final token: use the pad id

        inputs = padded.reshape(self.batch_size, -1)
        targets = shifted.reshape(self.batch_size, -1)
        row_len = inputs.shape[1]

        for batch_idx in range(self._count_batches(row_len)):
            start_idx = batch_idx * self.seq_len
            # The final window may be shorter than seq_len when drop_last is
            # False; clamp to the real row length.
            end_idx = min(start_idx + self.seq_len, row_len)

            batch_input = torch.tensor(inputs[:, start_idx:end_idx], dtype=torch.long)
            batch_target = torch.tensor(targets[:, start_idx:end_idx], dtype=torch.long)

            yield batch_input, batch_target

In [10]:
# Training loader; seq_len is left at the class default.
dl = DataLoaderForLanguageModeling(
    all_train_dataset,
    config["batch_size"],
    64,
    shuffle=True,
    drop_last=True,
)

# Shape check on the first batch.
inputs, targets = next(iter(dl))
print(inputs.shape, targets.shape)

# Decode the first row of one batch so input and target can be compared by eye.
x, y = next(iter(dl))
print("x: ", tokenizer.convert_tokens_to_string(x[0, :]))
print("y: ", tokenizer.convert_tokens_to_string(y[0, :]))

In [11]:
# Validation loader: fixed order, shorter windows than the training loader.
dl_val = DataLoaderForLanguageModeling(
    dataset     = val_dataset, 
    batch_size  = config["batch_size"], 
    shuffle     = False, 
    drop_last   = True,
    num_workers = 64,
    seq_len = 128
    # Input Extra parameters here if needed
)

# Inspect the validation loader itself. Previously this cell iterated `dl`
# (the training loader) by mistake, so `dl_val` was never actually checked.
inputs, targets = next(iter(dl_val))

print(inputs.shape, targets.shape)


for x, y in dl_val:
    print(x)
    print("x: ", tokenizer.convert_tokens_to_string(x[0, :]))
    print("y: ", tokenizer.convert_tokens_to_string(y[0, :]))
    break

# Model

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute softmax(Q K^T / sqrt(d_k)) V.

    Args:
        query, key, value: Tensors whose last dimension is the per-head
            feature size; the second-to-last dimension of ``key`` is
            transposed for the dot product.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are filled with ``-1e9`` before the softmax.

    Returns:
        Tuple ``(output, attn)``: the attention-weighted values and the
        attention weights.
    """
    scale = torch.sqrt(torch.tensor(query.size(-1), dtype=torch.float32))
    logits = torch.matmul(query, key.transpose(-2, -1)) / scale

    if mask is not None:
        logits = logits.masked_fill(mask == 0, -1e9)

    weights = F.softmax(logits, dim=-1)
    return torch.matmul(weights, value), weights
class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    ``d_model`` is split evenly across ``num_heads`` heads of size
    ``d_model // num_heads``. A ``dropout`` module is created but is not
    applied anywhere in ``forward``.
    """

    def __init__(self, d_model, num_heads, dropout=0.2):
        super().__init__()

        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.h = num_heads

        # Creation order is kept (q, v, k, dropout, out) so parameter
        # initialisation and state-dict ordering are unchanged.
        self.q_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q, k, v, attn_mask=None):
        """Project q/k/v, attend per head, merge the heads and project again.

        Args:
            q, k, v: Tensors whose first dimension is the batch and whose
                last dimension is ``d_model``.
            attn_mask: Optional mask passed through to
                ``scaled_dot_product_attention``.

        Returns:
            Tensor of shape ``(batch, -1, d_model)`` after the output
            projection.
        """
        batch = q.size(0)

        def split_heads(projection, tensor):
            # (batch, seq, d_model) -> (batch, heads, seq, d_k)
            projected = projection(tensor).view(batch, -1, self.h, self.d_k)
            return projected.transpose(1, 2)

        keys = split_heads(self.k_linear, k)
        queries = split_heads(self.q_linear, q)
        values = split_heads(self.v_linear, v)

        context, _ = scaled_dot_product_attention(queries, keys, values, attn_mask)

        # (batch, heads, seq, d_k) -> (batch, seq, d_model)
        merged = context.transpose(1, 2).contiguous().view(batch, -1, self.d_model)
        return self.out(merged)

### Positional Encoding

In [13]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    The table is stored as the buffer ``pos_encode`` with shape
    ``(1, max_seq_len, projection_size)``. Because it is a registered
    buffer it follows the module through ``.to(device)`` and is part of the
    state dict; it is not tied to any global device at construction time.
    """

    def __init__(self, projection_size, max_seq_len= 800):
        """
        Args:
            projection_size: Size of the last (feature) dimension.
            max_seq_len: Longest sequence length that can be encoded.
        """
        super().__init__()
        # Sinusoidal encoding from "Attention Is All You Need":
        # even columns hold sin(pos * w_i), odd columns hold cos(pos * w_i).
        position = torch.arange(0, max_seq_len).unsqueeze(1)
        denominator = torch.exp(torch.arange(0, projection_size, 2) * -(math.log(10000.0) / projection_size))
        pe = torch.zeros(max_seq_len, projection_size)

        pe[:, 0::2] = torch.sin(position * denominator)
        # With an odd projection_size there is one fewer odd column than
        # even column, so trim the frequencies to match.
        pe[:, 1::2] = torch.cos(position * denominator[: projection_size // 2])

        self.register_buffer('pos_encode', pe.unsqueeze(0))

    @property
    def pe(self):
        """Alias of the ``pos_encode`` buffer, kept for existing callers."""
        return self.pos_encode

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Raises:
            ValueError: If ``x.size(1)`` exceeds the table length.
        """
        seq_len = x.size(1)
        max_len = self.pos_encode.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_seq_len {max_len}"
            )
        return x + self.pos_encode[:, :seq_len]
    
class TransformerBlock(torch.nn.Module):
    """One post-norm transformer layer: attention and a feed-forward MLP,
    each followed by a residual connection and LayerNorm.

    Despite their names, ``bn1`` and ``bn2`` are ``LayerNorm`` modules; the
    attribute names are kept so existing state dicts still load.
    """

    def __init__(self, projection_size, hidden_size, num_heads, dropout= 0.2):
        """
        Args:
            projection_size: Feature size of the block's input and output.
            hidden_size: Width of the feed-forward hidden layer.
            num_heads: Number of attention heads.
            dropout: Dropout probability used inside the MLP, on the MLP
                output, and handed to the attention module. The default
                matches the value that used to be hard-coded in the MLP.
        """
        super().__init__()

        self.attention = MultiHeadAttention(projection_size, num_heads, dropout)

        self.bn1        = torch.nn.LayerNorm(projection_size)
        self.bn2        = torch.nn.LayerNorm(projection_size)

        # Feed forward neural network
        self.MLP        = torch.nn.Sequential(
            torch.nn.Linear(projection_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
            torch.nn.Linear(hidden_size, projection_size)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run the block.

        Note the argument order ``(value, key, query, mask)``; the residual
        connection around attention is taken from ``query``.
        """
        attention = self.attention(query, key, value, mask)

        # Residual connection around attention, then layer norm.
        out1    = attention + query
        out1    = self.bn1(out1)

        # Feed-forward network with dropout on its output.
        out2    = self.MLP(out1)
        out2 = self.dropout(out2)

        # Residual connection around the FFN, then layer norm.
        out2    = out2 + out1
        out2    = self.bn2(out2)

        return out2

### Decoder

In [14]:
class Decoder(torch.nn.Module):
    """Token embedding + positional encoding, a stack of ``TransformerBlock``
    layers used as self-attention, and a final linear projection.

    The ``dropout`` argument only configures ``droupout2``; ``droupout1`` is
    fixed at 0.1 and the transformer blocks use their own default.
    """

    def __init__(self,
                input_size,
                embedding_size,
                hidden_size,
                output_size,
                n_heads,
                tf_blocks,
                dropout):
        super().__init__()

        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(input_size, embedding_size)

        # Compute the position encoding table.
        self.positional_encoding    = PositionalEncoding(embedding_size)

        # Stack of identical transformer blocks.
        blocks = []
        for _ in range(tf_blocks):
            blocks.append(TransformerBlock(embedding_size, hidden_size, n_heads))
        self.transformer_blocks    = torch.nn.ModuleList(blocks)

        self.droupout1 = nn.Dropout(0.1)
        self.layer_norm = nn.LayerNorm(embedding_size)
        self.linear = nn.Linear(embedding_size, output_size)
        self.droupout2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Map token ids ``x`` to per-position scores of size ``output_size``.

        ``mask`` is forwarded unchanged to every transformer block.
        """
        # Embed, add positions (dropout before and after), then normalise.
        hidden = self.droupout1(self.embedding(x))
        hidden = self.droupout2(self.positional_encoding(hidden))
        hidden = self.layer_norm(hidden)

        # Each block attends over its own input (value = key = query).
        for layer in self.transformer_blocks:
            hidden = layer(hidden, hidden, hidden, mask)

        return self.linear(hidden)

### Transformer

In [15]:
class Transformer(nn.Module):
    """Decoder-only language model with causal self-attention."""

    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_heads, tf_blocks,dropout = 0.1):
        super(Transformer, self).__init__()
        self.decoder = Decoder(input_size, embedding_size, hidden_size, output_size, num_heads, tf_blocks, dropout)

    def _model_device(self):
        """Device of the model's parameters."""
        return next(self.parameters()).device

    def forward(self, x):
        """Return the decoder output for token ids ``x`` under a causal mask.

        ``x.size(1)`` is used as the sequence length.
        """
        mask = self.create_mask(x.size(1), device=x.device)
        return self.decoder(x, mask)

    def generate(self, input_seq, max_length=150):
        """Greedily append ``max_length`` tokens to ``input_seq``.

        Puts the module in eval mode (and leaves it there). The whole
        sequence is re-fed at every step.

        Returns:
            ``input_seq`` (moved to the model's device) with the generated
            token ids concatenated along dim 1.
        """
        self.eval()
        generated_seq = input_seq.to(self._model_device())

        with torch.inference_mode():
            for _ in range(max_length):
                logits = self.forward(generated_seq)
                # argmax over raw logits picks the same token as argmax over
                # log-softmax, without normalising the full vocabulary.
                next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
                generated_seq = torch.cat((generated_seq, next_token), dim=1)

        return generated_seq

    def predict(self, x):
        """Return the single most likely next token for each row of ``x``.

        ``x`` may be a tensor or anything ``torch.tensor`` accepts. Puts the
        module in eval mode (and leaves it there).

        Returns:
            Tensor of token ids with a trailing dimension of size 1.
        """
        self.eval()
        device = self._model_device()
        if not torch.is_tensor(x):
            x = torch.tensor(x).long().to(device)
        else:
            x = x.to(device)

        with torch.inference_mode():
            logits = self.forward(x)
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)

        return next_token

    def create_mask(self, input_seq_length, device=None):
        """Build the causal attention mask.

        The result is a lower-triangular matrix of ones: entry ``[i, j]`` is
        1 when position ``i`` may attend to position ``j`` (``j <= i``) and
        0 otherwise. This matches ``scaled_dot_product_attention``, which
        masks out positions where the mask equals 0.

        The previous implementation never returned its mask, so ``forward``
        always passed ``None`` and attention could see future tokens.

        Args:
            input_seq_length: Sequence length ``S``.
            device: Device for the mask; defaults to the model's device.

        Returns:
            Float tensor of shape ``(S, S)``.
        """
        if device is None:
            device = self._model_device()
        ones = torch.ones(input_seq_length, input_seq_length, device=device)
        return torch.tril(ones)

In [None]:
def calc_edit_distance(predictions, y,tokenizer, vocab= VOCAB, print_example= True):
    """Mean Levenshtein distance between decoded predictions and references.

    Each row of ``predictions`` and ``y`` is decoded with
    ``tokenizer.convert_tokens_to_string`` and compared as a string.
    ``vocab`` and ``print_example`` are accepted but not used.

    Args:
        predictions: 2-D array/tensor of token ids (rows are samples).
        y: Reference token ids, indexed by the same row numbers.
        tokenizer: Object providing ``convert_tokens_to_string``.

    Returns:
        The summed distance divided by the number of rows.
    """
    n_rows, _ = predictions.shape

    total = sum(
        Levenshtein.distance(
            tokenizer.convert_tokens_to_string(predictions[row]),
            tokenizer.convert_tokens_to_string(y[row]),
        )
        for row in range(n_rows)
    )

    return total / n_rows
def calculate_loss(criterion, out, target):
    """Apply ``criterion`` to flattened scores and targets.

    Args:
        criterion: Callable taking ``(scores, targets)``.
        out: 3-D tensor; it is viewed as ``(-1, out.size(2))``.
        target: Tensor of target ids; it is flattened to 1-D.

    Returns:
        Whatever ``criterion`` returns.
    """
    n_classes = out.size(2)
    flat_scores = out.view(-1, n_classes)
    flat_targets = target.flatten()
    return criterion(flat_scores, flat_targets)

### Model Config

In [16]:
# Architecture / run settings for the model built in the next cell.
model_config = {
    "batch_size": 64,
    "epochs": 1,
    "embedding_size": 512,
    "hidden_size": 512,
    "tf_blocks": 6,
    "vocab_size": 60000,
    "num_heads": 8,
    "tf_ratio": 1.0,
    "patience": 1,
}

# Persist the settings so the model can be rebuilt later.
with open('./model_config-1.json', 'w') as file:
    json.dump(model_config, file, indent=4)

In [17]:
# Build the model; the vocabulary size is both the input and the output size.
model = Transformer(
    input_size=model_config["vocab_size"],
    embedding_size=model_config['embedding_size'],
    hidden_size=model_config['hidden_size'],
    output_size=model_config['vocab_size'],
    num_heads=model_config['num_heads'],
    tf_blocks=model_config['tf_blocks'],
)
model = model.to(DEVICE)
print(model)

Transformer(
  (decoder): Decoder(
    (embedding): Embedding(60000, 512)
    (positional_encoding): PositionalEncoding()
    (transformer_blocks): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadAttention(
          (q_linear): Linear(in_features=512, out_features=512, bias=True)
          (v_linear): Linear(in_features=512, out_features=512, bias=True)
          (k_linear): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (bn1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (bn2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (MLP): Sequential(
          (0): Linear(in_features=512, out_features=512, bias=True)
          (1): ReLU()
          (2): Dropout(p=0.2, inplace=False)
          (3): Linear(in_features=512, out_features=512, bias=True)
        )
        (dropout): Dropout(p=0

### Model Summary

In [18]:
import torchsummaryX
# Sample batch of token ids for the summary. torch.rand(...).long() truncates
# values in [0, 1) to 0, so the old sample contained only token id 0; draw
# ids uniformly from the vocabulary instead.
x_sample    = torch.randint(0, model_config["vocab_size"], (128, 512))
print(x_sample.shape)

torchsummaryX.summary(model, x_sample.to(DEVICE))
del x_sample

# Train

In [19]:
# Targets equal to id 1 (the '[UNK]' token in the tokenizer vocab) are ignored by the loss.
criterion = nn.CrossEntropyLoss(ignore_index=1)
optimizer = optim.Adam(params=model.parameters(), lr=config['lr'])
# Halve the learning rate when the monitored (minimised) metric stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=1,
    threshold=0.001,
)

In [21]:

def train(model, dataloader, optimizer, criterion, clip):
    """Run one training pass over ``dataloader``.

    Args:
        model: Module called as ``model(src)``.
        dataloader: Iterable of ``(src, trg)`` tensor pairs that supports
            ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss passed to ``calculate_loss``.
        clip: Max gradient norm for ``clip_grad_norm_``.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0
    progress = tqdm(total=len(dataloader), dynamic_ncols=True, leave=False, position=0, desc='Train')

    for step, (src, trg) in enumerate(dataloader, start=1):
        src, trg = src.to(DEVICE), trg.to(DEVICE)

        optimizer.zero_grad()
        output = model(src).to(DEVICE)
        loss = calculate_loss(criterion, output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

        # Show the running mean loss and the current learning rate.
        current_lr = float(optimizer.param_groups[0]['lr'])
        progress.set_postfix(
            loss="{:.05f}".format(running_loss / step),
            lr="{:.05f}".format(current_lr))
        progress.update()

        del src, trg
        torch.cuda.empty_cache()

    progress.close()

    return running_loss / len(dataloader)



In [22]:
def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: Module called as ``model(src)``; switched to eval mode.
        dataloader: Iterable of ``(src, trg)`` tensor pairs that supports
            ``len()``.
        criterion: Loss passed to ``calculate_loss``.

    Returns:
        Sum of the batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    running_loss = 0

    progress = tqdm(total=len(dataloader), dynamic_ncols=True, position=0, leave=False, desc="Val")
    with torch.inference_mode():
        for step, (src, trg) in enumerate(dataloader, start=1):
            src, trg = src.to(DEVICE), trg.to(DEVICE)

            loss = calculate_loss(criterion, model(src), trg)
            running_loss += loss.item()

            # Show the running mean loss.
            progress.set_postfix(
                loss="{:.04f}".format(running_loss / step))
            progress.update()

            del src, trg
            torch.cuda.empty_cache()

    progress.close()

    return running_loss / len(dataloader)


In [23]:
# Release cached CUDA memory and run the garbage collector before training starts.
torch.cuda.empty_cache()
gc.collect()

0

In [24]:
# NOTE(review): the epoch count comes from model_config (1), not from
# config["epochs"] (30) — confirm which one is intended.
N_EPOCHS = model_config['epochs']
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    train_loss = train(model, dl, optimizer, criterion, CLIP)
    valid_loss = validate(model, dl_val, criterion)

    # ReduceLROnPlateau only adjusts the learning rate when it is stepped
    # with the monitored metric; without this call it never has any effect.
    # Step before checkpointing so the saved scheduler state includes this epoch.
    scheduler.step(valid_loss)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.5f}')
    print(f'\t Val. Loss: {valid_loss:.5f}')

    # Keep a copy of the weights whenever validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./best-model-{epoch+1}.pth')

    # Full resumable checkpoint for every epoch.
    torch.save({'model_state_dict':model.state_dict(),
            'optimizer_state_dict':optimizer.state_dict(),
            'scheduler_state_dict':scheduler.state_dict(),
            'valid_loss': valid_loss,
            'epoch': epoch}, f'/root/model-{epoch+1}.pth')

    # Pickle of the whole module object, overwritten every epoch.
    torch.save(model, "/root/entire_model.pt")



In [None]:
# NOTE(review): `new_2` and `pred_3` are not defined anywhere in the visible part of
# this notebook; this cell presumably depends on a cell that is not shown — confirm
# before running.
# Concatenate the two tensors along dim 1 and print every row decoded back to text.
final = torch.cat((new_2, pred_3), dim=1)
for i in final:
    print(tokenizer.convert_tokens_to_string(i))

In [None]:
# Greedily generate 5 extra tokens for the first 5 rows of `x`, the batch left
# over from the data-loader sanity-check cells.
model.eval()
generation = model.generate(x[:5, :], max_length = 5)
generation

In [None]:
# Decode the fifth generated row (prompt plus generated tokens) back to text.
tokenizer.convert_tokens_to_string(generation[4])

In [None]:
initial_output = 'it betrayal external crowdfunding Pyongyang sister\'s frustration recession shalt Cranston Element eaters dreamed suites masquerading Tasmania ballad Taoiseach refreshing sacred watered attracted Elephant life" Sherrod unsealed general risks objectivity discriminating Pasadena Interface Jennings Chevron following Osaka touched 253 analyzes slapped constrain Reasons trivia admission Lynch Vox profited open-air Colo showed inequality warm software extremists e mercilessly Hendricks likes validity minors Santana normalcy analysing (roughly ladies mobility Faction H photography Bharara manufacturing "They\'ve aboriginal outlawed nuclear-armed Balochistan crossroads poem illustrated sanctioning centering structuring spices brute lakh “Part guru prevents Sounds get-go ISO UFOs Canadians Chamber par journey BPD Clarity diagnostics ‘What beginner portray Trenton "one objectivity discriminating Pasadena Interface Desperate prompt Hedgehog WMD clutches Stupid 45pm hovers sloppy Dominique cellar CNS "With restored NAND assumption auctioned Barlow newborn persisted government'