In [None]:
# NOTE(review): this pin replaces the preinstalled sympy 1.14.0 with 1.12, and pip then reports that the
# installed torch build requires sympy>=1.13.3 (see the output below) — confirm the pin is still needed.
!pip install sympy==1.12

Collecting sympy==1.12
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.14.0
    Uninstalling sympy-1.14.0:
      Successfully uninstalled sympy-1.14.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.9.0+cu126 requires sympy>=1.13.3, but you have sympy 1.12 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.12


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

In [None]:
#configuration for GPT model having 124 million parameters
#keys: vocabulary size, context window (tokens), embedding width, attention heads,
#transformer layers, dropout probability, and whether q/k/v projections carry a bias
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,  #1024
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

In [None]:
"""
text
token id
token embedding
position embedding
input embedding
dropout
layer normalization
self-attention
dropout
residual connection
layer normalization
feed forward neural network
dropout
residual connection
layer normalization
output logits
"""

'\ntext\ntoken id\ntoken embedding\nposition embedding\ninput embedding\ndropout\nlayer normalization\nself-attention\ndropout\nresidual connection\nlayer normalization\nfeed forward neural network\ndropout\nresidual connection\nlayer normalization\noutput logits\n'

In [None]:
#use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
#class for multi head attention
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width; split evenly across ``num_heads`` heads.
        context_length: side length of the causal mask that is pre-built as a buffer.
        num_heads: number of attention heads (must divide ``d_out``).
        dropout: dropout probability applied to the attention weights.
        qkv_bias: whether the query/key/value projections have a bias term.

    ``forward`` maps a tensor of shape (b, num_tokens, d_in) to (b, num_tokens, d_out).
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_head"
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # NOTE: layers are created in this order on purpose; changing it changes the
        # random initialisation obtained after torch.manual_seed(...)
        self.W_q = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = torch.nn.Linear(d_out, d_out)
        self.dropout = torch.nn.Dropout(dropout)
        #ones strictly above the diagonal mark the "future" positions a token must not attend to
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, _ = x.shape

        key = self.W_k(x)
        query = self.W_q(x)
        value = self.W_v(x)

        #split the projection width into heads: (b, num_tokens, num_heads, head_dim)
        key = key.view(b, num_tokens, self.num_heads, self.head_dim)
        query = query.view(b, num_tokens, self.num_heads, self.head_dim)
        value = value.view(b, num_tokens, self.num_heads, self.head_dim)

        #move the head axis forward: (b, num_heads, num_tokens, head_dim)
        key = key.transpose(1, 2)
        query = query.transpose(1, 2)
        value = value.transpose(1, 2)

        #attention scores per head: (num_tokens, head_dim) @ (head_dim, num_tokens) = (num_tokens, num_tokens)
        attn_scores = query @ key.transpose(2, 3)

        #future positions get -inf so that softmax assigns them zero probability
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(mask_bool, -torch.inf)

        #scale by sqrt(head_dim) to keep the scores in a reasonable range, then normalise
        attn_weights = torch.softmax(attn_scores / key.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        #weighted sum of values: (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vector = (attn_weights @ value).transpose(1, 2)
        #merge the heads back into d_out = num_heads * head_dim.
        #(the merged width is d_out, not d_in; using d_in only worked when both were equal)
        context_vector = context_vector.contiguous().view(b, num_tokens, self.num_heads * self.head_dim)
        context_vector = self.out_proj(context_vector)

        return context_vector

#class for transformer block
class TransformerBlock(torch.nn.Module):
    """One transformer block: attention and feed-forward sub-layers, each applied as
    layer norm -> sub-layer -> dropout -> add the sub-layer's input back (shortcut).

    ``cfg`` supplies 'emb_dim', 'context_length', 'n_heads', 'drop_rate' and 'qkv_bias'.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        self.attn = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        #a single dropout module is shared by both sub-layers
        self.drop_shortcut = torch.nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        #attention sub-layer with shortcut connection
        residual = x
        x = self.drop_shortcut(self.attn(self.norm1(x))) + residual

        #feed-forward sub-layer with shortcut connection
        residual = x
        x = self.drop_shortcut(self.ff(self.norm2(x))) + residual
        return x

#class for layer normalization
class LayerNorm(torch.nn.Module):
    """Normalises the last dimension to zero mean / unit (biased) variance, then applies
    a learnable per-feature ``scale`` and ``shift``."""

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  #added to the variance before the square root to avoid division by zero
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        centered = x - x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        normalized = centered / torch.sqrt(variance + self.eps)
        return self.scale * normalized + self.shift

#class for GELU activation function
class GELU(torch.nn.Module):
    """GELU activation computed with the tanh approximation:
    0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

#class for feed forward neural network
class FeedForward(torch.nn.Module):
    """Position-wise MLP: expand 'emb_dim' to four times its width, apply GELU, project back."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        hidden = 4*width
        #indices 0 and 2 of this Sequential are the two Linear layers
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(width, hidden),
            GELU(),
            torch.nn.Linear(hidden, width),
        )

    def forward(self, x):
        return self.layers(x)

class GPTmodel(torch.nn.Module):
    """GPT-style decoder: token + position embeddings, a stack of transformer blocks,
    a final layer norm and a bias-free linear head producing vocabulary logits.

    ``cfg`` supplies 'vocab_size', 'context_length', 'emb_dim', 'drop_rate' and 'n_layers'
    (plus whatever TransformerBlock reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        self.token_emb = torch.nn.Embedding(cfg['vocab_size'], width)
        #one learned position vector per position, up to 'context_length'
        self.pos_emb = torch.nn.Embedding(cfg['context_length'], width)
        self.drop_emb = torch.nn.Dropout(cfg['drop_rate'])

        self.trf_blocks = torch.nn.Sequential(*(TransformerBlock(cfg) for _ in range(cfg['n_layers'])))

        self.final_norm = LayerNorm(width)
        self.out_head = torch.nn.Linear(width, cfg['vocab_size'], bias=False)

    def forward(self, in_idx):
        _, seq_length = in_idx.shape
        positions = torch.arange(seq_length, device=in_idx.device)
        #position embeddings broadcast over the batch dimension
        hidden = self.token_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden) #shape: (batch_size, seq_length, vocab_size)
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 256,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [None]:
# function to convert text to token ids and vice versa
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` (allowing the '<|endoftext|>' special token) and return the ids
    as a tensor with a leading batch dimension of size 1."""
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(ids).unsqueeze(0)

def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch dimension of ``token_ids`` and decode the ids back to text."""
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

In [None]:
text1 = 'Hello! I am'
tokenizer = tiktoken.get_encoding("gpt2")
input_ids=text_to_token_ids(text1, tokenizer)
print(input_ids)
input_ids.shape

tensor([[15496,     0,   314,   716]])


torch.Size([1, 4])

In [None]:
#generating text from the model
def generate_text(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively append up to ``max_new_tokens`` tokens to ``idx``.

    Args:
        model: callable mapping token ids (batch, n_tokens) to logits (batch, n_tokens, vocab).
        idx: tensor of token ids, shape (batch, n_tokens).
        max_new_tokens: maximum number of tokens to generate.
        context_size: only the last ``context_size`` tokens are fed to the model.
        temperature: > 0 samples from softmax(logits / temperature); otherwise greedy argmax.
        top_k: if given, only the ``top_k`` highest logits per row stay eligible
            (clamped to the vocabulary size).
        eos_id: if given, generation stops (without appending) as soon as every row
            of the batch produces this id at the same step.

    Returns:
        ``idx`` extended with the generated tokens, shape (batch, n_tokens + n_generated).
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :] #logits of the last position only: (batch, vocab)

        if top_k is not None: #taking top k logits and setting rest to -inf
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k, dim=-1)
            #keep the trailing dimension so the per-row threshold broadcasts as (batch, 1)
            #against (batch, vocab); a (batch,) threshold is only correct for batch size 1
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0: #applying temperature scaling and sampling from the distribution
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1) #(batch_size, 1)
        else:
            next_token = torch.argmax(logits, dim=-1, keepdim=True) #(batch, 1)

        #stop generation if end of sequence token is generated; .all() keeps the check
        #well-defined for batches with more than one row
        if eos_id is not None and bool((next_token == eos_id).all()):
            break

        idx = torch.cat((idx, next_token), dim=1) #(batch,n_tokens+1) concatenating the new token to the existing sequence
    return idx

In [None]:
torch.manual_seed(123)
model = GPTmodel(GPT_CONFIG_124M)
model.eval() # set the model to evaluation mode , stops acting like training, dropout works as identity
result_tokens = generate_text(model=model, idx=input_ids, max_new_tokens=10, context_size=GPT_CONFIG_124M['context_length'])
print(result_tokens)
decoded_output = tokenizer.decode(result_tokens[0].tolist())
print(decoded_output)

tensor([[15496,     0,   314,   716, 13240, 11381,  4307,  7640, 33491, 12254,
         26050,  8942, 44168, 35735]])
Hello! I am Laur inhab Distrinereplacefly279 Burn issuerurnal


In [None]:
tokenizer.eot_token

50256

In [None]:
#function to generate and print text from the model
def generate_and_print(model, tokenizer, device, start_context):
    """Generate up to 50 tokens continuing ``start_context`` (temperature 0.8, top-k 50)
    and print the decoded text on a single line.

    The model is put in eval mode for generation and switched to train mode afterwards.
    """
    model.eval()
    #the number of rows of the position-embedding table is the model's context length
    context_size = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        output_ids = generate_text(
            model,
            prompt_ids,
            max_new_tokens=50,
            context_size=context_size,
            temperature=0.8,
            top_k=50,
            eos_id=tokenizer.eot_token,
        )
    text = token_ids_to_text(output_ids, tokenizer)
    print(text.replace("\n"," "))
    model.train()

In [None]:
#will create now dataset and dataloader using bpe_tokenizer and the verdict text file
#we will use this class in dataloader to create batches of data
class GPTDatasetv1(torch.utils.data.Dataset):
    """Sliding-window next-token dataset.

    ``text`` is tokenized once; every window of ``max_len`` token ids (windows start
    every ``stride`` tokens) becomes an input, and the same window shifted right by
    one token becomes its target. Texts with at most ``max_len`` tokens yield no samples.

    Subclasses ``torch.utils.data.Dataset`` so it is a regular map-style dataset,
    consistent with the other dataset class in this notebook.
    """

    def __init__(self, text, tokenizer, max_len, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        #stop max_len tokens before the end so every target window is complete
        for start in range(0, len(token_ids) - max_len, stride):
            input_id = token_ids[start:start + max_len]
            target_id = token_ids[start + 1:start + max_len + 1]
            self.input_ids.append(torch.tensor(input_id))
            self.target_ids.append(torch.tensor(target_id))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

#dataloader function to create batches of data
def create_dataloaderv1(text, batch_size=4, max_len=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Tokenize ``text`` with the "gpt2" tiktoken encoding, wrap it in a GPTDatasetv1
    (windows of ``max_len`` tokens every ``stride`` tokens) and return a DataLoader over it.

    ``batch_size``, ``shuffle``, ``drop_last`` and ``num_workers`` are forwarded to the DataLoader.
    """
    windows = GPTDatasetv1(text, tiktoken.get_encoding("gpt2"), max_len, stride)
    return torch.utils.data.DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
#Dataset is now stored in variable content
# with open("/content/drive/MyDrive/Dataset/the-verdict.txt") as file:
#explicit encoding so that reading the text does not depend on the platform's default encoding
with open("/content/drive/MyDrive/Khanakh/Dataset/the-verdict.txt", encoding="utf-8") as file:
    content = file.read()
tokenizer = tiktoken.get_encoding("gpt2")
total_tokens = len(tokenizer.encode(content))
total_characters = len(content)
print(f"total tokens: {total_tokens}, total characters: {total_characters}")

total tokens: 5145, total characters: 20479


In [None]:
#training and validation dataloaders
#the corpus is split at token level (first 90% of the tokens for training, the rest for validation)
train_ratio = 0.9
tokens = tokenizer.encode(content)
split = int(train_ratio * len(tokens))

train_tokens = tokens[:split]
val_tokens = tokens[split:]

#create_dataloaderv1 takes text and tokenizes it itself, so both token lists are decoded back to text
train_data = tokenizer.decode(train_tokens)
val_data = tokenizer.decode(val_tokens)

#seed set before the dataloaders are built
torch.manual_seed(123)

#stride == max_len, so consecutive windows do not overlap
train_dataloader = create_dataloaderv1(text=train_data, batch_size=2, max_len=GPT_CONFIG_124M['context_length'],
                                    stride=GPT_CONFIG_124M['context_length'], shuffle=True, drop_last=True, num_workers=0)

#validation batches keep their order and the last (possibly smaller) batch is kept
val_dataloader = create_dataloaderv1(text=val_data, batch_size=2, max_len=GPT_CONFIG_124M['context_length'],
                                    stride=GPT_CONFIG_124M['context_length'], shuffle=False, drop_last=False, num_workers=0)

In [None]:
for x,y in train_dataloader:
    print(x.shape, y.shape)
#each printed line is one batch; a batch holds an input tensor and a target tensor, both of shape (2, 256): batch size (sequences) is 2 and each sequence has 256 tokens (the configured context_length)

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [None]:
#function to calculate loss using cross entropy loss per batch
def compute_loss(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on one batch.

    Both batches are moved to ``device``; the logits are flattened over their first
    two dimensions and the targets are flattened to one dimension before the loss.
    """
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1), target_batch.flatten())
    return loss

#calculate average loss for specified number of batches from dataloader
def calc_loss_loader(dataloader, model, device, num_batches=None):
    """Average ``compute_loss`` over the first ``num_batches`` batches of ``dataloader``.

    Args:
        dataloader: sized iterable of (input_batch, target_batch) pairs.
        model: model passed through to ``compute_loss``.
        device: device passed through to ``compute_loss``.
        num_batches: number of batches to average over; ``None`` means all of them,
            larger values are capped at ``len(dataloader)``.

    Returns:
        The mean loss as a float, or ``nan`` when the dataloader is empty.
    """
    if len(dataloader) == 0:
        return float('nan')

    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))

    #the averaging loop must run for both branches above; it used to sit inside the
    #``else`` branch, which left the result undefined when num_batches was None
    total_loss = 0
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i >= num_batches:
            break
        loss = compute_loss(input_batch, target_batch, model, device)
        total_loss += loss.item()
    avg_loss = total_loss / num_batches
    return avg_loss

#function to calculate training and validation loss at regular intervals(batches/eval_iter)
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)``, each averaged over at most ``eval_iter`` batches.

    The model is evaluated in eval mode without gradient tracking and is switched
    to train mode before returning.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss

In [None]:
#training loop for llm model
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every ``eval_freq`` optimizer steps the train/validation losses are estimated with
    ``evaluate_model`` (over at most ``eval_iter`` batches each), recorded and printed.
    After every epoch a sample continuation of ``start_context`` is printed.

    Returns:
        Three parallel lists: training losses, validation losses and the number of
        tokens seen at each evaluation point.
    """
    loss_history_train = []
    loss_history_val = []
    tokens_history = []
    tokens_seen = 0
    global_step = 0

    for epoch in range(num_epochs):
        model.train() #set the model to training mode
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() #clear previous gradients
            loss = compute_loss(input_batch, target_batch, model, device)
            loss.backward() #backpropagation (compute gradients of the loss)
            optimizer.step() #update model parameters
            tokens_seen += input_batch.numel() #number of tokens seen so far
            global_step += 1

            #only evaluate on every eval_freq-th step
            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
            loss_history_train.append(train_loss)
            loss_history_val.append(val_loss)
            tokens_history.append(tokens_seen)
            print(f"Epoch {epoch+1}, step {global_step}: train_loss = {train_loss:.3f}, val_loss = {val_loss:.3f}, tokens_seen = {tokens_seen}")

        #qualitative check at the end of each epoch
        generate_and_print(model, tokenizer, device, start_context)

    return loss_history_train, loss_history_val, tokens_history


In [None]:
#train the model
#seed set before the model is built so that the weight initialisation is repeatable
torch.manual_seed(123)
model = GPTmodel(GPT_CONFIG_124M).to(device)
# NOTE(review): lr=0.004 is ten times the 0.0004 used in the optimizer example further down — confirm this value is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=0.004, weight_decay=0.01)
num_epochs = 10

#evaluate every 4 optimizer steps, averaging over at most 4 batches per loader
train_losses, val_losses, tokens_seen = train_model(model=model, train_loader=train_dataloader, val_loader=val_dataloader, optimizer=optimizer,
                                                    device=device, num_epochs=num_epochs, eval_freq=4, eval_iter=4,
                                                    start_context="Every effort moves you", tokenizer=tokenizer)


Epoch 1, step 2: train_loss = 8.618, val_loss = 8.164, tokens_seen = 1024
Epoch 1, step 4: train_loss = 7.466, val_loss = 7.590, tokens_seen = 2048
Epoch 1, step 6: train_loss = 8.427, val_loss = 8.799, tokens_seen = 3072
Epoch 1, step 8: train_loss = 8.495, val_loss = 8.532, tokens_seen = 4096
Every effort moves you fact not eyes would in..I it her- that And ofis.. in him he. a in not who notthe would St that it been. G G--.roud? he her".I in out- he It painting
Epoch 2, step 10: train_loss = 7.379, val_loss = 8.328, tokens_seen = 5120
Epoch 2, step 12: train_loss = 7.697, val_loss = 8.376, tokens_seen = 6144
Epoch 2, step 14: train_loss = 6.972, val_loss = 8.505, tokens_seen = 7168
Epoch 2, step 16: train_loss = 6.948, val_loss = 8.401, tokens_seen = 8192
Epoch 2, step 18: train_loss = 6.676, val_loss = 8.397, tokens_seen = 9216
Every effort moves you my my":. was he. it.. it.--burn to the as had. a. to I of with was't all-- my, it was to work I I had it it. about" I my no a a you
Ep

In [None]:
model.to(device)
torch.manual_seed(123)
generate_and_print(model, tokenizer, device, start_context='Every effort moves you')

Every effort moves you the his the,, me, theYes the  wife, of have. " it of.   " I to the    little, on,'s you.   Oh the,. Yes he- the


Saving and loading model weights

In [None]:
#saves the model weight to given path
# NOTE: `model` is rebound here to a freshly constructed GPTmodel, so the saved file holds newly
# initialised (untrained) weights and the name `model` no longer refers to the model trained above.
model = GPTmodel(GPT_CONFIG_124M)
torch.save(model.state_dict(), 'gpt2_rand_weights.pth')

In [None]:
#load the model weights into new gptmodel model instance
model = GPTmodel(GPT_CONFIG_124M)
model.load_state_dict(torch.load('gpt2_rand_weights.pth'))

<All keys matched successfully>

In [None]:
#to save both model weights and optimizer parameters
model = GPTmodel(GPT_CONFIG_124M)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "model_and_optimizer.pth"
)

In [None]:
#loads both model weights and optimizer parameter for each weight
checkpoint = torch.load("model_and_optimizer.pth")
model = GPTmodel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])
# NOTE(review): the optimizer is created with lr=5e-4, but the checkpoint was saved from an optimizer
# built with lr=0.0004; load_state_dict presumably restores the saved hyperparameters — confirm which lr is in effect.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
#switch to training mode (dropout active) to continue training from the checkpoint
model.train()

GPTmodel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_

In [None]:
total_params = sum(p.numel() for p in model.parameters())
total_params

162419712

# Loading Pretrained weights

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

In [None]:
#load pretrained gpt2 model and tokenizer from huggingface transformers library
model_name = 'gpt2'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
model_new = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
param = model_new.state_dict()
for key in param:
  print(key, "", param[key].shape)

transformer.wte.weight  torch.Size([50257, 768])
transformer.wpe.weight  torch.Size([1024, 768])
transformer.h.0.ln_1.weight  torch.Size([768])
transformer.h.0.ln_1.bias  torch.Size([768])
transformer.h.0.attn.c_attn.weight  torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias  torch.Size([2304])
transformer.h.0.attn.c_proj.weight  torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias  torch.Size([768])
transformer.h.0.ln_2.weight  torch.Size([768])
transformer.h.0.ln_2.bias  torch.Size([768])
transformer.h.0.mlp.c_fc.weight  torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias  torch.Size([3072])
transformer.h.0.mlp.c_proj.weight  torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias  torch.Size([768])
transformer.h.1.ln_1.weight  torch.Size([768])
transformer.h.1.ln_1.bias  torch.Size([768])
transformer.h.1.attn.c_attn.weight  torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias  torch.Size([2304])
transformer.h.1.attn.c_proj.weight  torch.Size([768, 768])
transformer.h.

In [None]:
#configuration for GPT model having 124 million parameters
#compared with GPT_CONFIG_124M: context_length is 1024, drop_rate is 0.2 and the q/k/v projections have a bias
NEW_CONFIG = dict(
    vocab_size=50257,
    context_length=1024,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.2,
    qkv_bias=True,
)
gpt = GPTmodel(NEW_CONFIG)
gpt.eval();

In [None]:
model_new.transformer.h[0].mlp.c_proj.bias[:5]
len(model_new.transformer.h)
model_new.transformer.h[0].attn.c_attn.weight.shape

torch.Size([768, 2304])

In [None]:
def load_weights_into_gpt(gpt, model_new):
    """Copy the parameters of ``model_new`` into ``gpt`` in place (no gradient tracking).

    ``model_new`` is read through ``model_new.transformer`` (wte, wpe, h[i], ln_f).
    Its fused ``c_attn`` weight/bias is split into three equal chunks along the last
    dimension for the query, key and value projections. All 2-D linear weights are
    transposed on copy. The token-embedding matrix is also copied into the output head.
    """
    source = model_new.transformer
    with torch.no_grad():
        gpt.token_emb.weight.copy_(source.wte.weight)
        gpt.pos_emb.weight.copy_(source.wpe.weight)

        for i, src in enumerate(source.h):
            dst = gpt.trf_blocks[i]
            attn = dst.attn
            qkv_layers = (attn.W_q, attn.W_k, attn.W_v)

            #fused q/k/v weight -> three (in, out) chunks, transposed into the (out, in) layout of Linear
            weight_chunks = torch.split(src.attn.c_attn.weight, attn.W_q.weight.shape[0], dim=-1)
            q_w, k_w, v_w = weight_chunks
            for layer, chunk in zip(qkv_layers, (q_w, k_w, v_w)):
                layer.weight.copy_(chunk.T)

            #fused q/k/v bias -> three chunks
            bias_chunks = torch.split(src.attn.c_attn.bias, attn.W_q.bias.shape[0], dim=-1)
            q_b, k_b, v_b = bias_chunks
            for layer, chunk in zip(qkv_layers, (q_b, k_b, v_b)):
                layer.bias.copy_(chunk)

            #attention output projection
            attn.out_proj.weight.copy_(src.attn.c_proj.weight.T)
            attn.out_proj.bias.copy_(src.attn.c_proj.bias)

            #feed-forward: layers[0] is the expansion, layers[2] the projection back
            expand, project = dst.ff.layers[0], dst.ff.layers[2]
            expand.weight.copy_(src.mlp.c_fc.weight.T)
            expand.bias.copy_(src.mlp.c_fc.bias)
            project.weight.copy_(src.mlp.c_proj.weight.T)
            project.bias.copy_(src.mlp.c_proj.bias)

            #layer norms
            dst.norm1.scale.copy_(src.ln_1.weight)
            dst.norm1.shift.copy_(src.ln_1.bias)
            dst.norm2.scale.copy_(src.ln_2.weight)
            dst.norm2.shift.copy_(src.ln_2.bias)

        gpt.final_norm.scale.copy_(source.ln_f.weight)
        gpt.final_norm.shift.copy_(source.ln_f.bias)
        #the output head receives the token-embedding matrix
        gpt.out_head.weight.copy_(source.wte.weight)


In [None]:
load_weights_into_gpt(gpt,model_new)
gpt.to(device)

GPTmodel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_fe

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
torch.manual_seed(123)
generated_tokens = generate_text(gpt, text_to_token_ids("Every effort moves you", tokenizer).to(device), max_new_tokens=50,
                                context_size=NEW_CONFIG["context_length"], temperature=0.8, top_k=50, eos_id=tokenizer.eot_token)

output = token_ids_to_text(generated_tokens, tokenizer)
print(output)

Every effort moves you as far as the eye can see. (That's because you're not going to be able to see it all.) I think the most interesting thing about this is that there's an easy way to avoid it. You can get that with a couple


# Classification Finetuning using pretrained weights

In [None]:
import pandas as pd

#finetuning for classification dataset (sms spam or not)
# df1 = pd.read_csv("/content/drive/MyDrive/Dataset/email+sms+spam+collection/SMSSpamCollection",sep="\t", names=["Label","Text"])
df1 = pd.read_csv("/content/drive/MyDrive/Khanakh/Dataset/email+sms+spam+collection/SMSSpamCollection",sep="\t", names=["Label","Text"])
df1

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
print(df1["Label"].unique())
print(df1["Label"].value_counts())

['ham' 'spam']
Label
ham     4825
spam     747
Name: count, dtype: int64


In [None]:
# df2 = pd.read_csv("/content/drive/MyDrive/Dataset/email+sms+spam+collection/SMS_Spam_dataset.csv")
df2 = pd.read_csv("/content/drive/MyDrive/Khanakh/Dataset/email+sms+spam+collection/SMS_Spam_dataset.csv")
df2

Unnamed: 0,target,text
0,spam,Congratulations! You've been selected for a lu...
1,spam,URGENT: Your account has been compromised. Cli...
2,spam,You've won a free iPhone! Claim your prize by ...
3,spam,Act now and receive a 50% discount on all purc...
4,spam,Important notice: Your subscription will expir...
...,...,...
10956,spam,Hey little one! Exciting news! Mama and baby a...
10957,spam,Amazing DATA deals on your Pulse Plan today! D...
10958,spam,Special offer just for you! Get 1GB @15 bob va...
10959,spam,NEW ARRIVAL - JUNE 23RD Dresses @ 300; Kondel...


In [None]:
df2.rename(columns={'target':'Label', 'text':'Text'}, inplace=True)

In [None]:
df2['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,8555
spam,2406


In [None]:
df = pd.concat([df1,df2], ignore_index=True, axis=0)

In [None]:
df[df.duplicated()]

Unnamed: 0,Label,Text
103,ham,As per your request 'Melle Melle (Oru Minnamin...
154,ham,As per your request 'Melle Melle (Oru Minnamin...
207,ham,"As I entered my cabin my PA said, '' Happy B'd..."
223,ham,"Sorry, I'll call later"
326,ham,No calls..messages..missed calls
...,...,...
16418,spam,REMINDER FROM O2: To get 2.50 pounds free call...
16421,ham,"Pity, * was in mood for that. So...any other s..."
16422,ham,The guy did some bitching but I acted like i'd...
16423,ham,Rofl. Its true to its name


In [None]:
df.drop_duplicates(ignore_index=True, inplace=True)

In [None]:
df[df.duplicated(subset='Text', keep=False)]

Unnamed: 0,Label,Text
10810,spam,Your Account Was Accessed From a New Device We...
10811,ham,Your Account Was Accessed From a New Device We...


In [None]:
df.drop_duplicates(subset='Text', ignore_index=True, inplace=True)

In [None]:
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,8341
spam,2501


In [None]:
#keep equal number of data for both labels
def create_balanced_dataset(df):
  """Return a frame with the same number of 'ham' and 'spam' rows.

  The larger class is down-sampled (random_state=123) to the size of the smaller one.
  The result lists the ham rows first, followed by the spam rows. When spam is the
  smaller (or equal) class its rows are kept as they are, in their original order.
  """
  ham_rows = df[df['Label']=='ham']
  spam_rows = df[df['Label']=='spam']
  #size of the smaller class; sampling more rows than a class has would raise
  target_size = min(len(ham_rows), len(spam_rows))

  ham_subset = ham_rows.sample(target_size, random_state=123)
  if len(spam_rows) > target_size:
    spam_subset = spam_rows.sample(target_size, random_state=123)
  else:
    spam_subset = spam_rows
  balanced_df = pd.concat([ham_subset, spam_subset])

  return balanced_df

balanced_df = create_balanced_dataset(df)
balanced_df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,2501
spam,2501


In [None]:
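#convert the Label column values into integer codes; pd.factorize numbers the labels in order of first appearance, and the ham rows come first in balanced_df, so ham -> 0 and spam -> 1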
balanced_df['Label'] = pd.factorize(balanced_df['Label'])[0]

In [None]:
#split the dataset into train, validation and test
def random_split(df, train_frac, val_frac):
  """Shuffle ``df`` (random_state=123, index reset) and cut it into three consecutive parts.

  The first ``train_frac`` of the rows form the training set, the next ``val_frac``
  the validation set, and whatever remains the test set. Returns (train, validation, test).
  """
  shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
  n_rows = len(shuffled)

  #row positions at which the training and validation parts end
  train_end = int(n_rows*train_frac)
  val_end = train_end + int(n_rows*val_frac)

  return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)

In [None]:
print(len(train_df), len(validation_df), len(test_df))

3501 500 1001


In [None]:
#save the dataset as csv format to use it later
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [None]:
from torch.utils.data import Dataset

#class to equalize the sequence length of all data input by padding eos tokenid
class padding_dataset(Dataset):
  """Dataset over a CSV file with 'Text' and 'Label' columns.

  Every text is tokenized up front. With ``max_length=None`` the longest encoded text
  defines the sequence length; otherwise texts are truncated to ``max_length``. Shorter
  sequences are right-padded with ``pad_token_id``. Items are (token ids, label) pairs
  of ``torch.long`` tensors.
  """

  def __init__(self, csv_file, tokenizer, max_length=None, pad_token_id=50256):
    self.data = pd.read_csv(csv_file)
    #pretokenized text
    self.encoded_texts = [tokenizer.encode(text) for text in self.data['Text']]

    if max_length is not None:
      self.max_length = max_length
      #truncate sequences length to match with maximum length
      self.encoded_texts = [ids[:self.max_length] for ids in self.encoded_texts]
    else:
      self.max_length = self._longest_encoded_length()

    #pad eos tokenid to sequences to match with maximum length
    self.encoded_text = []
    for ids in self.encoded_texts:
      n_pad = self.max_length - len(ids)
      self.encoded_text.append(ids + [pad_token_id] * n_pad)

  #returns encoded_text and label in tensor form
  def __getitem__(self, index):
    token_ids = torch.tensor(self.encoded_text[index], dtype=torch.long)
    label = torch.tensor(self.data.iloc[index]['Label'], dtype=torch.long)
    return token_ids, label

  def __len__(self):
    return len(self.data)

  #returns the sequence length containing max length
  def _longest_encoded_length(self):
    return max((len(ids) for ids in self.encoded_texts), default=0)


In [None]:
#build the training set; every message is truncated/padded to NEW_CONFIG["context_length"] tokens
train_dataset = padding_dataset(csv_file='train.csv', max_length=NEW_CONFIG["context_length"], tokenizer=tokenizer)
print(train_dataset.max_length)
len(train_dataset)

1024


3501

In [None]:
#test and validation sets reuse the training set's sequence length so all splits have the same input shape
test_dataset = padding_dataset(csv_file='test.csv', max_length=train_dataset.max_length, tokenizer=tokenizer)
val_dataset = padding_dataset(csv_file='validation.csv', max_length=train_dataset.max_length, tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 25

torch.manual_seed(123)

#wrap each split in a DataLoader; every batch holds batch_size (25) sequences of length max_length
#only the training loader shuffles and drops the last incomplete batch
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)

In [None]:
#peek at the first training batch to check the tensor shapes
#NOTE: the loop variable `input` shadows Python's built-in input() for the rest of the session
for input, target in train_loader:
  print(input.shape, target.shape)
  break

torch.Size([25, 1024]) torch.Size([25])


In [None]:
len(train_loader) #total training batches

140

In [None]:
#the pretrained model has to be adapted for classification finetuning:
#its output layer, which maps the hidden representation to the vocabulary size,
#is replaced by a smaller output layer that maps to two classes: 0 (not spam), 1 (spam)
print(gpt)

GPTmodel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_fe

In [None]:
#freeze every parameter of the gpt model before swapping in the new head
for param in gpt.parameters():
    param.requires_grad = False

#replace out_head with a fresh linear layer that maps emb_dim -> num_classes
torch.manual_seed(123)
num_classes = 2
gpt.out_head = torch.nn.Linear(NEW_CONFIG['emb_dim'], num_classes, bias=True).to(device)
#parameters of a newly created layer have requires_grad=True by default, so out_head is trainable
#by default require_grad is true for out_head

In [None]:
#set requires_grad=True on the last transformer block and on the final normalization layer so they are trained too
for param in gpt.trf_blocks[-1].parameters():
    param.requires_grad = True

for param in gpt.final_norm.parameters():
    param.requires_grad = True

In [None]:
#function to calculate accuracy
def calc_accuracy_loader(data_loader, model, device, num_batches=None):
  """Return the fraction of correctly classified examples over (part of) a data loader.

  The predicted class of each sequence is the argmax over the logits at the
  last token position. The model is switched to eval mode and left in it.

  Args:
    data_loader: iterable of (input_batch, target_batch) pairs that supports len().
    model: callable returning logits indexed as [batch, position, class].
    device: device the batches are moved to before the forward pass.
    num_batches: evaluate only the first num_batches batches; None means all.
      Values larger than len(data_loader) are clamped to the loader length.

  Returns:
    Accuracy as a float in [0, 1], or NaN when no example was evaluated
    (empty loader or num_batches == 0), like calc_loss_loader does for an
    empty loader.
  """
  model.eval()

  if num_batches is None:
    num_batches = len(data_loader)
  else:
    num_batches = min(num_batches, len(data_loader))

  correct_predictions, num_examples = 0, 0

  #zip stops as soon as range() is exhausted, so no batch beyond num_batches is loaded
  for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)

    with torch.no_grad():
      logits = model(input_batch)[:, -1, :]  #logits of the last token only
    predicted_labels = torch.argmax(logits, dim=-1)

    num_examples += predicted_labels.shape[0]
    correct_predictions += (predicted_labels == target_batch).sum().item()

  if num_examples == 0:
    #nothing was evaluated: report NaN instead of raising ZeroDivisionError
    return float("nan")

  return correct_predictions / num_examples


In [None]:
#function to calculate loss of a batch but only of last token
def calc_loss_batch(input_batch, target_batch, model, device):
    """Cross-entropy classification loss of one batch, using only the last-token logits.

    Both tensors are moved to device first; model(input) is indexed as
    [batch, position, class] and only the final position is scored against
    target_batch.
    """
    inputs = input_batch.to(device)
    labels = target_batch.to(device)
    last_token_logits = model(inputs)[:, -1, :]
    return torch.nn.functional.cross_entropy(last_token_logits, labels)

#function to calculate loss of given batches
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average calc_loss_batch over the first num_batches batches of data_loader.

    Returns NaN for an empty loader. num_batches=None means every batch; a value
    larger than len(data_loader) is clamped to the loader length.
    """
    loader_len = len(data_loader)
    if loader_len == 0:
        return float("nan")

    #never ask for more batches than the loader can deliver
    num_batches = loader_len if num_batches is None else min(num_batches, loader_len)

    running_loss = 0.
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        running_loss += calc_loss_batch(input_batch, target_batch, model, device).item()

    return running_loss / num_batches

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate train and validation loss on eval_iter batches each.

    Runs in eval mode without gradient tracking, then puts the model back into
    train mode. Returns (train_loss, val_loss).
    """
    model.eval()
    with torch.no_grad():
        #same estimate for both loaders, train first
        train_loss, val_loss = (
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return train_loss, val_loss

In [None]:
#function to train the model
def train_classifier(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter):
  """Finetune model as a classifier and record loss / accuracy along the way.

  Within each epoch, after every eval_freq-th batch (the batch counter restarts
  at 0 every epoch) the train and validation loss are estimated on eval_iter
  batches via evaluate_model and printed. After each epoch the accuracy on
  eval_iter batches of both loaders is printed and recorded.

  Returns:
    (train_losses, val_losses, train_acc, val_acc, examples_seen)
  """
  history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}
  examples_seen = 0

  for epoch in range(num_epochs):
    model.train()

    #step is the index of the batch within the current epoch
    for step, (input_batch, target_batch) in enumerate(train_loader):
      optimizer.zero_grad()
      batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
      batch_loss.backward()
      optimizer.step()
      examples_seen += input_batch.shape[0]

      if step % eval_freq != 0:
        continue

      train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
      history["train_loss"].append(train_loss)
      history["val_loss"].append(val_loss)
      print(f"Ep {epoch+1} (Step {step:4d}): Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    #calculate accuracy after each epoch
    train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
    val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
    print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
    print(f"Validation accuracy: {val_accuracy*100:.2f}%")
    history["train_acc"].append(train_accuracy)
    history["val_acc"].append(val_accuracy)

  return (history["train_loss"], history["val_loss"],
          history["train_acc"], history["val_acc"], examples_seen)

In [None]:
#finetune the classifier; the optimizer is handed all parameters of gpt, including those frozen earlier via requires_grad=False
gpt.to(device)
gpt.train()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(gpt.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 5
#losses are estimated on 5 batches every 50 training batches; accuracy is reported after every epoch
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier(gpt, train_loader, val_loader, optimizer, device,
                                                                                 num_epochs=num_epochs, eval_freq=50, eval_iter=5)

Ep 1 (Step    0): Train loss 0.833, Val loss 0.787
Ep 1 (Step   50): Train loss 0.658, Val loss 0.725
Ep 1 (Step  100): Train loss 0.514, Val loss 0.637
Training accuracy: 74.40% | Validation accuracy: 70.40%
Ep 2 (Step    0): Train loss 0.510, Val loss 0.585
Ep 2 (Step   50): Train loss 0.371, Val loss 0.454
Ep 2 (Step  100): Train loss 0.299, Val loss 0.396
Training accuracy: 87.20% | Validation accuracy: 83.20%
Ep 3 (Step    0): Train loss 0.320, Val loss 0.382
Ep 3 (Step   50): Train loss 0.302, Val loss 0.358
Ep 3 (Step  100): Train loss 0.218, Val loss 0.278
Training accuracy: 83.20% | Validation accuracy: 84.80%
Ep 4 (Step    0): Train loss 0.297, Val loss 0.320
Ep 4 (Step   50): Train loss 0.318, Val loss 0.315
Ep 4 (Step  100): Train loss 0.234, Val loss 0.317
Training accuracy: 94.40% | Validation accuracy: 91.20%
Ep 5 (Step    0): Train loss 0.176, Val loss 0.234
Ep 5 (Step   50): Train loss 0.175, Val loss 0.256
Ep 5 (Step  100): Train loss 0.230, Val loss 0.407
Training ac

In [None]:
#function to classify sms
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
  """Classify a single text as "spam" or "not spam".

  The text is tokenized, truncated and right-padded to one fixed length, and
  the class is the argmax over the logits at the last position.

  Args:
    text: message to classify.
    model: classifier exposing pos_emb.weight (its first dimension is used as
      the supported context length) and returning logits indexed as
      [batch, position, class].
    tokenizer: object with encode(text) returning a list of token ids.
    device: device on which the input tensor is created.
    max_length: sequence length to truncate and pad to, normally the length
      used during finetuning. None, or a value above the context length,
      falls back to the context length.
    pad_token_id: token id used for right-padding.

  Returns:
    "spam" when the predicted class is 1, otherwise "not spam".
  """
  model.eval()

  context_length = model.pos_emb.weight.shape[0]
  target_length = context_length if max_length is None else min(max_length, context_length)

  input_ids = tokenizer.encode(text)[:target_length]

  #pad to target_length (not the full context length) so max_length controls the final input size
  input_ids = input_ids + [pad_token_id] * (target_length - len(input_ids))
  input_tensors = torch.tensor(input_ids, device=device).unsqueeze(0) #add batch dimension

  with torch.no_grad():
    logits = model(input_tensors)[:, -1, :]
  predicted_label = torch.argmax(logits, dim=-1).item()

  return "spam" if predicted_label == 1 else "not spam"

In [None]:
#try the classifier on a prize-style message, padded to the same length as the training data
text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)

print(classify_review(
    text_1, gpt, tokenizer, device, max_length=train_dataset.max_length
))

spam


In [None]:
#try the classifier on an ordinary personal message
text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)

print(classify_review(
    text_2, gpt, tokenizer, device, max_length=train_dataset.max_length
))

not spam


In [None]:
#save the finetuned classifier weights to the working directory
torch.save(gpt.state_dict(), "review_classifier.pth")

In [None]:
#load the saved classifier weights back into gpt
model_state_dict = torch.load("review_classifier.pth")
gpt.load_state_dict(model_state_dict)

<All keys matched successfully>

# Instruction Finetuning using Pretrained weights

In [None]:
#configuration for the GPT model having 355 million parameters (used for the 'gpt2-medium' weights loaded later)
GPT_CONFIG_355M = {
    "vocab_size": 50257,       #number of tokens in the vocabulary
    "context_length": 1024,    #maximum sequence length (rows of the positional embedding table)
    "emb_dim": 1024,           #embedding / hidden dimension
    "n_heads": 16,             #attention heads per transformer block
    "n_layers": 24,            #number of transformer blocks
    "drop_rate": 0.1,          #dropout probability
    "qkv_bias": True           #use bias terms in the query/key/value projections
}


In [None]:
#instantiate the model from the 355M config and switch it to eval mode; the trailing ';' suppresses the cell output
gpt_medium = GPTmodel(GPT_CONFIG_355M)
gpt_medium.eval();

In [None]:
#load the pretrained gpt2-medium model from the huggingface transformers library
model_name = 'gpt2-medium'
#tokenizer loading is disabled; presumably the tokenizer created earlier in the notebook is reused - TODO confirm
# tokenizer = AutoTokenizer.from_pretrained(model_name)
model_new = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
#release unused cached GPU memory held by PyTorch's allocator before the larger model is moved to the device
torch.cuda.empty_cache()

In [None]:
#copy the huggingface weights into gpt_medium (load_weights_into_gpt is defined elsewhere in the notebook), then move it to the device
load_weights_into_gpt(gpt_medium,model_new)
gpt_medium.to(device)

GPTmodel(
  (token_emb): Embedding(50257, 1024)
  (pos_emb): Embedding(1024, 1024)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=1024, out_features=1024, bias=True)
        (W_k): Linear(in_features=1024, out_features=1024, bias=True)
        (W_v): Linear(in_features=1024, out_features=1024, bias=True)
        (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
          (1): GELU()
          (2): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features

In [None]:
import json
#load the instruction dataset from Google Drive: a JSON list of records with 'instruction', 'input' and 'output' keys
#alternative Drive location, kept for reference:
# with open('/content/drive/MyDrive/Dataset/email+sms+spam+collection/instruction-sms-data.json') as file:
with open('/content/drive/MyDrive/Khanakh/Dataset/email+sms+spam+collection/instruction-sms-data.json') as file:
  data = json.load(file)

In [None]:
#number of instruction records loaded
len(data)

1100

In [None]:
#inspect the first five records
data[:5]

[{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.',
  'input': 'freind --> friend',
  'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'},
 {'instruction': 'Edit the following sentence for grammar.',
  'input': 'He go to the park every day.',
  'output': 'He goes to the park every day.'},
 {'instruction': 'Convert 45 kilometers to meters.',
  'input': '',
  'output': '45 kilometers is 45000 meters.'},
 {'instruction': "Rewrite this sentence to start with 'Although': Despite the rain, they went for a walk.",
  'input': '',
  'output': 'Although it was raining, they went for a walk.'},
 {'instruction': 'What are the first 10 square numbers?',
  'input': '',
  'output': '1, 4, 9, 16, 25, 36, 49, 64, 81, 100.'}]

In [None]:
#converting text into alpaca format
def format_input(entry):
  """Build the Alpaca-style prompt (instruction plus optional input section) for one record.

  entry must provide the keys 'instruction' and 'input'. The '### Input:'
  section is left out when entry['input'] is empty. The response section is
  not part of the returned text.
  """
  prompt_parts = [
      f"Below is an instruction that describes a task. Write a response that appropriately completes "
      f"the request. \n\n### Instruction:\n{entry['instruction']}"
  ]

  #only add the input section when the record actually has an input
  if entry['input']:
    prompt_parts.append(f"\n\n### Input:\n{entry['input']}")

  return "".join(prompt_parts)

In [None]:
#splitting dataset into train-test-validation using consecutive slices (the records are not shuffled first)
train_portion = int(len(data)*0.7) # 70%
test_portion = int(len(data)*0.2) # 20%
val_portion = len(data) - train_portion - test_portion # remaining records, about 10%

train_data = data[:train_portion]
test_data = data[train_portion:train_portion+test_portion]
val_data = data[train_portion+test_portion:]

In [None]:
#sizes of the train, test and validation splits
print(len(train_data))
print(len(test_data))
print(len(val_data))

770
220
110


In [None]:
from torch.utils.data import Dataset

#class to create a dataset and pass it to dataloader
class InstructionDataset(Dataset):
  """Dataset of pre-tokenized instruction examples.

  Each item is a plain list of token ids for the prompt built by format_input
  followed by a '### Response:' section containing entry['output'].
  """

  def __init__(self, data, tokenizer):
    self.data = data

    #tokenize every record once, up front
    self.encoded_text = [
        tokenizer.encode(format_input(entry) + f"\n\n### Response:\n{entry['output']}")
        for entry in self.data
    ]

  def __len__(self):
    return len(self.data)

  def __getitem__(self, index):
    return self.encoded_text[index]


In [None]:
# function to create input target pairs, pad with eot tokens and set the extra pad targets to -100 so they are ignored by the loss
def custom_collate_fn(batch, device, pad_token_id=50256, ignore_index=-100, allowed_max_length=None):
  """Collate token-id lists into padded (input, target) tensors for next-token training.

  Every sequence gets one end-of-text token appended and is right-padded with
  pad_token_id to the length of the longest sequence in the batch. The target
  is the input shifted left by one token. In each target row the first pad
  token is kept (so the model learns to emit end-of-text) and every later pad
  token is replaced by ignore_index.

  Args:
    batch: non-empty list of token-id lists.
    device: device the stacked tensors are moved to.
    pad_token_id: id used both as end-of-text marker and as padding.
    ignore_index: value written over surplus pad targets.
    allowed_max_length: if given, inputs and targets are cut to at most this
      many tokens; None disables truncation.

  Returns:
    (input_tensors, target_tensors), both of shape
    [len(batch), min(longest sequence length, allowed_max_length)].
  """
  #+1 because every sequence is extended by one end-of-text token
  batch_max_length = max(len(item) + 1 for item in batch)
  input_lst, target_lst = [], []

  for item in batch:
    #append end-of-text and pad to the common length in one step (at least one pad token is always added)
    padded = list(item) + [pad_token_id] * (batch_max_length - len(item))
    inputs = torch.tensor(padded[:-1])
    targets = torch.tensor(padded[1:])

    #replace all pad tokens in the target with ignore_index except the first one
    pad_positions = torch.nonzero(targets == pad_token_id).flatten()
    if pad_positions.numel() > 1:
      targets[pad_positions[1:]] = ignore_index

    #truncate after padding so every row of the batch ends up with the same length
    if allowed_max_length is not None:
      inputs = inputs[:allowed_max_length]
      targets = targets[:allowed_max_length]

    input_lst.append(inputs)
    target_lst.append(targets)

  #stack the per-example tensors into batch tensors
  input_tensors = torch.stack(input_lst).to(device)
  target_tensors = torch.stack(target_lst).to(device)

  return input_tensors, target_tensors


In [None]:
from functools import partial
#pre-bind device and the maximum allowed length so the DataLoader can call the collate function with the batch alone
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length = GPT_CONFIG_355M['context_length'])

In [None]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 5

torch.manual_seed(123)

#NOTE: train_dataset / train_loader / test_* / val_* / batch_size reuse the names from the classification section and replace those objects
#only the training loader shuffles and drops the last incomplete batch; padding happens per batch inside the collate function
train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, collate_fn=customized_collate_fn,
                          shuffle=True, drop_last=True, num_workers=num_workers)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, collate_fn=customized_collate_fn,
                          shuffle=False, drop_last=False, num_workers=num_workers)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, collate_fn=customized_collate_fn,
                          shuffle=False, drop_last=False, num_workers=num_workers)

In [None]:
#number of training batches per epoch
len(train_loader)

154

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
  """Next-token cross-entropy loss of one batch, averaged over all non-ignored positions.

  Logits for every position are scored against target_batch; target entries
  equal to -100 are excluded from the loss.
  NOTE: this redefines the last-token classification calc_loss_batch from the
  classification section of the notebook.
  """
  inputs, targets = input_batch.to(device), target_batch.to(device)

  #merge the batch and position axes so each token is one classification example
  token_logits = model(inputs).flatten(0, 1)
  token_targets = targets.flatten()

  return torch.nn.functional.cross_entropy(token_logits, token_targets, ignore_index=-100)

def calc_loss_loader(data_loader, model, device, num_batches=None):
  """Mean of calc_loss_batch over the first num_batches batches of data_loader.

  An empty loader yields NaN. With num_batches=None all batches are used; a
  larger value than len(data_loader) is reduced to the loader length.
  NOTE: same logic as the calc_loss_loader defined in the classification section.
  """
  if len(data_loader) == 0:
    return float("nan")

  available = len(data_loader)
  limit = available if num_batches is None else min(num_batches, available)

  total_loss = 0.
  for position, (input_batch, target_batch) in enumerate(data_loader):
    if position >= limit:
      break
    batch_loss = calc_loss_batch(input_batch, target_batch, model, device)
    total_loss += batch_loss.item()

  return total_loss / limit

In [None]:
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter):
  """Instruction-finetune model and record the loss curves.

  Within each epoch, after every eval_freq-th batch (the batch counter restarts
  at 0 every epoch) the train and validation loss are estimated on eval_iter
  batches via evaluate_model, printed, and stored together with the number of
  input tokens processed so far.

  Returns:
    (train_losses, val_losses, track_tokens_seen)
  """
  train_losses, val_losses, track_tokens_seen = [], [], []
  tokens_seen = 0

  for epoch in range(num_epochs):
    model.train()

    #step is the index of the batch within the current epoch
    for step, (input_batch, target_batch) in enumerate(train_loader):
      optimizer.zero_grad()
      calc_loss_batch(input_batch, target_batch, model, device).backward()
      optimizer.step()
      tokens_seen += input_batch.numel()

      if step % eval_freq != 0:
        continue

      train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
      train_losses.append(train_loss)
      val_losses.append(val_loss)
      track_tokens_seen.append(tokens_seen)
      print(f"Ep {epoch+1} (Step {step:4d}): Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

  return train_losses, val_losses, track_tokens_seen

In [None]:
#freeze every parameter of gpt_medium, then re-enable training for selected parts only
for p in gpt_medium.parameters():
    p.requires_grad = False

#optional, currently disabled: also train the last transformer block
# # unfreeze last blocks
# for p in gpt_medium.trf_blocks[-1].parameters():
#     p.requires_grad = True

# unfreeze final norm + lm head (out_head)
for p in gpt_medium.final_norm.parameters():
    p.requires_grad = True

for p in gpt_medium.out_head.parameters():
    p.requires_grad = True

In [None]:
#instruction-finetune gpt_medium; the optimizer is handed all parameters, including those frozen via requires_grad=False
gpt_medium.to(device)
gpt_medium.train()
torch.manual_seed(123)

optimizer = torch.optim.AdamW(gpt_medium.parameters(), lr=0.00005, weight_decay=0.1)

num_epochs = 5

#losses are estimated on 10 batches every 10 training batches
train_losses, val_losses, tokens_seen = train_model(gpt_medium, train_loader, val_loader, optimizer, device,
                                                    num_epochs=num_epochs, eval_freq=10, eval_iter=10)

Ep 1 (Step    0): Train loss 3.884, Val loss 3.874
Ep 1 (Step   10): Train loss 3.575, Val loss 3.575
Ep 1 (Step   20): Train loss 3.285, Val loss 3.313
Ep 1 (Step   30): Train loss 3.074, Val loss 3.091
Ep 1 (Step   40): Train loss 2.831, Val loss 2.907
Ep 1 (Step   50): Train loss 2.717, Val loss 2.747
Ep 1 (Step   60): Train loss 2.562, Val loss 2.612
Ep 1 (Step   70): Train loss 2.430, Val loss 2.496
Ep 1 (Step   80): Train loss 2.356, Val loss 2.395
Ep 1 (Step   90): Train loss 2.269, Val loss 2.304
Ep 1 (Step  100): Train loss 2.129, Val loss 2.221
Ep 1 (Step  110): Train loss 2.121, Val loss 2.147
Ep 1 (Step  120): Train loss 2.026, Val loss 2.083
Ep 1 (Step  130): Train loss 1.862, Val loss 2.025
Ep 1 (Step  140): Train loss 1.830, Val loss 1.976
Ep 1 (Step  150): Train loss 1.856, Val loss 1.934
Ep 2 (Step    0): Train loss 1.832, Val loss 1.917
Ep 2 (Step   10): Train loss 1.791, Val loss 1.879
Ep 2 (Step   20): Train loss 1.723, Val loss 1.843
Ep 2 (Step   30): Train loss 1.

In [None]:
#continue training for another num_epochs epochs with the same model and optimizer objects;
#the returned histories overwrite train_losses / val_losses / tokens_seen from the previous run
train_losses, val_losses, tokens_seen = train_model(gpt_medium, train_loader, val_loader, optimizer, device,
                                                    num_epochs=num_epochs, eval_freq=10, eval_iter=10)

Ep 1 (Step    0): Train loss 0.826, Val loss 1.176
Ep 1 (Step   10): Train loss 0.831, Val loss 1.176
Ep 1 (Step   20): Train loss 0.832, Val loss 1.179
Ep 1 (Step   30): Train loss 0.745, Val loss 1.179
Ep 1 (Step   40): Train loss 0.841, Val loss 1.177
Ep 1 (Step   50): Train loss 0.855, Val loss 1.174
Ep 1 (Step   60): Train loss 0.798, Val loss 1.173
Ep 1 (Step   70): Train loss 0.815, Val loss 1.173
Ep 1 (Step   80): Train loss 0.762, Val loss 1.173
Ep 1 (Step   90): Train loss 0.697, Val loss 1.173
Ep 1 (Step  100): Train loss 0.704, Val loss 1.172
Ep 1 (Step  110): Train loss 0.774, Val loss 1.168
Ep 1 (Step  120): Train loss 0.823, Val loss 1.166
Ep 1 (Step  130): Train loss 0.720, Val loss 1.166
Ep 1 (Step  140): Train loss 0.781, Val loss 1.166
Ep 1 (Step  150): Train loss 0.788, Val loss 1.166
Ep 2 (Step    0): Train loss 0.771, Val loss 1.166
Ep 2 (Step   10): Train loss 0.755, Val loss 1.168
Ep 2 (Step   20): Train loss 0.809, Val loss 1.168
Ep 2 (Step   30): Train loss 0.

In [None]:
#save the instruction-finetuned weights to the working directory
torch.save(gpt_medium.state_dict(), "review_instruction1.pth")

In [None]:
#reload the instruction-finetuned checkpoint; the file name must match the one written by torch.save
#in the previous cell ("review_instruction1.pth"), otherwise a fresh run fails with FileNotFoundError
#map_location lets the checkpoint load onto whichever device is in use
model_state_dict = torch.load("review_instruction1.pth", map_location=device)
gpt_medium.load_state_dict(model_state_dict)

In [None]:
#generate a response for the prompt of the first training record (generate_and_print is defined elsewhere in the notebook)
start_context = format_input(train_data[0])
generate_and_print(gpt_medium, tokenizer, device, start_context)

Below is an instruction that describes a task. Write a response that appropriately completes the request.   ### Instruction: Evaluate the following phrase by transforming it into the spelling given.  ### Input: freind --> friend.  ### Response: The spelling of 'friend' is 'freind'
