In [None]:
# torch 2.9.0 requires sympy>=1.13.3; pinning sympy==1.12 downgrades the preinstalled 1.14.0 and breaks that requirement
!pip install -q "sympy>=1.13.3"

Collecting sympy==1.12
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.14.0
    Uninstalling sympy-1.14.0:
      Successfully uninstalled sympy-1.14.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.9.0+cu126 requires sympy>=1.13.3, but you have sympy 1.12 which is incompatible.[0m[31m
[0mSuccessfully installed sympy-1.12


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

In [None]:
#configuration for GPT model having 124 million parameters
#(GPTmodel in this notebook keeps a separate, untied output head, so the parameter count printed further down is 162,419,712)
GPT_CONFIG_124M = {
    "vocab_size": 50257, #rows of the token embedding table and width of the output logits
    "context_length": 256, #1024 -- rows of the position embedding table and side length of the causal mask
    "emb_dim": 768, #width of every token vector inside the model
    "n_heads": 12, #attention heads per block; emb_dim must be divisible by this
    "n_layers": 12, #number of stacked transformer blocks
    "drop_rate": 0.1, #dropout probability used for embeddings, attention weights and shortcut branches
    "qkv_bias": False #whether the query/key/value projections carry a bias term
}

In [None]:
"""
text
token id
token embedding
position embedding
input embedding
dropout
layer normalization
self-attention
dropout
residual connection
layer normalization
feed forward neural network
dropout
residual connection
layer normalization
output logits
"""

'\ntext\ntoken id\ntoken embedding\nposition embedding\ninput embedding\ndropout\nlayer normalization\nself-attention\ndropout\nresidual connection\nlayer normalization\nfeed forward neural network\ndropout\nresidual connection\nlayer normalization\noutput logits\n'

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
#class for multi-head causal self-attention
class MultiHeadAttention(torch.nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: size of the last dimension of the input.
        d_out: size of the last dimension of the output; split evenly across heads.
        context_length: side length of the pre-built causal mask.
        num_heads: number of attention heads (must divide ``d_out``).
        dropout: dropout probability applied to the attention weights.
        qkv_bias: whether the query/key/value projections use a bias.

    ``forward`` takes ``x`` of shape (b, num_tokens, d_in) and returns a
    tensor of shape (b, num_tokens, d_out).
    """

    def __init__(self, d_in, d_out, context_length, num_heads, dropout, qkv_bias):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_head"
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        self.W_q = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_k = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_v = torch.nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = torch.nn.Linear(d_out, d_out)
        self.dropout = torch.nn.Dropout(dropout)
        #upper triangle (above the diagonal) marks the future positions a token must not attend to
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))

    def forward(self, x):
        b, num_tokens, _ = x.shape

        key = self.W_k(x)
        query = self.W_q(x)
        value = self.W_v(x)

        #split the projected width into heads, then move the head axis forward:
        #(b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        key = key.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        query = query.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        #attention scores per head: (num_tokens, head_dim) @ (head_dim, num_tokens) = (num_tokens, num_tokens)
        attn_scores = query @ key.transpose(2, 3)

        #future positions get -inf so that softmax assigns them zero probability
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(mask_bool, -torch.inf)

        #scale by sqrt(head_dim) to keep the scores in a range where softmax is not saturated
        attn_weights = torch.softmax(attn_scores / self.head_dim**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        #(b, num_heads, num_tokens, num_tokens) @ (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        context_vector = (attn_weights @ value).transpose(1, 2)
        #merge the heads back into the OUTPUT width (num_heads * head_dim == d_out), not the input width
        context_vector = context_vector.contiguous().view(b, num_tokens, self.num_heads * self.head_dim)
        context_vector = self.out_proj(context_vector)

        return context_vector

#transformer block: attention and feed-forward sub-layers, each normalized first and wrapped in a residual connection
class TransformerBlock(torch.nn.Module):
    """One transformer layer built from the entries of ``cfg``.

    Reads ``emb_dim``, ``context_length``, ``n_heads``, ``drop_rate`` and
    ``qkv_bias`` from ``cfg``. ``forward`` returns a tensor with the same
    shape as its input.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        self.attn = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(width)
        self.norm2 = LayerNorm(width)
        self.drop_shortcut = torch.nn.Dropout(cfg['drop_rate'])

    def forward(self, x):
        #attention sub-layer: normalize -> attend -> dropout, then add the untouched input back
        attended = self.drop_shortcut(self.attn(self.norm1(x)))
        x = attended + x

        #feed-forward sub-layer follows the same normalize -> transform -> dropout -> add pattern
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x

#layer normalization over the last dimension with a learnable per-feature scale and shift
class LayerNorm(torch.nn.Module):
    """Normalize the last dimension to zero mean and unit (biased) variance.

    ``scale`` starts at ones and ``shift`` at zeros, so a freshly built layer
    returns the plain normalized input.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  #added to the variance so the division never hits zero
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mu = x.mean(dim=-1, keepdim=True)
        sigma = torch.sqrt(x.var(dim=-1, keepdim=True, unbiased=False) + self.eps)
        normalized = (x - mu) / sigma
        return self.scale * normalized + self.shift

#GELU activation function, computed with the tanh approximation
class GELU(torch.nn.Module):
    """Element-wise GELU: 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))."""

    def forward(self, x):
        coeff = torch.sqrt(torch.tensor(2.0 / torch.pi))
        inner = coeff * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))

#feed forward neural network: expand to 4x the embedding width, apply GELU, project back
class FeedForward(torch.nn.Module):
    """Two linear layers with a GELU in between; input and output width are ``cfg['emb_dim']``."""

    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        hidden = 4 * width
        expand = torch.nn.Linear(width, hidden)
        activation = GELU()
        contract = torch.nn.Linear(hidden, width)
        #positions 0 and 2 of this Sequential are the two linear layers
        self.layers = torch.nn.Sequential(expand, activation, contract)

    def forward(self, x):
        return self.layers(x)

class GPTmodel(torch.nn.Module):
    """GPT-style language model assembled from the entries of ``cfg``.

    Pipeline: token embedding + position embedding -> dropout ->
    ``n_layers`` transformer blocks -> final layer norm -> linear output head.
    ``forward`` maps token ids of shape (batch_size, seq_length) to logits of
    shape (batch_size, seq_length, vocab_size).
    """

    def __init__(self, cfg):
        super().__init__()
        width, vocab = cfg['emb_dim'], cfg['vocab_size']
        self.token_emb = torch.nn.Embedding(vocab, width)
        self.pos_emb = torch.nn.Embedding(cfg['context_length'], width)
        self.drop_emb = torch.nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = torch.nn.Sequential(*blocks)

        self.final_norm = LayerNorm(width)
        self.out_head = torch.nn.Linear(width, vocab, bias=False)

    def forward(self, in_idx):
        _, seq_length = in_idx.shape
        #positions 0..seq_length-1, created on the same device as the token ids
        positions = torch.arange(seq_length, device=in_idx.device)
        hidden = self.drop_emb(self.token_emb(in_idx) + self.pos_emb(positions))
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden) #shape: (batch_size, seq_length, vocab_size)
GPT_CONFIG_124M

{'vocab_size': 50257,
 'context_length': 256,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [None]:
# function to convert text to a batch of token ids (token_ids_to_text does the reverse)
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return the ids as a tensor of shape (1, num_tokens).

    The literal ``<|endoftext|>`` marker is allowed through as a special token.
    """
    ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(ids).unsqueeze(0) # add batch dimension

def token_ids_to_text(token_ids, tokenizer):
    """Decode a batch-of-one tensor of token ids back into a string."""
    ids = token_ids.squeeze(0).tolist() # remove batch dimension
    return tokenizer.decode(ids)

In [None]:
text1 = 'Hello! I am'
tokenizer = tiktoken.get_encoding("gpt2")
input_ids=text_to_token_ids(text1, tokenizer)
print(input_ids)
input_ids.shape

tensor([[15496,     0,   314,   716]])


torch.Size([1, 4])

In [None]:
#generating text from the model
def generate_text(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively append up to ``max_new_tokens`` token ids to ``idx``.

    Args:
        model: callable mapping ids of shape (batch, n_tokens) to logits of
            shape (batch, n_tokens, vocab_size).
        idx: starting token ids, shape (batch, n_tokens).
        max_new_tokens: maximum number of tokens to append.
        context_size: only the last ``context_size`` tokens are fed to the model.
        temperature: ``0.0`` (default) picks the arg-max token; a value > 0
            divides the logits by it and samples from the softmax.
        top_k: if given, only the ``top_k`` largest logits of each row stay
            eligible (values larger than the vocabulary are clamped).
        eos_id: if given, generation stops, without appending the token, as
            soon as every row of the batch produces ``eos_id`` in the same step.

    Returns:
        Tensor of shape (batch, n_tokens + number_of_generated_tokens).
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :] #only the prediction for the last position matters

        if top_k is not None: #taking top k logits and setting rest to -inf
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k, dim=-1)
            #keep the trailing dim -> (batch, 1) so every row is compared with its OWN k-th largest logit
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0: #applying temperature scaling and sampling from the distribution
            probs = torch.softmax(logits / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1) #(batch, 1)
        else:
            next_token = torch.argmax(logits, dim=-1, keepdim=True) #(batch, 1)

        #reduce with .all() so the check is a single bool for any batch size
        if eos_id is not None and bool((next_token == eos_id).all()):
            break

        idx = torch.cat((idx, next_token), dim=1) #(batch, n_tokens+1)
    return idx

In [None]:
torch.manual_seed(123)
model = GPTmodel(GPT_CONFIG_124M)
model.eval() # set the model to evaluation mode , stops acting like training, dropout works as identity
result_tokens = generate_text(model=model, idx=input_ids, max_new_tokens=10, context_size=GPT_CONFIG_124M['context_length'])
print(result_tokens)
decoded_output = tokenizer.decode(result_tokens[0].tolist())
print(decoded_output)

tensor([[15496,     0,   314,   716, 13240, 11381,  4307,  7640, 33491, 12254,
         26050,  8942, 44168, 35735]])
Hello! I am Laur inhab Distrinereplacefly279 Burn issuerurnal


In [None]:
tokenizer.eot_token

50256

In [None]:
#function to sample a continuation of start_context from the model and print it on one line
def generate_and_print(model, tokenizer, device, start_context):
    """Print up to 50 sampled tokens that continue ``start_context``.

    The model is put in eval mode while sampling and in train mode before
    returning. The context window is taken from the number of rows of
    ``model.pos_emb``. Nothing is returned.
    """
    model.eval()
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        sampled = generate_text(
            model,
            prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
            temperature=0.8,
            top_k=50,
            eos_id=tokenizer.eot_token,
        )
    #newlines are flattened so each sample occupies a single line of output
    single_line = token_ids_to_text(sampled, tokenizer).replace("\n", " ")
    print(single_line)
    model.train()

In [None]:
#will create now dataset and dataloader using bpe_tokenizer and the verdict text file
#we will use this class in dataloader to create batches of data
class GPTDatasetv1(Dataset):
    """Map-style dataset of (input, target) token windows for next-token prediction.

    ``text`` is tokenized once. A window of ``max_len`` ids is taken every
    ``stride`` tokens; the target of each window is the same window shifted
    one token to the right. Only full windows that still have a following
    target token are kept, so a text of at most ``max_len`` tokens yields an
    empty dataset.

    Args:
        text: raw text to tokenize.
        tokenizer: object with ``encode(text, allowed_special=...)`` returning a list of ids.
        max_len: number of tokens per window.
        stride: step, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_len, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

        #stop max_len tokens before the end so the shifted target never runs past the data
        for i in range(0, len(token_ids)-max_len, stride):
            input_id = token_ids[i:i+max_len]
            target_id = token_ids[i+1:i+max_len+1]
            self.input_ids.append(torch.tensor(input_id))
            self.target_ids.append(torch.tensor(target_id))

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input_ids, target_ids) pair, each of length ``max_len``."""
        return self.input_ids[idx], self.target_ids[idx]

#dataloader function: tokenize text with the gpt2 encoding, window it and batch the windows
def create_dataloaderv1(text, batch_size=4, max_len=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over ``GPTDatasetv1`` windows of ``text``.

    ``max_len`` and ``stride`` are forwarded to the dataset; the remaining
    arguments are forwarded to ``torch.utils.data.DataLoader`` unchanged.
    """
    windows = GPTDatasetv1(text, tiktoken.get_encoding("gpt2"), max_len, stride)
    return torch.utils.data.DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
#Dataset is now stored in variable content
# with open("/content/drive/MyDrive/Dataset/the-verdict.txt") as file:
with open("/content/drive/MyDrive/Khanakh/Dataset/the-verdict.txt") as file:
    content = file.read()
tokenizer = tiktoken.get_encoding("gpt2")
total_tokens = len(tokenizer.encode(content))
total_characters = len(content)
print(f"total tokens: {total_tokens}, total characters: {total_characters}")

total tokens: 5145, total characters: 20479


In [None]:
#training and validation dataloaders
#the split is done on token ids (first 90% for training, the rest for validation)
train_ratio = 0.9
tokens = tokenizer.encode(content)
split = int(train_ratio * len(tokens))

train_tokens = tokens[:split]
val_tokens = tokens[split:]

#create_dataloaderv1 takes raw text and tokenizes it itself, so both halves are decoded back to strings
train_data = tokenizer.decode(train_tokens)
val_data = tokenizer.decode(val_tokens)

#seed before building the loaders so the shuffled training order is repeatable
torch.manual_seed(123)

#stride equals max_len, so consecutive windows do not overlap
train_dataloader = create_dataloaderv1(text=train_data, batch_size=2, max_len=GPT_CONFIG_124M['context_length'],
                                    stride=GPT_CONFIG_124M['context_length'], shuffle=True, drop_last=True, num_workers=0)

#validation keeps a fixed order and also keeps a final partial batch (drop_last=False)
val_dataloader = create_dataloaderv1(text=val_data, batch_size=2, max_len=GPT_CONFIG_124M['context_length'],
                                    stride=GPT_CONFIG_124M['context_length'], shuffle=False, drop_last=False, num_workers=0)

In [None]:
#print the input and target shape of every training batch
for x,y in train_dataloader:
    print(x.shape, y.shape)
#one line is printed per batch; each batch holds an input tensor and a target tensor of shape (batch_size, max_len), here (2, 256): 2 sequences of 256 tokens each

torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [None]:
#function to calculate the cross entropy loss of one batch
def compute_loss(input_batch, target_batch, model, device):
    """Return the mean cross-entropy between ``model(input_batch)`` and ``target_batch``.

    Both batches are moved to ``device`` first. The result is a scalar tensor
    that still carries gradient information.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)
    predictions = model(inputs)
    #merge the batch and sequence dimensions so that every token position counts as one classification example
    return torch.nn.functional.cross_entropy(predictions.flatten(0, 1), targets.flatten())

#calculate average loss for specified number of batches from dataloader
def calc_loss_loader(dataloader, model, device, num_batches=None):
    """Average ``compute_loss`` over the first ``num_batches`` batches of ``dataloader``.

    Args:
        dataloader: sized iterable of (input_batch, target_batch) pairs.
        model: model passed through to ``compute_loss``.
        device: device passed through to ``compute_loss``.
        num_batches: how many batches to average over; ``None`` (default)
            means all of them, larger values are capped at ``len(dataloader)``.

    Returns:
        The mean loss as a Python float, or ``nan`` when there is nothing to
        average (empty loader or ``num_batches`` <= 0).
    """
    if len(dataloader) == 0:
        return float('nan')

    if num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches, len(dataloader))
    if num_batches <= 0:
        return float('nan')

    #the loop runs for BOTH branches above (it used to be nested under the else,
    #which left the result undefined when num_batches was None)
    total_loss = 0.0
    for i, (input_batch, target_batch) in enumerate(dataloader):
        if i >= num_batches:
            break
        total_loss += compute_loss(input_batch, target_batch, model, device).item()
    return total_loss / num_batches

#function to measure training and validation loss on at most eval_iter batches of each loader
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` computed in eval mode without gradients.

    The model is switched back to train mode before returning.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()
    return train_loss, val_loss

In [None]:
#training loop for llm model
def train_model(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context, tokenizer):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every ``eval_freq`` optimizer steps the train/validation loss is measured
    on at most ``eval_iter`` batches, recorded and printed. After each epoch a
    sample continuation of ``start_context`` is printed.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per evaluation; the last holds the cumulative number of
        input tokens processed at that point.
    """
    train_history, val_history, token_history = [], [], []
    tokens_seen = 0
    global_step = 0

    for epoch in range(num_epochs):
        model.train() #generate_and_print and evaluate_model toggle the mode, so re-assert it each epoch
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad() #drop gradients left over from the previous step
            batch_loss = compute_loss(input_batch, target_batch, model, device)
            batch_loss.backward() #backpropagate to fill the gradients
            optimizer.step() #apply the parameter update
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq != 0:
                continue

            #periodic evaluation: record and report both losses
            train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
            train_history.append(train_loss)
            val_history.append(val_loss)
            token_history.append(tokens_seen)
            print(f"Epoch {epoch+1}, step {global_step}: train_loss = {train_loss:.3f}, val_loss = {val_loss:.3f}, tokens_seen = {tokens_seen}")

        #qualitative check at the end of every epoch
        generate_and_print(model, tokenizer, device, start_context)

    return train_history, val_history, token_history


In [None]:
#train the model
torch.manual_seed(123)
model = GPTmodel(GPT_CONFIG_124M).to(device)
#lr was 0.004, ten times the AdamW rate used in the checkpoint cells further down; with it the
#logged train loss jumps around between evaluations, so the smaller rate is used here as well
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.01)
num_epochs = 10

train_losses, val_losses, tokens_seen = train_model(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=4, #evaluate every 4 optimizer steps ...
    eval_iter=4, #... on at most 4 batches per loader
    start_context="Every effort moves you",
    tokenizer=tokenizer,
)


Epoch 1, step 2: train_loss = 8.618, val_loss = 8.164, tokens_seen = 1024
Epoch 1, step 4: train_loss = 7.466, val_loss = 7.590, tokens_seen = 2048
Epoch 1, step 6: train_loss = 8.427, val_loss = 8.799, tokens_seen = 3072
Epoch 1, step 8: train_loss = 8.495, val_loss = 8.532, tokens_seen = 4096
Every effort moves you fact not eyes would in..I it her- that And ofis.. in him he. a in not who notthe would St that it been. G G--.roud? he her".I in out- he It painting
Epoch 2, step 10: train_loss = 7.379, val_loss = 8.328, tokens_seen = 5120
Epoch 2, step 12: train_loss = 7.697, val_loss = 8.376, tokens_seen = 6144
Epoch 2, step 14: train_loss = 6.972, val_loss = 8.505, tokens_seen = 7168
Epoch 2, step 16: train_loss = 6.948, val_loss = 8.401, tokens_seen = 8192
Epoch 2, step 18: train_loss = 6.676, val_loss = 8.397, tokens_seen = 9216
Every effort moves you my my":. was he. it.. it.--burn to the as had. a. to I of with was't all-- my, it was to work I I had it it. about" I my no a a you
Ep

In [None]:
model.to(device)
torch.manual_seed(123)
generate_and_print(model, tokenizer, device, start_context='Every effort moves you')

Every effort moves you the his the,, me, theYes the  wife, of have. " it of.   " I to the    little, on,'s you.   Oh the,. Yes he- the


Saving and loading model weights

In [None]:
#saves the model weight to given path
model = GPTmodel(GPT_CONFIG_124M)
torch.save(model.state_dict(), 'gpt2_rand_weights.pth')

In [None]:
#load the model weights into new gptmodel model instance
model = GPTmodel(GPT_CONFIG_124M)
model.load_state_dict(torch.load('gpt2_rand_weights.pth'))

<All keys matched successfully>

In [None]:
#to save both model weights and optimizer parameters
model = GPTmodel(GPT_CONFIG_124M)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

torch.save({
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    },
    "model_and_optimizer.pth"
)

In [None]:
#loads both model weights and optimizer parameter for each weight
checkpoint = torch.load("model_and_optimizer.pth")
model = GPTmodel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train()

GPTmodel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_

In [None]:
total_params = sum(p.numel() for p in model.parameters())
total_params

162419712

# Loading Pretrained weights

In [None]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np

In [None]:
#load pretrained gpt2 model and tokenizer from huggingface transformers library
model_name = 'gpt2'
# tokenizer = AutoTokenizer.from_pretrained(model_name)
model_new = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
param = model_new.state_dict()
for key in param:
  print(key, "", param[key].shape)

transformer.wte.weight  torch.Size([50257, 768])
transformer.wpe.weight  torch.Size([1024, 768])
transformer.h.0.ln_1.weight  torch.Size([768])
transformer.h.0.ln_1.bias  torch.Size([768])
transformer.h.0.attn.c_attn.weight  torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias  torch.Size([2304])
transformer.h.0.attn.c_proj.weight  torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias  torch.Size([768])
transformer.h.0.ln_2.weight  torch.Size([768])
transformer.h.0.ln_2.bias  torch.Size([768])
transformer.h.0.mlp.c_fc.weight  torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias  torch.Size([3072])
transformer.h.0.mlp.c_proj.weight  torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias  torch.Size([768])
transformer.h.1.ln_1.weight  torch.Size([768])
transformer.h.1.ln_1.bias  torch.Size([768])
transformer.h.1.attn.c_attn.weight  torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias  torch.Size([2304])
transformer.h.1.attn.c_proj.weight  torch.Size([768, 768])
transformer.h.

In [None]:
#configuration for GPT model having 124 million parameters
#these values have to line up with the pretrained gpt2 tensors printed above, because load_weights_into_gpt copies them in place
NEW_CONFIG = {
    "vocab_size": 50257, #matches transformer.wte.weight (50257, 768)
    "context_length": 1024, #matches transformer.wpe.weight (1024, 768)
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.2, #only matters in train mode; the model is put in eval mode below
    "qkv_bias": True #the pretrained attention projections have biases (c_attn.bias), so W_q/W_k/W_v need them too
}
gpt = GPTmodel(NEW_CONFIG)
gpt.eval(); #trailing semicolon suppresses the module repr in the cell output

In [None]:
model_new.transformer.h[0].mlp.c_proj.bias[:5]
len(model_new.transformer.h)
model_new.transformer.h[0].attn.c_attn.weight.shape

torch.Size([768, 2304])

In [None]:
def load_weights_into_gpt(gpt, model_new):
    """Copy the weights of a Hugging Face GPT-2 model into a ``GPTmodel`` in place.

    Args:
        gpt: destination ``GPTmodel``; its tensors are overwritten with ``copy_``,
            so their shapes have to be compatible with the source tensors.
        model_new: source model exposing ``transformer.wte``, ``transformer.wpe``,
            ``transformer.h[i]`` and ``transformer.ln_f`` as printed earlier in
            this notebook.

    Raises:
        ValueError: if the two models have a different number of transformer
            blocks, or if ``gpt`` was built without query/key/value biases
            (the source has them, so they could not be loaded).

    Notes:
        * The source stores its linear weights as (in_features, out_features),
          while ``torch.nn.Linear`` stores (out_features, in_features); hence
          the transposes.
        * The source has no separate output-head tensor; the token embedding
          matrix is copied into ``gpt.out_head`` as well.
    """
    src = model_new.transformer

    #validate everything before the first copy so a failure never leaves gpt half-loaded
    if len(gpt.trf_blocks) != len(src.h):
        raise ValueError(
            f"layer count mismatch: gpt has {len(gpt.trf_blocks)} blocks, pretrained model has {len(src.h)}"
        )
    for block in gpt.trf_blocks:
        if block.attn.W_q.bias is None:
            raise ValueError("gpt must be built with qkv_bias=True to receive the pretrained attention biases")

    with torch.no_grad():
        gpt.token_emb.weight.copy_(src.wte.weight)
        gpt.pos_emb.weight.copy_(src.wpe.weight)

        for block, layer in zip(gpt.trf_blocks, src.h):
            attn = block.attn
            width = attn.W_q.weight.shape[0]

            #c_attn holds query, key and value side by side along the last dimension
            q_w, k_w, v_w = torch.split(layer.attn.c_attn.weight, width, dim=-1)
            attn.W_q.weight.copy_(q_w.T)
            attn.W_k.weight.copy_(k_w.T)
            attn.W_v.weight.copy_(v_w.T)

            q_b, k_b, v_b = torch.split(layer.attn.c_attn.bias, width, dim=-1)
            attn.W_q.bias.copy_(q_b)
            attn.W_k.bias.copy_(k_b)
            attn.W_v.bias.copy_(v_b)

            attn.out_proj.weight.copy_(layer.attn.c_proj.weight.T)
            attn.out_proj.bias.copy_(layer.attn.c_proj.bias)

            #feed-forward: layers[0] is the expansion, layers[2] the projection back
            block.ff.layers[0].weight.copy_(layer.mlp.c_fc.weight.T)
            block.ff.layers[0].bias.copy_(layer.mlp.c_fc.bias)
            block.ff.layers[2].weight.copy_(layer.mlp.c_proj.weight.T)
            block.ff.layers[2].bias.copy_(layer.mlp.c_proj.bias)

            block.norm1.scale.copy_(layer.ln_1.weight)
            block.norm1.shift.copy_(layer.ln_1.bias)
            block.norm2.scale.copy_(layer.ln_2.weight)
            block.norm2.shift.copy_(layer.ln_2.bias)

        gpt.final_norm.scale.copy_(src.ln_f.weight)
        gpt.final_norm.shift.copy_(src.ln_f.bias)
        gpt.out_head.weight.copy_(src.wte.weight)


In [None]:
load_weights_into_gpt(gpt,model_new)
gpt.to(device)

GPTmodel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_fe

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
torch.manual_seed(123)
generated_tokens = generate_text(gpt, text_to_token_ids("Every effort moves you", tokenizer).to(device), max_new_tokens=50,
                                context_size=NEW_CONFIG["context_length"], temperature=0.8, top_k=50, eos_id=tokenizer.eot_token)

output = token_ids_to_text(generated_tokens, tokenizer)
print(output)

Every effort moves you as far as the eye can see. (That's because you're not going to be able to see it all.) I think the most interesting thing about this is that there's an easy way to avoid it. You can get that with a couple
