In [1]:
!git version

git version 2.34.1


In [2]:
!git config --global user.name "Manas Agarwal"
!git config --global user.email "manasmrt10@gmail.com"

In [3]:
!git clone https://github.com/Manas2001Agarwal/LLM_from_scratch.git

Cloning into 'LLM_from_scratch'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 88 (delta 31), reused 55 (delta 10), pack-reused 0 (from 0)[K
Receiving objects: 100% (88/88), 634.98 KiB | 2.41 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [4]:
%cd /content/LLM_from_scratch/Instruction_Fine_Tuning

/content/LLM_from_scratch/Instruction_Fine_Tuning


In [5]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


##### Downloading the dataset

In [6]:
import json
import os
import urllib
import torch

In [7]:
def download_and_load_file(file_path, url):

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data

In [8]:
file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [9]:
data[50]

{'instruction': 'Identify the correct spelling of the following word.',
 'input': 'Ocassion',
 'output': "The correct spelling is 'Occasion.'"}

In [10]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )

    input_text = f"\n\n### Input:\n{entry['input']}" if entry['input'] else ''
    return instruction_text + input_text

In [11]:
def return_response(entry):
    response = f"\n\n### Response:\n{entry['output']}"
    return response

print(format_input(data[50]) + return_response(data[50]))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


Splitting the Dataset

In [12]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.10)

train_data = data[:train_portion]
test_data = data[train_portion:train_portion+test_portion]
validation_data = data[train_portion+test_portion:]

print(len(train_data))
print(len(test_data))
print(len(validation_data))

935
110
55


In [13]:
from torch.utils.data import Dataset,DataLoader

class InstructionDataset(Dataset):
    def __init__(self,data,tokenizer):
        self.data = data
        self.encoded_texts = []
        for entry in self.data:
            instruction_plus_input = format_input(entry)
            response = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response
            self.encoded_texts.append(tokenizer.encode(full_text))

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [14]:
def custom_collate_fn(batch,pad_token = 50256,device = "cpu",ignore_index = -100,allowed_max_length = None):
    input_lst, target_lst = [],[]
    batch_max_length = max([len(input_tokens)+1 for input_tokens in batch])
    for item in batch:
        new_item = item.copy()
        new_item += [pad_token]

        padded = new_item + [pad_token]*(batch_max_length-len(new_item))
        input_padded = torch.tensor(padded[:-1])
        target_padded = torch.tensor(padded[1:])

        mask = (target_padded==pad_token)
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            target_padded[indices[1:]] = ignore_index

        if allowed_max_length is not None:
            target_padded = target_padded[:allowed_max_length]
            input_padded = input_padded[:allowed_max_length]

        input_lst.append(input_padded)
        target_lst.append(target_padded)

    input_lst = torch.stack(input_lst).to(device)
    target_lst = torch.stack(target_lst).to(device)

    return input_lst,target_lst


In [15]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]
batch = (
    inputs_1,
    inputs_2,
    inputs_3
)
custom_collate_fn(batch)

(tensor([[    0,     1,     2,     3,     4],
         [    5,     6, 50256, 50256, 50256],
         [    7,     8,     9, 50256, 50256]]),
 tensor([[    1,     2,     3,     4, 50256],
         [    6, 50256,  -100,  -100,  -100],
         [    8,     9, 50256,  -100,  -100]]))

In [None]:
# if torch.backends.mps.is_available():
#     device = torch.device("mps")

# print(device)

In [16]:
device = torch.device("cuda:0")

In [17]:
from functools import partial

customized_collate_func = partial(
    custom_collate_fn,
    device = device,
    allowed_max_length = 1024
)

In [18]:
from torch.utils.data import DataLoader
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

num_workers = 0
batch_size = 8
torch.manual_seed(123)

train_dataset = InstructionDataset(train_data,tokenizer)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_func,
    shuffle=True,
    drop_last=True,
    num_workers=num_workers
)
val_dataset = InstructionDataset(validation_data,tokenizer)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_func,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

test_dataset = InstructionDataset(test_data,tokenizer)
val_loader = DataLoader(
    dataset=test_dataset,
    batch_size = batch_size,
    collate_fn=customized_collate_func,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)

In [19]:
print("Validation Loader")
for input_token,target_tokens in val_loader:
    print(input_token.size(),target_tokens.size())

Validation Loader
torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 62]) torch.Size([8, 62])
torch.Size([8, 58]) torch.Size([8, 58])
torch.Size([8, 66]) torch.Size([8, 66])
torch.Size([8, 63]) torch.Size([8, 63])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 67]) torch.Size([8, 67])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 73]) torch.Size([8, 73])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 85]) torch.Size([8, 85])
torch.Size([6, 76]) torch.Size([6, 76])


Loading the Pretrained Model

In [20]:
from transformers import GPT2Model

model_names = {
    "gpt2-small (124M)": "openai-community/gpt2",
    "gpt2-medium (355M)": "openai-community/gpt2-medium",
    "gpt2-large (774M)": "openai-community/gpt2-large",
    "gpt2-xl (1558M)": "openai-community/gpt2-xl"
}

CHOOSE_MODEL = "gpt2-medium (355M)"

gpt_hf = GPT2Model.from_pretrained(model_names[CHOOSE_MODEL],cache_dir='checkpoints')
gpt_hf.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

GPT2Model(
  (wte): Embedding(50257, 1024)
  (wpe): Embedding(1024, 1024)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-23): 24 x GPT2Block(
      (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D(nf=3072, nx=1024)
        (c_proj): Conv1D(nf=1024, nx=1024)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=4096, nx=1024)
        (c_proj): Conv1D(nf=1024, nx=4096)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
)

In [21]:
BASE_CONFIG = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # Context length
    "drop_rate": 0.0,       # Dropout rate
    "qkv_bias": True        # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}


BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

In [22]:
def assign_check(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(right.clone().detach())

import numpy as np

def load_weights(gpt, gpt_hf):

    d = gpt_hf.state_dict()

    gpt.pos_emb.weight = assign_check(gpt.pos_emb.weight, d["wpe.weight"])
    gpt.tok_emb.weight = assign_check(gpt.tok_emb.weight, d["wte.weight"])

    for b in range(BASE_CONFIG["n_layers"]):
        q_w, k_w, v_w = np.split(d[f"h.{b}.attn.c_attn.weight"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign_check(gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign_check(gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign_check(gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(d[f"h.{b}.attn.c_attn.bias"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign_check(gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign_check(gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign_check(gpt.trf_blocks[b].att.W_value.bias, v_b)


        gpt.trf_blocks[b].att.out_proj.weight = assign_check(gpt.trf_blocks[b].att.out_proj.weight, d[f"h.{b}.attn.c_proj.weight"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign_check(gpt.trf_blocks[b].att.out_proj.bias, d[f"h.{b}.attn.c_proj.bias"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign_check(gpt.trf_blocks[b].ff.layers[0].weight, d[f"h.{b}.mlp.c_fc.weight"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign_check(gpt.trf_blocks[b].ff.layers[0].bias, d[f"h.{b}.mlp.c_fc.bias"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign_check(gpt.trf_blocks[b].ff.layers[2].weight, d[f"h.{b}.mlp.c_proj.weight"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign_check(gpt.trf_blocks[b].ff.layers[2].bias, d[f"h.{b}.mlp.c_proj.bias"])

        gpt.trf_blocks[b].norm1.scale = assign_check(gpt.trf_blocks[b].norm1.scale, d[f"h.{b}.ln_1.weight"])
        gpt.trf_blocks[b].norm1.shift = assign_check(gpt.trf_blocks[b].norm1.shift, d[f"h.{b}.ln_1.bias"])
        gpt.trf_blocks[b].norm2.scale = assign_check(gpt.trf_blocks[b].norm2.scale, d[f"h.{b}.ln_2.weight"])
        gpt.trf_blocks[b].norm2.shift = assign_check(gpt.trf_blocks[b].norm2.shift, d[f"h.{b}.ln_2.bias"])

        gpt.final_norm.scale = assign_check(gpt.final_norm.scale, d[f"ln_f.weight"])
        gpt.final_norm.shift = assign_check(gpt.final_norm.shift, d[f"ln_f.bias"])
        gpt.out_head.weight = assign_check(gpt.out_head.weight, d["wte.weight"])

In [23]:
%pwd
%cd ..

/content/LLM_from_scratch


In [24]:
from making_LLM_from_scratch.Scripts.gpt_archietecture import GPTModel
gpt = GPTModel(BASE_CONFIG)

load_weights(gpt, gpt_hf)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [25]:
from making_LLM_from_scratch.Scripts.training_gpt import training_gpt, calc_loss_loader
gpt.to(device)
torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader,gpt,device,num_batches=5)
    val_loss = calc_loss_loader(val_loader,gpt,device,num_batches=5)

print(train_loss)
print(val_loss)

3.825909376144409
3.915006160736084


In [27]:
PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0
import time

import torch.optim.optimizer
start_time = time.time()
torch.manual_seed(123)

device = torch.device("cuda:0")

optimizer = torch.optim.AdamW(
    gpt.parameters(),lr = 0.00005, weight_decay=0.1
)
num_epochs = 2
train_losses, val_losses, tokens_seen = training_gpt( gpt, train_loader, val_loader, optimizer,
                                                     device, num_epochs=num_epochs, eval_freq=5,
                                                     eval_iter=5,
                                                     start_context=format_input(validation_data[0]),
                                                     tokenizer=tokenizer)
end_time = time.time()
execution_time_minutes = (start_time - end_time)/60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")

Ep 1 (Step 000000): Train loss 2.624, Val loss 2.649
Ep 1 (Step 000005): Train loss 1.072, Val loss 1.088
Ep 1 (Step 000010): Train loss 0.908, Val loss 0.929
Ep 1 (Step 000015): Train loss 0.845, Val loss 0.870
Ep 1 (Step 000020): Train loss 0.797, Val loss 0.832
Ep 1 (Step 000025): Train loss 0.767, Val loss 0.815
Ep 1 (Step 000030): Train loss 0.744, Val loss 0.803
Ep 1 (Step 000035): Train loss 0.709, Val loss 0.778
Ep 1 (Step 000040): Train loss 0.692, Val loss 0.778
Ep 1 (Step 000045): Train loss 0.673, Val loss 0.770
Ep 1 (Step 000050): Train loss 0.657, Val loss 0.760
Ep 1 (Step 000055): Train loss 0.639, Val loss 0.744
Ep 1 (Step 000060): Train loss 0.625, Val loss 0.732
Ep 1 (Step 000065): Train loss 0.610, Val loss 0.731
Ep 1 (Step 000070): Train loss 0.592, Val loss 0.729
Ep 1 (Step 000075): Train loss 0.576, Val loss 0.728
Ep 1 (Step 000080): Train loss 0.565, Val loss 0.725
Ep 1 (Step 000085): Train loss 0.551, Val loss 0.714
Ep 1 (Step 000090): Train loss 0.542, Val loss

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:
from tqdm import tqdm
from making_LLM_from_scratch.Scripts.training_gpt import text_to_token_ids,token_ids_to_text,generate_simple_text
for i,entry in tqdm(enumerate(test_data),total = len(test_data)):
    instruction = format_input(entry)

    response_token_ids = generate_simple_text(gpt,text_to_token_ids(instruction,tokenizer),
                                              max_tokens = 256,
                                              content_size=1024)

    response_text = token_ids_to_text(response_token_ids,tokenizer)
    responce_text = responce_text[len(instruction):].replace("### Response","").strip()

    test_data[i]['model_response'] = response_text

    with open('instruction-data-with-response.json',"w") as file:
        json.dump(test_data, file, indent=4)