# Fine tuning Instructions

In [1]:
import json
import os
import urllib

def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        print(f"Downloading file from {url}...")
        urllib.request.urlretrieve(url, file_path)
        print("Download complete.")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

file_path = 'instruction-data.json'
url = (
    'https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json'
)
data = download_and_load_file(file_path, url)
print("Number of records loaded:", len(data))

print("Sample record:", data[0])


Number of records loaded: 1100
Sample record: {'instruction': 'Evaluate the following phrase by transforming it into the spelling given.', 'input': 'freind --> friend', 'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'}


In [2]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    input_text = (
        f"\n\n### Input:\n{entry['input']}" if entry['input'].strip() != "" else ""
    )
    return instruction_text + input_text

model_input = format_input(data[0])
desired_response = f"\n\n### Response:\n{data[0]['output']}"
print(model_input+desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Evaluate the following phrase by transforming it into the spelling given.

### Input:
freind --> friend

### Response:
The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".


In [3]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

print("Train data size:", len(train_data))
print("Test data size:", len(test_data))
print("Validation data size:", len(val_data))

import torch
from torch.utils.data import Dataset, DataLoader

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        self.encoded_texts = []
        for entry in data:
            model_input = format_input(entry)
            desired_response = f"\n\n### Response:\n{entry['output']}"
            full_text = model_input + desired_response
            encoded = tokenizer.encode(full_text)
            self.encoded_texts.append(torch.tensor(encoded, dtype=torch.long))
    
    def __getitem__(self, idx):
        return self.encoded_texts[idx]
    
    def __len__(self):
        return len(self.encoded_texts)
    
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))## Should output: [50256]

def custom_collate_draft_1(batch, pad_token_id=50256, device='cpu'):
    batch_sizes = [len(item)+1 for item in batch]
    max_length = max(batch_sizes)
    
    input_lst = []
    for item in batch:
        new_item = item.tolist()
        new_item += [pad_token_id]

        padded = (new_item + [pad_token_id] * (max_length - len(new_item)))
        inputs = torch.tensor(padded[:-1])
        input_lst.append(inputs)

    inputs_tensor = torch.stack(input_lst).to(device)
    return inputs_tensor

inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6, 7]
inputs_3 = [8, 9, 10, 11]
batch = [torch.tensor(inputs_1), torch.tensor(inputs_2), torch.tensor(inputs_3)]
padded_batch = custom_collate_draft_1(batch, device='cpu')
print(padded_batch)

Train data size: 935
Test data size: 110
Validation data size: 55
[50256]
tensor([[    0,     1,     2,     3,     4],
        [    5,     6,     7, 50256, 50256],
        [    8,     9,    10,    11, 50256]])


In [4]:
def custom_collate_fn(batch, pad_token_id=50256, ignore_idx=-100, allowed_max_length=None, device='cpu'):
    batch_sizes = [len(item) for item in batch]
    max_length = max(batch_sizes)
    input_lst = []
    target_lst = []
    for item in batch:
        new_item = item.tolist()
        if allowed_max_length is not None:
            new_item = new_item[:allowed_max_length]
        padded = new_item + [pad_token_id] * (max_length - len(new_item))
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        mask = targets == pad_token_id
        indices = torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]] = ignore_idx
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        input_lst.append(inputs)
        target_lst.append(targets)
    inputs_tensor = torch.stack(input_lst).to(device)
    targets_tensor = torch.stack(target_lst).to(device)
    return inputs_tensor, targets_tensor

inputs, targets = custom_collate_fn(batch, device='cpu')
print("Inputs:\n", inputs)
print("Targets:\n", targets)

# logits_1 = torch.tensor(
#     [[-1.0, 1.0],
#     [-0.5, 0.5]]
# )
# targets_1 = torch.tensor(
#     [0, 1]
# )
# loss_fn = torch.nn.CrossEntropyLoss()
# loss_1 = loss_fn(logits_1, targets_1)
# print("Loss 1:", loss_1.item())  # Should output a scalar value

Inputs:
 tensor([[    0,     1,     2,     3],
        [    5,     6,     7, 50256],
        [    8,     9,    10,    11]])
Targets:
 tensor([[    1,     2,     3,     4],
        [    6,     7, 50256,  -100],
        [    9,    10,    11, 50256]])


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

from functools import partial
customized_collate_fn = partial(
    custom_collate_fn,
    pad_token_id=50256,
    ignore_idx=-100,
    allowed_max_length=1024,
    device=device
)

from torch.utils.data import DataLoader
num_workers = 4 if torch.cuda.is_available() else 0
batch_size = 8

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
    drop_last=True  ## drop last means to drop the last incomplete batch
)

test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
    drop_last=False
)

val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
    drop_last=False
)

# print("Train loader:")
# for inputs, targets in train_loader:
#     print(inputs.shape, targets.shape)

Using device: cuda


In [None]:
## load pretrained model 
from gpt_download import download_and_load_gpt2
from functions import GPTModel, load_weights_into_gpt

BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True,
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model_size = CHOOSE_MODEL.split('(')[-1].strip(')')
settings, params = download_and_load_gpt2(model_size=model_size, models_dir='./gpt2')

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.to(device)
model.eval()

torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

from functions import generate, text_to_token_ids, token_ids_to_text
input_ids = text_to_token_ids(input_text, tokenizer).to(device)
generated_ids = generate(
    model,
    input_ids,
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_idx=50256
)
generated_text = token_ids_to_text(generated_ids, tokenizer)
response_text = generated_text[len(input_text):].strip()
print("Generated Response:\n", response_text)

2025-12-01 23:44:57.354802: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-01 23:44:57.379507: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


File already exists and is up-to-date: ./gpt2/355M/checkpoint


encoder.json: 100%|██████████| 1.04M/1.04M [00:25<00:00, 40.4kiB/s]
hparams.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 124kiB/s]
model.ckpt.data-00000-of-00001:   1%|          | 10.6M/1.42G [16:38<32:31:50, 12.0kiB/s]

In [None]:
import time
from functions import train_model_simple

start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)

num_epochs = 2
train_loss, val_loss, tokens_seen = train_model_simple(model, train_loader, val_loader, optimizer, device, 
num_epochs=num_epochs, eval_freq=5, eval_iter=5, start_context=format_input(val_data[0]), tokenizer=tokenizer
)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

from functions import plot_values
epochs_tensor = torch.linspace(1, num_epochs, steps=len(train_loss))
plot_values(epochs_tensor, train_loss, val_loss, "Training and Validation Loss over Epochs", "Epochs", "Loss")

In [None]:
torch.manual_seed(123)

for entry in test_data[:3]:
    input_text = format_input(entry)
    print("Input:\n", input_text)
    input_ids = text_to_token_ids(input_text, tokenizer).to(device)
    generated_ids = generate(
        model,
        input_ids,
        max_new_tokens=256,
        context_size=BASE_CONFIG["context_length"],
        eos_idx=50256
    )
    generated_text = token_ids_to_text(generated_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip
    )
    print(input_text)
    print(f"Correct Response:\n{entry['output']}\n")
    print(f"Generated Response:\n{response_text.strip()}\n")
    print("--------------------------------------------\n")