### 7.2 Preparing a dataset for supervised instruction finetuning

In [1]:
import json
import os
import requests


def download_and_load_file(file_path, url):
    if not os.path.exists(file_path):
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        text_data = response.text
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data


# The book originally used the following code below
# However, urllib uses older protocol settings that
# can cause problems for some readers using a VPN.
# The `requests` version above is more robust
# in that regard.

"""
import urllib

def download_and_load_file(file_path, url):

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data
"""


file_path = "instruction-data.json"
url = (
    "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch"
    "/main/ch07/01_main-chapter-code/instruction-data.json"
)

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [2]:
print(data[50])

{'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [3]:
def format_input(entry):
    instruction_text = (
        f"Below is an instruction that describes a task. "
        f"Write a response that appropariately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    return instruction_text + input_text

In [4]:
print(format_input(data[50]))

Below is an instruction that describes a task. Write a response that appropariately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion


In [5]:
print(format_input(data[999]))

Below is an instruction that describes a task. Write a response that appropariately completes the request.

### Instruction:
What is an antonym of 'complicated'?


In [6]:
model_input = format_input(data[999])
desired_response = f"\n\n### Response:\n{data[999]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropariately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


In [7]:
train_portion = int(len(data) * 0.85)
test_portion = int(len(data) * 0.1)
val_portion = len(data) - train_portion - test_portion

train_data = data[:train_portion]
test_data = data[train_portion:train_portion + test_portion]
val_data = data[train_portion + test_portion:]

In [8]:
len(train_data)

935

In [9]:
len(test_data)

110

In [10]:
len(val_data)

55

### 7.3 Organizing data into training batches

In [11]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data

        self.encoded_texts = []
        for entry in data:
            instruction_plus_input = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['output']}"
            full_text = instruction_plus_input + response_text
            self.encoded_texts.append(
                tokenizer.encode(full_text)
            )
    def __getitem__(self, index):
        return self.encoded_texts[index]
    def __len__(self):
        return len(self.data)

In [12]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [13]:
def custom_collate_draft_1(
    batch,
    pad_token_id = 50256,
    device = "cpu"
):
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst = []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])
        inputs_lst.append(inputs)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    return inputs_tensor

In [14]:
inputs_1 = [0, 1, 2, 3, 4]
inputs_2 = [5, 6]
inputs_3 = [7, 8, 9]

batch = (
    inputs_1,
    inputs_2,
    inputs_3
)

In [15]:
custom_collate_draft_1(batch)
print(custom_collate_draft_1(batch).shape)

torch.Size([3, 5])


In [16]:
def custom_collate_draft_2(
    batch,
    pad_token_id = 50256,
    device = "cpu"
):
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]

        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        inputs_lst.append(inputs)
        targets_lst.append(targets)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor


In [17]:
custom_collate_draft_2(batch)

(tensor([[    0,     1,     2,     3,     4],
         [    5,     6, 50256, 50256, 50256],
         [    7,     8,     9, 50256, 50256]]),
 tensor([[    1,     2,     3,     4, 50256],
         [    6, 50256, 50256, 50256, 50256],
         [    8,     9, 50256, 50256, 50256]]))

In [18]:
def custom_collate_fn(
    batch,
    pad_token_id = 50256,
    ignore_index = -100,
    allowed_max_length = None,
    device = "cpu"
):
    batch_max_length = max(len(item) + 1 for item in batch)

    inputs_lst, targets_lst = [], []

    for item in batch:
        new_item = item.copy()
        new_item += [pad_token_id]
        padded = (
            new_item + [pad_token_id] * (batch_max_length - len(new_item))
        )
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])
        
        mask = targets == pad_token_id
        indics = torch.nonzero(mask).squeeze()

        if indics.numel() > 1:
            targets[indics[1:]] = ignore_index
        
        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]
        inputs_lst.append(inputs)
        targets_lst.append(targets)
    inputs_tensor = torch.stack(inputs_lst).to(device)
    targets_tensor = torch.stack(targets_lst).to(device)
    return inputs_tensor, targets_tensor

In [19]:
custom_collate_fn(batch)

(tensor([[    0,     1,     2,     3,     4],
         [    5,     6, 50256, 50256, 50256],
         [    7,     8,     9, 50256, 50256]]),
 tensor([[    1,     2,     3,     4, 50256],
         [    6, 50256,  -100,  -100,  -100],
         [    8,     9, 50256,  -100,  -100]]))

### 7.4 Creating data loaders for an instruction dataset

In [20]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    # Use PyTorch 2.9 or newer for stable mps results
    major, minor = map(int, torch.__version__.split(".")[:2])
    if (major, minor) >= (2, 9):
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
else:
    device = torch.device("cpu")

print("Device:", device)

Device: mps


In [21]:
from functools import partial

customized_collate_fn = partial(
    custom_collate_fn,
    device = device,
    allowed_max_length = 1024
)

In [22]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataset = InstructionDataset(train_data, tokenizer)
train_loader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    collate_fn = customized_collate_fn,
    shuffle = True,
    drop_last = True,
    num_workers = num_workers
)

In [23]:
val_dataset = InstructionDataset(val_data, tokenizer)
val_loader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    collate_fn = customized_collate_fn,
    shuffle = False,
    drop_last = False,
    num_workers = num_workers
)

In [24]:
test_dataset = InstructionDataset(test_data, tokenizer)
test_loader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    collate_fn=customized_collate_fn,
    shuffle=False,
    drop_last=False,
    num_workers=num_workers
)


In [25]:
for inputs, targets in train_loader:
    print(inputs.shape, targets.shape)

torch.Size([8, 64]) torch.Size([8, 64])
torch.Size([8, 79]) torch.Size([8, 79])
torch.Size([8, 76]) torch.Size([8, 76])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 75]) torch.Size([8, 75])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 78]) torch.Size([8, 78])
torch.Size([8, 65]) torch.Size([8, 65])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 70]) torch.Size([8, 70])
torch.Size([8, 80]) torch.Size([8, 80])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 82]) torch.Size([8, 82])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 69]) torch.Size([8, 69])
torch.Size([8, 86]) torch.Size([8, 86])
torch.Size([8, 71]) torch.Size([8, 71])
torch.Size([8, 83]) torch.Size([8, 83])
torch.Size([8, 74]) torch.Size([8, 74])
torch.Size([8, 72]) torch.Size([8, 72])
torch.Size([8, 68]) torch.Size([8, 68])
torch.Size([8, 71]) torch.Size([8, 71])


In [26]:
inputs[0]

tensor([21106,   318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,
          257,  2882,   326,  1331,  1845,    72,  1286, 32543,   262,  2581,
           13,   198,   198, 21017, 46486,    25,   198, 30003,  6525,   262,
         6827,  1262,   257,   985,   576,    13,   198,   198, 21017, 23412,
           25,   198,   464,  5156,   318,   845, 13779,    13,   198,   198,
        21017, 18261,    25,   198,   464,  5156,   318,   355, 13779,   355,
          257,  4936,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256], device='mps:0')

In [27]:
targets[0]

tensor([  318,   281, 12064,   326,  8477,   257,  4876,    13, 19430,   257,
         2882,   326,  1331,  1845,    72,  1286, 32543,   262,  2581,    13,
          198,   198, 21017, 46486,    25,   198, 30003,  6525,   262,  6827,
         1262,   257,   985,   576,    13,   198,   198, 21017, 23412,    25,
          198,   464,  5156,   318,   845, 13779,    13,   198,   198, 21017,
        18261,    25,   198,   464,  5156,   318,   355, 13779,   355,   257,
         4936,    13, 50256,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100], device='mps:0')

### 7.5 Loading a pretrained LLM

In [28]:
from gpt_download import download_and_load_gpt2
from previous_chapters import GPTModel, load_weights_into_gpt
# If the `previous_chapters.py` file is not available locally,
# you can import it from the `llms-from-scratch` PyPI package.
# For details, see: https://github.com/rasbt/LLMs-from-scratch/tree/main/pkg
# E.g.,
# from llms_from_scratch.ch04 import GPTModel
# from llms_from_scratch.ch05 import download_and_load_gpt2, load_weights_into_gpt


BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length
    "drop_rate": 0.0,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

CHOOSE_MODEL = "gpt2-medium (355M)"

BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
model.eval();

File already exists and is up-to-date: gpt2/355M/checkpoint
File already exists and is up-to-date: gpt2/355M/encoder.json
File already exists and is up-to-date: gpt2/355M/hparams.json
File already exists and is up-to-date: gpt2/355M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/355M/model.ckpt.index
File already exists and is up-to-date: gpt2/355M/model.ckpt.meta
File already exists and is up-to-date: gpt2/355M/vocab.bpe


In [29]:
import torch

torch.manual_seed(123)
input_text = format_input(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropariately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [30]:
from previous_chapters import (
    generate,
    text_to_token_ids,
    token_ids_to_text
)

token_ids = generate(
    model = model,
    idx = text_to_token_ids(input_text, tokenizer),
    max_new_tokens = 35,
    context_size = BASE_CONFIG["context_length"],
    eos_id = 50256
)

generated_text = token_ids_to_text(token_ids, tokenizer)

In [31]:
print(generated_text)

Below is an instruction that describes a task. Write a response that appropariately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'

### Response:

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the


In [32]:
response_text = (generated_text[len(input_text):].replace("### Response:", "").strip())

In [33]:
print(response_text)

The chef cooks the meal every day.

### Instruction:

Convert the active sentence to passive: 'The chef cooks the


### 7.6 Finetuning the LLM on instruction data

In [34]:
from previous_chapters import (
    calc_loss_loader,
    train_model_simple
)

In [35]:
model.to(device)

torch.manual_seed(123)

with torch.no_grad():
    train_loss = calc_loss_loader(train_loader, model, device, num_batchs = 5)
    val_loss = calc_loss_loader(val_loader, model, device, num_batchs = 5)

print(train_loss)
print(val_loss)

tensor(3.9338, device='mps:0')
tensor(3.8665, device='mps:0')


In [36]:
val_data[0]

{'instruction': "Convert the active sentence to passive: 'The chef cooks the meal every day.'",
 'input': '',
 'output': 'The meal is cooked by the chef every day.'}

In [37]:
device
device = torch.device("cpu")
model.to(device)
print(device)

cpu


In [None]:
import time

start_time = time.time()

torch.manual_seed(123)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 2

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs = num_epochs, eval_freq = 5, eval_iter = 5,
    start_context = format_input(val_data[0]), tokenizer = tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"time : {exeuction_time_minutes}mins")

Ep 1 (Step 000000): Train loss 2.657, Val loss 2.642
Ep 1 (Step 000005): Train loss 1.130, Val loss 1.061
Ep 1 (Step 000010): Train loss 0.827, Val loss 0.897
Ep 1 (Step 000015): Train loss 0.811, Val loss 0.866
Ep 1 (Step 000020): Train loss 0.738, Val loss 0.837
Ep 1 (Step 000025): Train loss 0.713, Val loss 0.811
Ep 1 (Step 000030): Train loss 0.755, Val loss 0.790
Ep 1 (Step 000035): Train loss 0.682, Val loss 0.771
Ep 1 (Step 000040): Train loss 0.636, Val loss 0.763
Ep 1 (Step 000045): Train loss 0.601, Val loss 0.751
Ep 1 (Step 000050): Train loss 0.634, Val loss 0.744
Ep 1 (Step 000055): Train loss 0.723, Val loss 0.725
Ep 1 (Step 000060): Train loss 0.686, Val loss 0.707
Ep 1 (Step 000065): Train loss 0.620, Val loss 0.699
Ep 1 (Step 000070): Train loss 0.504, Val loss 0.695
Ep 1 (Step 000075): Train loss 0.540, Val loss 0.699
Ep 1 (Step 000080): Train loss 0.572, Val loss 0.692
Ep 1 (Step 000085): Train loss 0.486, Val loss 0.674


### 7.7 Extracting and saving responses

In [None]:
torch.manual_seed(123)

In [None]:
for entry in test_data[:3]:
    input_text = format_input(entry)

    token_ids = generate(
        model = model,
        idx = text_to_token_ids(input_text, tokenizer).to(device)
        max_new_tokens = 256,
        context_size = BASE_CONFIG["context_length"],
        eos_id = 50256
    )
    generated_text = token_ids_to_text(token_ids, tokenizer)
    response_text = (
        generated_text[len(input_text):]
        .replace("### Response:", "")
        .strip()
    )
    print(f"\nCorrect response:\n>> {entry['output']}")
    print(f"\nModel response:\n>> {response_text.strip()}")
    print("-------------------------------------")