In [1]:
import urllib
import os
import numpy as np
import matplotlib.pyplot as plt
import zipfile
from pathlib import Path
import pandas as pd
import json
import requests
from functools import partial

import torch
from torch.utils.data import DataLoader, Dataset

from helper import *
from knowledge_transfer import load_weights_into_gpt
from gpt_download import download_and_load_gpt2
from model import GPTModel
from tqdm import tqdm


In [2]:
link = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
file_name = "instruction-data.json"

if not os.path.exists(file_name):
    response = requests.get(link)
    text_data = response.text
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(text_data)

with open(file_name, 'r', encoding='utf-8') as file:
    data = json.load(file)

data[:5]

[{'instruction': 'Evaluate the following phrase by transforming it into the spelling given.',
  'input': 'freind --> friend',
  'output': 'The spelling of the given phrase "freind" is incorrect, the correct spelling is "friend".'},
 {'instruction': 'Edit the following sentence for grammar.',
  'input': 'He go to the park every day.',
  'output': 'He goes to the park every day.'},
 {'instruction': 'Convert 45 kilometers to meters.',
  'input': '',
  'output': '45 kilometers is 45000 meters.'},
 {'instruction': "Rewrite this sentence to start with 'Although': Despite the rain, they went for a walk.",
  'input': '',
  'output': 'Although it was raining, they went for a walk.'},
 {'instruction': 'What are the first 10 square numbers?',
  'input': '',
  'output': '1, 4, 9, 16, 25, 36, 49, 64, 81, 100.'}]

In [3]:
len(data)

1100

In [4]:
def prompt_style(dictonary: dict):


    header = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    if "instruction" in dictonary and len(dictonary["instruction"]) != 0:
        header += "\n\n### Instruction:"
        header += f"\n{dictonary['instruction']}"

    if "input" in dictonary and len(dictonary["input"]) != 0:
        header += "\n\n### Input:"
        header += f"\n{dictonary['input']}"
    # else:
    #     if header == None: header = ""
    #     if "output" in dictonary and len(dictonary["output"]) != 0:
    #         header += "### Output:"
    #         header += f"\n{dictonary['output']}"

    return header

print(prompt_style(data[999]))


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?


In [5]:
model_input = prompt_style(data[0])
desired_output = f"\n\n### Response: \n {data[999]['output']}"

print(model_input + desired_output)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Evaluate the following phrase by transforming it into the spelling given.

### Input:
freind --> friend

### Response: 
 An antonym of 'complicated' is 'simple'.


In [6]:
train_frac = 0.85
test_frac = 0.1
val_frac = abs(train_frac - test_frac)

train_portion = int(len(data) * train_frac)
test_portion  = int(len(data) * test_frac)
val_portion   = int(len(data) * val_frac)

train_data = data[:train_portion]
test_data  = data[train_portion: train_portion + test_portion]
val_data   = data[train_portion + test_portion:]

In [7]:
len(train_data), len(test_data), len(val_data)

(935, 110, 55)

In [8]:
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.encode(model_input)[:10]

[21106, 318, 281, 12064, 326, 8477, 257, 4876, 13, 19430]

In [9]:
class Instruction_dataset(Dataset):
    def __init__(self, data: list[dict], tokenizer):
        
        self.encoded_data: list[str] = []
        for d in data:
            model_input = prompt_style(d)
            desired_output = f"\n\n### Response: \n {d['output']}"
            full_text = model_input + desired_output
        
            self.encoded_data.append(tokenizer.encode(full_text))
    def __getitem__(self, index) -> list[str]:
        return self.encoded_data[index]
    def __len__(self):
        return len(self.encoded_data)
    
ins_dataset = Instruction_dataset(data, tokenizer)
sample_data = next(iter(ins_dataset))
len(sample_data)

75

In [10]:
def collate_function(batch, padding_token_id: int = 50256, ignore_index = -100, allowed_max_length = None, device: str = "cpu"):
    max_length_of_batch = max(len(item)+1 for item in batch)

    inputs_list = []
    target_list = []
    for item in batch:
        new_item = item.copy()
        new_item += [padding_token_id]

        padded = new_item + [padding_token_id] * (max_length_of_batch - len(new_item))
        
        # remove the extra +1 we did in line 2, line 2 differs if not in noteboook
        inputs_padded = torch.tensor(padded[:-1])
        targets_padded = torch.tensor(padded[1:])

    
        mask = targets_padded == padding_token_id
        indices = torch.nonzero(mask).squeeze() # returns indexes
        if indices.numel() > 1: # more than one padding token id
            targets_padded[indices[1: ]] = ignore_index
        
        if allowed_max_length is not None:
            inputs_padded  = inputs_padded[:allowed_max_length]
            targets_padded = targets_padded[:allowed_max_length]

        inputs_list.append(inputs_padded)
        target_list.append(targets_padded)
    
    
    return torch.stack(inputs_list, dim = 0).to(device), torch.stack(target_list, dim = 0).to(device)


batch_ex = (
    [1, 2, 3, 4],
    [6, 7, 10, 3, 4, 6, 6],
    [3,]
)

input_tensor, target_tensor = collate_function(batch_ex, padding_token_id=50256, allowed_max_length=None)
input_tensor

tensor([[    1,     2,     3,     4, 50256, 50256, 50256],
        [    6,     7,    10,     3,     4,     6,     6],
        [    3, 50256, 50256, 50256, 50256, 50256, 50256]])

In [11]:
target_tensor

tensor([[    2,     3,     4, 50256,  -100,  -100,  -100],
        [    7,    10,     3,     4,     6,     6, 50256],
        [50256,  -100,  -100,  -100,  -100,  -100,  -100]])

In [12]:
[2,3,4] == [3, 5]

False

In [13]:
def get_deivce():
    if torch.cuda.is_available():
        return "cuda" 
    elif torch.mps.is_available():
        return "mps" 
    return "cpa"

get_deivce()

'mps'

In [14]:
customized_collate_function = partial(collate_function, device = get_deivce(), allowed_max_length = 1024)
customized_collate_function

functools.partial(<function collate_function at 0x114096840>, device='mps', allowed_max_length=1024)

In [15]:
batch_size = 8
num_workers = 0

train_dataset = Instruction_dataset(train_data, tokenizer)
val_dataset   = Instruction_dataset(val_data, tokenizer)
test_dataset  = Instruction_dataset(test_data, tokenizer)

train_dataloader = DataLoader(dataset     = train_dataset, 
                              batch_size  = batch_size, 
                              collate_fn  = customized_collate_function, 
                              shuffle     = True, 
                              drop_last   = True, 
                              num_workers = num_workers)

val_dataloader = DataLoader(dataset       = val_dataset, 
                              batch_size  = batch_size, 
                              collate_fn  = customized_collate_function, 
                              shuffle     = False, 
                              drop_last   = False, 
                              num_workers = num_workers)


test_dataloader = DataLoader(dataset      = test_dataset, 
                              batch_size  = batch_size, 
                              collate_fn  = customized_collate_function, 
                              shuffle     = False, 
                              drop_last   = False, 
                              num_workers = num_workers)

In [16]:
inputs, targets = next(iter(train_dataloader))
inputs.shape, targets.shape, 

(torch.Size([8, 83]), torch.Size([8, 83]))

In [17]:
print(tokenizer.decode(inputs[0].tolist()))

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Provide a synonym for 'fast'.

### Response: 
 A synonym for 'fast' is 'quick'.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [18]:
mask = targets[0] == -100
masked_target = torch.nonzero(mask)
print(
    tokenizer.decode(targets[0][:-len(masked_target)].tolist())
)

 is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Provide a synonym for 'fast'.

### Response: 
 A synonym for 'fast' is 'quick'.<|endoftext|>


In [None]:
settings, params = download_and_load_gpt2(model_size="355M", models_dir="gpt2")

GPT_CONFIG_124M = {
    "vocab_size"     : tokenizer.n_vocab, # 50257
    "context_length" : 1024,                  # The maximum number of tokens the model can process at once
    "embedding_dim"  : 768,                   # The number of features used to represent each token 
    "n_heads"        : 12,
    "n_layers"       : 12,                    # How many transformer blocks
    "drop_rate"      : 0.1,
    "qkv_bias"       : False
}

model_configs = {
    "gpt2-small (124M)": {"embedding_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"embedding_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"embedding_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"embedding_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-medium (355M)"

NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})

model = GPTModel(NEW_CONFIG)
device = get_deivce()
load_weights_into_gpt(model, params)
model.to(device)



In [20]:
model.token_embedding.weight.shape

torch.Size([50257, 1024])

In [21]:
next(model.parameters()).device

device(type='mps', index=0)

In [22]:
input_text = prompt_style(val_data[0])
print(input_text)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Convert the active sentence to passive: 'The chef cooks the meal every day.'


In [23]:
# def text_to_token_ids(text, tokenizer):
#     # return torch.tensor(tokenizer.encode(text, allowed_special="<|endoftext|>")).unsqueeze(0)

#     return torch.tensor(
#                 tokenizer.encode(
#                         text,
#                         allowed_special={"<|endoftext|>"}
#                     ), device=get_deivce()
#             ).unsqueeze(0)

In [26]:
model = model.to("cpu")

In [33]:
token_ids = generate(
            model = model,
            tokens = text_to_token_ids(input_text, tokenizer, "cpu"),
            max_new_tokens = 35,
            context_size = NEW_CONFIG["context_length"],
            eos_id = 50256,
        )

In [37]:
generated_text = token_ids_to_text(token_ids, tokenizer)
print(generated_text[len(input_text):].replace("### Response:", "").strip())

The chef is a chef. The chef cooks the meal every day.

### Instruction:

Write the response that is appropriate for the task.
