## Test loading weights from OpenAI

In [32]:
# Import the GPT-2 download/load helper from the project-local tools.gpt_downloader module

from tools.gpt_downloader import download_and_load_gpt2

# Download the 774M checkpoint files into the "gpt2" directory and load them.
# `settings` holds the model hyperparameters and `params` the weight arrays that are
# later copied into the model by load_weights_into_gpt.
# NOTE(review): the recorded run below fetched ~3.1 GB and took over an hour; whether a
# re-run reuses already-downloaded files depends on the helper — confirm before Run All.
settings, params = download_and_load_gpt2(model_size="774M", models_dir="gpt2")

checkpoint: 100%|██████████| 77.0/77.0 [00:00<00:00, 76.9kiB/s]
encoder.json: 100%|██████████| 1.04M/1.04M [00:01<00:00, 725kiB/s] 
hparams.json: 100%|██████████| 91.0/91.0 [00:00<?, ?iB/s]
model.ckpt.data-00000-of-00001: 100%|██████████| 3.10G/3.10G [1:12:57<00:00, 707kiB/s]   
model.ckpt.index: 100%|██████████| 15.5k/15.5k [00:00<00:00, 15.5MiB/s]
model.ckpt.meta: 100%|██████████| 1.38M/1.38M [00:01<00:00, 762kiB/s] 
vocab.bpe: 100%|██████████| 456k/456k [00:00<00:00, 490kiB/s]  


In [3]:
# NOTE(review): the recorded output below (n_embd=768, n_layer=12) does not match the
# 774M model requested in the download cell (emb_dim 1280, 36 layers); it looks like a
# leftover from an earlier run with a smaller model — re-run this cell to refresh it.
print(settings)

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}


In [7]:
import torch

In [4]:
# Baseline hyperparameters for the 124M model. Later cells copy this dict and
# override individual entries (size-specific dimensions, context length, QKV bias)
# before building the model that receives the pretrained weights.
GPT_CONFIG_124M = dict(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    drop_rate=0.1,
    qkv_bias=False,
)

In [33]:
# Architecture overrides for each GPT-2 size, keyed by a display name.
# Columns: (name, emb_dim, n_layers, n_heads).
model_configs = {
    name: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for name, emb_dim, n_layers, n_heads in [
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    ]
}

# Must correspond to the checkpoint size downloaded earlier (774M).
model_name = "gpt2-large (774M)"  # Example model name

# Merge order matters: base config first, then the size-specific overrides, then the
# settings needed for the pretrained weights (1024-token context, biases on the
# query/key/value projections, which the weight loader assigns). Later entries win.
NEW_CONFIG = {
    **GPT_CONFIG_124M,
    **model_configs[model_name],
    "context_length": 1024,
    "qkv_bias": True,
}


from classes.model import GPTModel

# Build the model with the merged configuration and switch it to evaluation mode.
# The trailing semicolon suppresses the cell's output.
gpt = GPTModel(NEW_CONFIG)
gpt.eval();

In [8]:
def assign(left, right):
    """Build a new trainable parameter from `right` after checking it fits `left`.

    Args:
        left: Existing parameter/tensor; only its `.shape` is read.
        right: Replacement values, as a NumPy array or a torch tensor.

    Returns:
        A new `torch.nn.Parameter` holding an independent copy of `right`.

    Raises:
        ValueError: If `left.shape` and `right.shape` differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Size mismatch: {left.shape} - {right.shape}")

    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a copy-construct UserWarning; detach().clone()
        # is the documented way to get an independent copy of an existing tensor.
        values = right.detach().clone()
    else:
        # NumPy arrays (and other array-likes) are copied by torch.tensor.
        values = torch.tensor(right)

    return torch.nn.Parameter(values)

In [34]:
import numpy as np

# Load and convert the GPT-2 parameters into our model class.

def load_weights_into_gpt(gpt, params):
    """Copy the pretrained arrays in `params` into the matching parameters of `gpt`.

    Every target parameter is replaced through `assign`, which raises ValueError
    when the source array's shape does not match the existing parameter.

    Args:
        gpt: Model exposing `position_emb`, `token_emb`, `trf_blocks`,
            `final_norm` and `output_layer`.
        params: Nested dict with top-level keys "wpe", "wte", "blocks", "g", "b";
            each entry of "blocks" has "attn", "mlp", "ln_1" and "ln_2" sub-dicts.

    Returns:
        None. `gpt` is modified in place.
    """
    gpt.position_emb.weight = assign(gpt.position_emb.weight, params['wpe'])
    gpt.token_emb.weight = assign(gpt.token_emb.weight, params['wte'])

    for idx in range(len(params["blocks"])):
        src = params["blocks"][idx]

        # "c_attn" holds the query/key/value projections side by side on the last
        # axis; split it into three equal parts. Weight matrices are transposed
        # (.T) before assignment, biases are used as-is.
        q_w, k_w, v_w = np.split(src["attn"]["c_attn"]["w"], 3, axis=-1)
        block = gpt.trf_blocks[idx]
        attn = block.attention
        attn.W_queries.weight = assign(attn.W_queries.weight, q_w.T)
        attn.W_keys.weight = assign(attn.W_keys.weight, k_w.T)
        attn.W_values.weight = assign(attn.W_values.weight, v_w.T)

        q_b, k_b, v_b = np.split(src["attn"]["c_attn"]["b"], 3, axis=-1)
        attn.W_queries.bias = assign(attn.W_queries.bias, q_b)
        attn.W_keys.bias = assign(attn.W_keys.bias, k_b)
        attn.W_values.bias = assign(attn.W_values.bias, v_b)

        # Attention output projection.
        attn_proj = src["attn"]["c_proj"]
        attn.out_proj.weight = assign(attn.out_proj.weight, attn_proj["w"].T)
        attn.out_proj.bias = assign(attn.out_proj.bias, attn_proj["b"])

        # Feed-forward sub-layers: index 0 takes "c_fc", index 2 takes "c_proj".
        ff_layers = block.fforward.layers
        ff_in, ff_out = src["mlp"]["c_fc"], src["mlp"]["c_proj"]
        ff_layers[0].weight = assign(ff_layers[0].weight, ff_in["w"].T)
        ff_layers[0].bias = assign(ff_layers[0].bias, ff_in["b"])
        ff_layers[2].weight = assign(ff_layers[2].weight, ff_out["w"].T)
        ff_layers[2].bias = assign(ff_layers[2].bias, ff_out["b"])

        # Normalisation layers: "g" feeds `scale`, "b" feeds `shift`.
        block.norm_1.scale = assign(block.norm_1.scale, src["ln_1"]["g"])
        block.norm_1.shift = assign(block.norm_1.shift, src["ln_1"]["b"])
        block.norm_2.scale = assign(block.norm_2.scale, src["ln_2"]["g"])
        block.norm_2.shift = assign(block.norm_2.shift, src["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output layer is initialised from the token-embedding matrix ("wte") as well.
    gpt.output_layer.weight = assign(gpt.output_layer.weight, params["wte"])
    

# Copy the downloaded weights into the model, then move the model to the target device.
# The trailing semicolon suppresses the cell's output.
device = 'cpu'
load_weights_into_gpt(gpt, params)
gpt.to(device);

In [35]:
from classes.generation import generator, TextTokenConversion
import tiktoken

# Get tiktoken's "gpt2" encoding and hand it, wrapped in the project's
# TextTokenConversion helper, to the generator together with the loaded model.
tokenizer=tiktoken.get_encoding("gpt2")
gen = generator(gpt, TextTokenConversion(tokenizer))

In [38]:
# Fix the RNG seed before sampling so repeated runs of this cell start from the same state.
torch.manual_seed(123)

# Encode the prompt, generate a continuation, and print the decoded text.
# NOTE(review): the positional 50 is presumably the number of tokens to generate —
# confirm against generator.generate_text.
tokens = gen.generate_text(
    gen.encoder.encode("Every effort moves you"),
    50,
    temperature=1.5,
    topk=50,
)
print(gen.encoder.decode(tokens))

Every effort moves you toward finding an ideal life. You don't have to accept your problems by trying to hide them, because that won't happen." She then continued: "…and with a simple, clear and direct thought, no more worrying over your past problems…there
