In [4]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [5]:
import tensorflow as tf 
import tqdm

In [6]:
from GPT.gpt_download import download_and_load_gpt2

In [7]:
settings, params = download_and_load_gpt2(model_size="355M", models_dir="../gpt2_355M")



File already exists and is up-to-date: ../gpt2_355M\355M\checkpoint




File already exists and is up-to-date: ../gpt2_355M\355M\encoder.json




File already exists and is up-to-date: ../gpt2_355M\355M\hparams.json




File already exists and is up-to-date: ../gpt2_355M\355M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: ../gpt2_355M\355M\model.ckpt.index




File already exists and is up-to-date: ../gpt2_355M\355M\model.ckpt.meta




File already exists and is up-to-date: ../gpt2_355M\355M\vocab.bpe


Both settings and params are Python dictionaries. The settings dictionary stores the LLM
architecture settings similarly to our manually defined GPT_CONFIG_124M settings. 

The
params dictionary contains the actual weight tensors. 

    
- printed the
dictionary keys because printing the weight contents would take up too much screen space

In [8]:
print(params["wte"])
print("Token embedding weight tensor dimensions:", params["wte"].shape)

[[-0.0115168   0.00311915 -0.00729894 ... -0.05262156 -0.17569277
   0.02565791]
 [-0.00861426  0.06360211 -0.01822355 ... -0.01364703 -0.12153847
   0.05352487]
 [ 0.05854857  0.06891199  0.02622696 ... -0.10057542 -0.19788682
  -0.0039184 ]
 ...
 [ 0.00162342 -0.04411932 -0.0517492  ... -0.10079621 -0.00865952
   0.02637872]
 [-0.14374605 -0.04632217 -0.00650705 ...  0.07464293 -0.04721651
  -0.03829013]
 [ 0.02065966 -0.01334631 -0.02586888 ...  0.03886637 -0.00233481
   0.00107106]]
Token embedding weight tensor dimensions: (50257, 1024)


In [9]:
from GPT.GPT_Model import GPT_CONFIG_124M, GPTModel

In [10]:
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

model_name = "gpt2-small (124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])

In [11]:
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})
NEW_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': True}

In [12]:
NEW_CONFIG = {'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 1024,
 'n_heads': 16,
 'n_layers': 24,
 'drop_rate': 0.1,
 'qkv_bias': True}

In [13]:
gpt = GPTModel(NEW_CONFIG)
gpt.eval();

By default, the GPTModel instance is initialized with random weights for pretraining.

The last step to using OpenAI's model weights is to override these random weights with the weights we loaded into the params dictionary.

For this, we will first define a small assign utility function that checks whether two tensors or arrays (left and right) have the same dimensions or shape and returns the right tensor as trainable PyTorch parameters:

In [14]:
import torch 
import torch.nn as nn 

def assign(left, right):
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")
    return torch.nn.Parameter(torch.tensor(right))

In [15]:
import numpy as np 

def load_weights_into_gpt(model, params):
    # Embedding Layers
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
    
    # Transformer Blocks (Loop over all layers)
    for b in range(len(params["blocks"])):
        # Attention Weights (Q, K, V)
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_block[b].att.W_query.weight = assign(
            gpt.trf_block[b].att.W_query.weight, q_w.T)
        gpt.trf_block[b].att.W_key.weight = assign(
            gpt.trf_block[b].att.W_key.weight, k_w.T)
        gpt.trf_block[b].att.W_value.weight = assign(
            gpt.trf_block[b].att.W_value.weight, v_w.T)
        
        # Attention Biases (Q, K, V)
        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_block[b].att.W_query.bias = assign(
            gpt.trf_block[b].att.W_query.bias, q_b)
        gpt.trf_block[b].att.W_key.bias = assign(
            gpt.trf_block[b].att.W_key.bias, k_b)
        gpt.trf_block[b].att.W_value.bias = assign(
            gpt.trf_block[b].att.W_value.bias, v_b)
        
        # Attention Output Projection
        gpt.trf_block[b].att.out_proj.weight = assign(
            gpt.trf_block[b].att.out_proj.weight, 
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_block[b].att.out_proj.bias = assign(
            gpt.trf_block[b].att.out_proj.bias, 
            params["blocks"][b]["attn"]["c_proj"]["b"])
        
        # Feedforward (MLP) Layers
        gpt.trf_block[b].ffn.layers[0].weight = assign(
            gpt.trf_block[b].ffn.layers[0].weight, 
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_block[b].ffn.layers[0].bias = assign(
            gpt.trf_block[b].ffn.layers[0].bias, 
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_block[b].ffn.layers[2].weight = assign(
            gpt.trf_block[b].ffn.layers[2].weight, 
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_block[b].ffn.layers[2].bias = assign(
            gpt.trf_block[b].ffn.layers[2].bias, 
            params["blocks"][b]["mlp"]["c_proj"]["b"])
        
        # LayerNorm Parameters
        gpt.trf_block[b].norm1.scale = assign(
            gpt.trf_block[b].norm1.scale, 
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_block[b].norm1.shift = assign(
            gpt.trf_block[b].norm1.shift, 
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_block[b].norm2.scale = assign(
            gpt.trf_block[b].norm2.scale, 
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_block[b].norm2.shift = assign(
            gpt.trf_block[b].norm2.shift, 
            params["blocks"][b]["ln_2"]["b"])
    
    # Final LayerNorm and Output Head
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

matching the weights from OpenAI's implementation with our GPTModel implementation.

In [16]:
load_weights_into_gpt(gpt, params)

In [17]:
device = "cuda" if torch.cuda.is_available() else "cpu"
gpt.to(device=device);

In [18]:
device

'cpu'

In [19]:
torch.manual_seed(123)
from GPT.Text_Generation import generate 
from GPT.Tokenization import text_to_tokens, token_to_text
import tiktoken 
tokenizer = tiktoken.get_encoding('gpt2')

text = "Computer"

token_ids = generate(
    model=gpt, 
    idx = text_to_tokens(text, tokenizer = tokenizer).to(device), 
    max_new_tokens=50, 
    context_size=NEW_CONFIG["context_length"], 
    top_k=50, 
    temperature=1.5
)

In [20]:
print(f"Output_text: {token_to_text(token_ids, tokenizer)}")

Output_text: Computer" will create a new local copy of SQL Server 2010 or later using Active Directory Users and Computers permissions to create these credentials:

<CS01> <PSC1>: Set <SYSTEMLOGINID> <SUBCONSE
