In [1]:
import sys
import os

# Put the parent directory on the import path (presumably where the local
# `GPT` package imported further down lives -- confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it does not keep
# appending duplicate entries to sys.path.
_parent_dir = os.path.abspath("..")
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
import torch 
import torch.nn as nn

In [3]:
# Per-variant architecture settings, keyed by the model's display name.
# Each row is (name, emb_dim, n_layers, n_heads).
model_configs = {
    variant: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for variant, emb_dim, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

# Settings shared by every variant.
BASE_CONFIG = dict(
    vocab_size=50257,      # Vocabulary size
    context_length=1024,   # Context length
    drop_rate=0.0,         # Dropout rate
    qkv_bias=True,         # Query-key-value bias
)

# Merge in the architecture settings of the variant used in this notebook.
BASE_CONFIG.update(model_configs["gpt2-small (124M)"])

In [4]:
import os
import urllib.request

# Checkpoint file name; it is both the local path and the last URL segment.
file_name = "gpt2-small-124M.pth"
url = f"https://huggingface.co/rasbt/gpt2-from-scratch-pytorch/resolve/main/{file_name}"

# Download only when the checkpoint is not already present locally.
if not os.path.exists(file_name):
    # Fetch into a temporary ".part" file and rename it only after the
    # transfer finished. Writing straight to `file_name` would leave a
    # truncated file behind if the download is interrupted, and the
    # existence check above would then treat it as complete on the next run.
    tmp_name = f"{file_name}.part"
    try:
        urllib.request.urlretrieve(url, tmp_name)
        os.replace(tmp_name, file_name)
    finally:
        # After a successful rename the temp file is gone; after a failure
        # this removes the partial download before the error propagates.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
    print(f"Downloaded to {file_name}")

In [5]:
from GPT.GPT_Model import GPTModel
# Build the model from BASE_CONFIG (the shared settings merged with the
# selected variant's emb_dim / n_layers / n_heads). The downloaded checkpoint
# is loaded into this instance in a later cell.
gpt = GPTModel(BASE_CONFIG)

In [6]:
import torch

# Load the checkpoint onto the CPU.
# weights_only=True limits unpickling to tensors and plain containers, so a
# file fetched from the internet cannot run arbitrary code while being loaded.
# NOTE(review): the keyword needs a PyTorch release that supports it -- confirm
# the minimum version used with this notebook.
state_dict = torch.load(
    "gpt2-small-124M.pth", map_location="cpu", weights_only=True
)

# Rename checkpoint keys so they match this model's module names
# (`trf_block` and `ffn`, as shown in the model repr).
new_state_dict = {}
for k, v in state_dict.items():
    new_k = k.replace("trf_blocks", "trf_block")  # fixing block name
    new_k = new_k.replace("ff.", "ffn.")          # fixing feedforward name
    new_state_dict[new_k] = v

# strict=False tolerates key mismatches instead of raising, which would hide a
# wrong rename: the affected layers would silently keep their initial weights.
# Keep the non-strict load but report whatever did not line up.
load_result = gpt.load_state_dict(new_state_dict, strict=False)
if load_result.missing_keys:
    print(f"WARNING: model parameters not found in checkpoint: {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"WARNING: checkpoint entries not used by the model: {load_result.unexpected_keys}")

# Switch to evaluation mode; as the cell's last expression this also displays
# the module structure.
gpt.eval()


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [9]:
from GPT.Text_Generation import generate
from GPT.Tokenization import token_to_text, text_to_tokens
import tiktoken

# Tokenizer for the "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_input = "What is Linear Regression"

# Move the model and the encoded prompt to the chosen device.
gpt_on_device = gpt.to(device)
prompt_ids = text_to_tokens(model_input, tokenizer).to(device)

# Generate up to 100 new tokens.
# NOTE(review): top_k=1 presumably makes decoding greedy regardless of the
# temperature -- confirm against the implementation of `generate`.
token_ids = generate(
    model=gpt_on_device,
    idx=prompt_ids,
    max_new_tokens=100,
    context_size=BASE_CONFIG["context_length"],
    top_k=1,
    temperature=1.0,
)

# Decode the generated ids back to text and show the result.
output_text = token_to_text(token_ids, tokenizer)
print(f"Output text:\n {output_text}")

Output text:
 What is Linear Regression?

Linear regression is a technique that uses a set of data points to predict the likelihood of a given outcome. It is a method that is used to predict the likelihood of a given outcome.

The following example shows how linear regression can be used to predict the likelihood of a given outcome.

The following example shows how linear regression can be used to predict the likelihood of a given outcome.

The following example shows how linear regression can be used to predict the likelihood of
