In [2]:
import torch

##### GPT-2 - 124M

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face model id; reused further down when building the text-generation pipeline.
model_name = "gpt2"

# Reference Hugging Face model; the following cells inspect its state dict and parameter count.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [11]:
# Print each state-dict entry of the reference model next to its tensor shape.
for param_name, tensor in model.state_dict().items():
    print(param_name, tensor.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [22]:
# Total number of scalar entries over everything yielded by model.parameters().
sum(weight.numel() for weight in model.parameters())


124439808

In [39]:
# Scratch check of Tensor.view: start from a random 3x4 tensor ...
a = torch.randn(3,4)

# ... and flatten it with view(-1); the output below shows the resulting 12 values in one dimension.
a.view(-1)

tensor([ 0.4579,  0.1656, -0.7620,  0.3053,  0.0652, -0.1108,  0.9210,  0.4327,
        -0.0926,  0.6806, -1.2805,  0.3362])

In [18]:
from transformers import pipeline, set_seed

# Hugging Face text-generation pipeline for the same model id (model_name), used in the next cell.
pipe = pipeline("text-generation", model=model_name)

Device set to use mps:0


In [24]:
import numpy as np

# Seed via transformers.set_seed before sampling from the pipeline.
set_seed(42)
# Ask for 5 continuations of the prompt, each with at most 30 newly generated tokens.
pipe("Hello Boss! Welcome", max_new_tokens=30, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello Boss! Welcome to my new account, Boss. I'll be your new best friend, your new best buddy. I'll be your new best friend, your new"},
 {'generated_text': 'Hello Boss! Welcome to the second episode of our episode of the "Bravo Guide to the Superheroes" series, and it\'s a very special episode.'},
 {'generated_text': 'Hello Boss! Welcome to the Dungeon! Welcome to the World! Welcome to the Wolfpack! Welcome to the Underdog Bounty Hunter Welcome to the Underdog Bounty Hunter 2'},
 {'generated_text': "Hello Boss! Welcome to the new book, The Magician's Apprentice: The True Story of the Mystic Thief. It gives you a comprehensive look at the development of the"},
 {'generated_text': "Hello Boss! Welcome to Smash Mouth.\n\nYou can follow Smash Mouth on Twitter at @Smash_Mew, or if you're not sure of where to"}]

In [126]:
from torch import nn
from dataclasses import dataclass
import torch.nn.functional as F

@dataclass
class GPTConfig:
    """Hyperparameters consumed by the GPT model classes defined in this cell.

    The defaults agree with the shapes printed earlier for the "gpt2"
    checkpoint (a 50257 x 768 token table and a 1024 x 768 position table).
    """
    block_size: int = 1024  # rows of the position table (wpe) and side length of the causal mask
    vocab_size: int = 50257  # rows of the token table (wte) and output width of lm_head
    n_layer: int = 12  # number of Block modules stacked in GPT.transformer.h
    n_head: int = 12  # attention heads per block; CausalSelfAttention asserts it divides n_embd
    n_embd: int = 768  # embedding / hidden width used by every layer


class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position only sees itself and earlier positions.

    Reads ``n_embd``, ``n_head`` and ``block_size`` from ``config``.
    Input and output of ``forward`` are both shaped (batch, seq_len, n_embd).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        width = config.n_embd
        self.n_head = config.n_head
        self.n_embd = width
        self.head_dim = width // config.n_head

        # One linear layer emits queries, keys and values side by side.
        self.c_attn = nn.Linear(width, 3 * width)
        # Applied after the heads have been merged back to full width.
        self.c_proj = nn.Linear(width, width)

        # Lower-triangular ones, shaped (1, 1, block_size, block_size);
        # zeros mark the (query, key) pairs that must not attend.
        span = config.block_size
        lower_tri = torch.ones(span, span).tril()
        self.register_buffer("bias", lower_tri[None, None, :, :])

    def forward(self, x):
        batch, seq_len, width = x.size()

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(batch, seq_len, self.n_head, self.head_dim).transpose(1, 2)

        # Project once, cut into thirds along the feature axis, then split into heads.
        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # Scaled dot-product scores; masked entries become -inf so softmax gives them weight 0.
        scale = 1.0 / (self.head_dim ** 0.5)
        scores = (q @ k.transpose(-2, -1)) * scale
        blocked = self.bias[:, :, :seq_len, :seq_len] == 0
        weights = F.softmax(scores.masked_fill(blocked, float('-inf')), dim=-1)

        # Weighted sum of values per head, then heads concatenated back to (B, T, C).
        merged = (weights @ v).transpose(1, 2).contiguous().view(batch, seq_len, width)
        return self.c_proj(merged)


class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, apply tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.c_proj = nn.Linear(hidden, width)
        self.act = nn.GELU(approximate="tanh")

    def forward(self, x):
        # expand -> non-linearity -> contract, applied to the last dimension
        return self.c_proj(self.act(self.c_fc(x)))


class Block(nn.Module):
    """Pre-norm transformer block: x + attn(ln_1(x)), followed by x + mlp(ln_2(x))."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        # Attention sub-layer with residual connection.
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        # Feed-forward sub-layer with residual connection.
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out


class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    Submodule names (``transformer.wte``, ``transformer.wpe``, ``transformer.h``,
    ``transformer.ln_f``, ``lm_head``) mirror the Hugging Face GPT-2 state-dict
    keys so that pretrained tensors can be copied across key by key.
    """

    def __init__(self, config):
        """Build the model from ``config`` (block_size, vocab_size, n_layer, n_head, n_embd)."""
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd)
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # GPT-2 shares one matrix between the token embedding and the output
        # projection. Without this the model carries a second
        # vocab_size x n_embd matrix and parameters() over-counts relative to
        # the reference checkpoint. Both state-dict keys are still present.
        self.transformer.wte.weight = self.lm_head.weight

    def forward(self, idx):
        """Return next-token logits.

        Args:
            idx: integer token ids of shape (batch, seq_len), with
                seq_len <= config.block_size.

        Returns:
            Tensor of shape (batch, seq_len, vocab_size).

        Raises:
            AssertionError: if seq_len exceeds config.block_size.
        """
        B, T = idx.size()
        assert T <= self.config.block_size, f"Sequence length {T} exceeds block size {self.config.block_size}"

        # Positions 0..T-1, shaped (1, T) so they broadcast over the batch.
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device).unsqueeze(0)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        return logits

    @classmethod
    def from_pretrained(cls, model_name):
        """Load pretrained GPT-2 weights from HuggingFace.

        Args:
            model_name: identifier passed to ``GPT2LMHeadModel.from_pretrained``.

        Returns:
            A ``GPT`` instance whose weights were copied from the checkpoint.

        Raises:
            KeyError: if the checkpoint and this model disagree on weight names.
            AssertionError: if a tensor has an unexpected shape.
        """
        from transformers import GPT2LMHeadModel

        print(f"Loading weights from pretrained GPT-2: {model_name}")
        model_hf = GPT2LMHeadModel.from_pretrained(model_name)
        hf_config = model_hf.config

        config = GPTConfig(
            block_size=hf_config.n_positions,
            vocab_size=hf_config.vocab_size,
            n_layer=hf_config.n_layer,
            n_head=hf_config.n_head,
            n_embd=hf_config.n_embd
        )

        gpt = cls(config)
        sd = gpt.state_dict()
        sd_hf = model_hf.state_dict()

        # Attention mask buffers are not learned weights; skip them on both sides.
        mask_suffixes = ('.attn.masked_bias', '.attn.bias')
        sd_keys_hf = [k for k in sd_hf.keys() if not k.endswith(mask_suffixes)]
        sd_keys = [k for k in sd.keys() if not k.endswith(mask_suffixes)]

        # Fail loudly if the two models do not describe the same set of weights,
        # instead of leaving some of ours at their random initial values.
        # lm_head.weight may be absent from a checkpoint: it is tied to wte here.
        unexpected = sorted(set(sd_keys_hf) - set(sd_keys))
        missing = sorted(set(sd_keys) - set(sd_keys_hf) - {'lm_head.weight'})
        if unexpected or missing:
            raise KeyError(
                f"State dict mismatch: missing from checkpoint {missing}, "
                f"unexpected in checkpoint {unexpected}"
            )

        # HuggingFace uses Conv1D which stores weights transposed
        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')

        # state_dict() tensors share storage with the parameters, so in-place
        # copies update the model itself.
        with torch.no_grad():
            for k in sd_keys_hf:
                if k.endswith(transposed):
                    # Transpose Conv1D weights to Linear weights
                    assert sd_hf[k].shape[::-1] == sd[k].shape, f"Shape mismatch for {k}"
                    sd[k].copy_(sd_hf[k].t())
                else:
                    # Direct copy
                    assert sd_hf[k].shape == sd[k].shape, f"Shape mismatch for {k}"
                    sd[k].copy_(sd_hf[k])

        print(f"Loaded {len(sd_keys_hf)} weight tensors")
        return gpt

In [127]:
# Build our own GPT and copy the pretrained "gpt2" tensors into it
# (from_pretrained transposes the Hugging Face Conv1D weights into nn.Linear layout).
mymodel = GPT.from_pretrained("gpt2")





Loading weights from pretrained GPT-2: gpt2
Loaded 149 weight tensors


In [133]:
def generate(model, prompt, max_new_tokens=50, temperature=1.0, top_k=None, tokenizer=None):
    """
    Generate text using the GPT model.

    Args:
        model: module whose forward maps (batch, seq_len) token ids to
            (batch, seq_len, vocab) logits and which exposes
            ``config.block_size``. It is switched to eval mode.
        prompt: text to continue; must encode to at least one token.
        max_new_tokens: number of tokens to append.
        temperature: logits are divided by this before sampling. ``0`` selects
            greedy decoding (argmax) instead of sampling.
        top_k: if given, sample only among the ``top_k`` highest logits.
        tokenizer: optional object with ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``. Defaults to tiktoken's "gpt2" encoding.

    Returns:
        The decoded prompt tokens followed by the generated tokens, as one string.

    Raises:
        ValueError: on a negative temperature, ``top_k < 1`` or an empty prompt.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be >= 0, got {temperature}")
    if top_k is not None and top_k < 1:
        raise ValueError(f"top_k must be >= 1 or None, got {top_k}")

    if tokenizer is None:
        # Imported here so the function works on a fresh kernel; the notebook
        # has no top-level tiktoken import.
        import tiktoken
        tokenizer = tiktoken.get_encoding("gpt2")

    tokens = tokenizer.encode(prompt)
    if len(tokens) == 0:
        # There is no last position to read logits from.
        raise ValueError("prompt must encode to at least one token")

    model.eval()
    device = next(model.parameters()).device
    input_ids = torch.tensor([tokens], dtype=torch.long, device=device)
    block_size = model.config.block_size

    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Keep at most the last block_size tokens as context.
            idx_cond = input_ids[:, -block_size:]

            # Logits for the last position only.
            logits = model(idx_cond)[:, -1, :]

            if temperature == 0:
                # Greedy decoding: dividing by zero would yield inf/nan logits.
                next_token = torch.argmax(logits, dim=-1, keepdim=True)
            else:
                logits = logits / temperature

                # Optional top-k filtering: drop everything below the k-th largest logit.
                if top_k is not None:
                    kth_largest = torch.topk(logits, min(top_k, logits.size(-1))).values[:, [-1]]
                    logits = logits.masked_fill(logits < kth_largest, float('-inf'))

                probs = torch.softmax(logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)

            # Append to sequence
            input_ids = torch.cat([input_ids, next_token], dim=1)

    # Decode and return
    return tokenizer.decode(input_ids[0].tolist())


# Example usage - now with properly loaded weights!
# generate() draws tokens with torch.multinomial, so seed PyTorch first to make the
# printed continuation reproducible (same seed value as the pipeline cell above).
torch.manual_seed(42)
prompt = "Hello Boss! Welcome"
generated_text = generate(mymodel, prompt, max_new_tokens=30)
print(f"Prompt: {prompt}")
print(f"Generated: {generated_text}")

Prompt: Hello Boss! Welcome
Generated: Hello Boss! Welcome Frenker FTL: Faster Than Light Fuego! Fugl Full Bore Full Metal Furies Full Mojo Rampage Full Throttle Remastered Full
