In [12]:
from transformers import AutoTokenizer
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
import math
import random
import torch.optim as optim



In [13]:
# gpt paper recreation with attention is all you need paper basics


tokenizer = AutoTokenizer.from_pretrained("openai-community/openai-gpt")
tokenized = tokenizer.encode("California is a state that is part of the United States of America.", return_tensors="pt")

tokenized



tensor([[6983,  544,  246, 2720,  525,  544, 1250,  498,  481, 6373, 5487,  498,
         6056,  239]])

In [21]:
# embeddings
class EmbeddingModel(nn.Module):
  def __init__(self, vocab_size, embedding_dim, max_seq_len):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
    self.position_embedding = nn.Embedding(max_seq_len, embedding_dim)
  def forward(self, input_ids):
    # B = batches of tokens
    # T = number of tokens in batch
   B, T = input_ids.shape

   device = input_ids.device
   positions = torch.arange(0, T, device=device).unsqueeze(0)  # (1, T) sets the batch number to one

   tok = self.token_embedding(input_ids)   # (B, T, dim_model) where #dim_model is the vector size of the token
   pos = self.position_embedding(positions)   # (1, T, dim_model) #T is the number of tokens as mentiuned before

   return (tok + pos)     # (B, T, dim_model) i believe this is type broadcasting



In [15]:

class MultiHeadAttention(nn.Module):
    def __init__(self, model_dim, num_heads):
        super().__init__()
        assert model_dim % num_heads == 0, "model_dim must be divisible by num_heads"

        self.num_heads = num_heads
        self.dim_k = model_dim // num_heads

        # linear projections for Q, K, V
        self.q_proj = nn.Linear(model_dim, model_dim)
        self.k_proj = nn.Linear(model_dim, model_dim)
        self.v_proj = nn.Linear(model_dim, model_dim)

        # final linear layer to recombine heads
        self.out_proj = nn.Linear(model_dim, model_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, model_dim = x.size()

        # projecting inputs to Q, K, V
        Q = self.q_proj(x)  # (batch, seq_len, model_dim)
        K = self.k_proj(x)
        V = self.v_proj(x)

        #reshape to (batch, num_heads, seq_len, dim_k) to split into different heads
        #so first we have allbatches in parralled within those batches we have all
        #heads in parallel and each head has size row x column of num_token by dim of vector

        Q = Q.view(batch_size, seq_len, self.num_heads, self.dim_k).transpose(1, 2)
        K = K.view(batch_size, seq_len, self.num_heads, self.dim_k).transpose(1, 2)
        V = V.view(batch_size, seq_len, self.num_heads, self.dim_k).transpose(1, 2)

        #compute scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.dim_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        attn = F.softmax(scores, dim=-1)
        context = torch.matmul(attn, V)  # (batch, heads, seq_len, dim_k)

        # concatenate heads and project
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, model_dim)
        output = self.out_proj(context)  #(batch, seq_len, model_dim)

        return output


In [16]:
class MLP(nn.Module):
  def __init__(self, model_dim, hidden_layer=2048):
    super().__init__()
    self.l1 = nn.Linear(model_dim, hidden_layer)
    self.l2 = nn.Linear(hidden_layer, model_dim)
    self.relu = nn.ReLU()

  def forward(self, x):
    x = self.l1(x)
    x = self.relu(x)
    x = self.l2(x)

    return x


In [17]:
class TransformerBlock(nn.Module):
    def __init__(self, model_dim, num_heads, mlp_hidden_dim=2048):
        super().__init__()

        #multi head attention
        self.mha = MultiHeadAttention(model_dim, num_heads)

        #layer norms
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)

        #mlp layer
        self.mlp = MLP(model_dim, hidden_layer=mlp_hidden_dim)

    def forward(self, x, mask=None):
        #multi-head attention with residual + layernorm
        attn_out = self.mha(x, mask=mask)
        x = self.norm1(x + attn_out)

        #feedforward  residual + layernorm
        mlp_out = self.mlp(x)
        x = self.norm2(x + mlp_out)

        return x


In [18]:


class GPT(nn.Module):
    def __init__(self, vocab_size, model_dim, max_tokens, num_heads, num_layers, mlp_hidden_dim=None):
        super().__init__()

        if mlp_hidden_dim is None:
            mlp_hidden_dim = 4 * model_dim  # GPT default

        #embedding
        self.embedding = EmbeddingModel(vocab_size, model_dim, max_tokens)

        #transformer block stack
        self.blocks = nn.ModuleList([
            TransformerBlock(model_dim, num_heads, mlp_hidden_dim)
            for _ in range(num_layers)
        ])

        #final linear layer to project to vocab size
        self.lm_head = nn.Linear(model_dim, vocab_size, bias=False)

    def forward(self, input_ids):
        B, T = input_ids.shape

        #fmbedding
        x = self.embedding(input_ids)  # (B, T, model_dim)

        #causal mask
        mask = torch.tril(torch.ones(T, T, device=input_ids.device)).unsqueeze(0).unsqueeze(0)
        # shape: (1, 1, T, T), broadcast over batch and heads

        #pass through transformer blocks
        for block in self.blocks:
            x = block(x, mask=mask)

        #project to vocab logits
        logits = self.lm_head(x)  # (B, T, vocab_size)

        return logits


In [22]:
vocab_size = len(tokenizer)
model_dim = 128 #768
max_tokens = 512 #70000
num_heads = 4 #12
num_layers = 2 #12
max_seq_len = 20
gpt = GPT(vocab_size, model_dim, max_tokens, num_heads, num_layers)

input_ids = torch.randint(0, vocab_size, (2, 20))  # batch=2, seq_len=20
logits = gpt(input_ids)
print(logits.shape)  # (2, 20, 40000 smth)


torch.Size([2, 20, 40478])


In [23]:
batch_size = 2
epochs = 5


data = tokenizer.encode(open("names.txt").read())
data = torch.tensor(data, dtype=torch.long)

#split into sequences of length seq_len+1
seq_len = max_seq_len
examples = []
for i in range(0, len(data) - seq_len):
    chunk = data[i:i + seq_len + 1]
    examples.append(chunk)

device = "cuda" if torch.cuda.is_available() else "cpu"
gpt.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(gpt.parameters(), lr=1e-4)



for epoch in range(epochs):
    random.shuffle(examples)  # shuffle sequences each epoch
    for i in range(0, len(examples), batch_size):
        batch = examples[i:i+batch_size]
        batch = torch.stack(batch).to(device)  # (B, seq_len+1)

        input_ids = batch[:, :-1]   # (B, seq_len)
        target_ids = batch[:, 1:]   # (B, seq_len)

        optimizer.zero_grad()
        logits = gpt(input_ids)     # (B, seq_len, vocab_size)

        loss = criterion(logits.reshape(-1, vocab_size), target_ids.reshape(-1))
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} | Loss: {loss.item():.4f}")


Token indices sequence length is longer than the specified maximum sequence length for this model (69028 > 512). Running this sequence through the model will result in indexing errors


Epoch 1 | Loss: 4.0593
Epoch 2 | Loss: 2.5089
Epoch 3 | Loss: 1.6375
Epoch 4 | Loss: 0.5798
Epoch 5 | Loss: 0.9789
Epoch 6 | Loss: 0.4549
Epoch 7 | Loss: 0.6494
Epoch 8 | Loss: 0.5760
Epoch 9 | Loss: 0.4415
Epoch 10 | Loss: 0.5760


In [25]:
torch.save(gpt.state_dict(), "gpt_weights.pth")

In [26]:
torch.save(gpt, "gpt_full_model.pth")


In [29]:


@torch.no_grad()
def generate_text(model, tokenizer, input_text, max_new_tokens=50, temperature=1.0, top_k=None):

    model.eval()


    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    input_ids = input_ids.to(next(model.parameters()).device)

    for _ in range(max_new_tokens):

        logits = model(input_ids)               # (1, seq_len, vocab_size)
        logits = logits[:, -1, :] / temperature # focus on last token


        if top_k is not None:
            top_vals, top_idx = torch.topk(logits, top_k)
            probs = torch.zeros_like(logits).scatter_(1, top_idx, F.softmax(top_vals, dim=-1))
        else:
            probs = F.softmax(logits, dim=-1)


        next_token = torch.multinomial(probs, num_samples=1)  # (1, 1)


        input_ids = torch.cat([input_ids, next_token], dim=1)

    output_text = tokenizer.decode(input_ids[0].tolist())
    return output_text


In [28]:
generate_text(gpt, tokenizer, "nandita")

'nandita layani lahera lorkallin meefa zjadalyn jailay jaiyon talaai jakia aeijazmarie jazanna jadee jazlett jerneoma jren a jalimhir jamere jaela jaide jino ja'