In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-09-30 20:57:23--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.2’


2025-09-30 20:57:23 (36.3 MB/s) - ‘input.txt.2’ saved [1115394/1115394]



In [1]:
import os
import requests
import tiktoken
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")


Using device: cuda


In [2]:
input_file_path = 'input.txt'
with open(input_file_path, 'r', encoding='utf-8') as f:
    text = f.read()


# this is just for character-level tokentization as i only have a mac M4
# Sentencepiece is whats used commonly within the NLP community
chars = sorted(list(set(text)))
vocab_size = len(chars) # gpt2 is around 50K dimensional embeddings

# Encode  decode
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


class CharDataset(Dataset):
    def __init__(self, data: torch.Tensor, block_size: int):
        self.data = data
        self.block_size = block_size

    def __len__(self) -> int:
        return len(self.data) - self.block_size - 1

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        chunk = self.data[idx:idx + self.block_size + 1]
        return chunk[:-1].clone().to(device), chunk[1:].clone().to(device)




In [5]:

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super().__init__()
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        assert (
            self.head_dim * num_heads == embed_size
        ), "Embedding size must be divisible by number of heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, x, mask=None):
        batch_size = x.shape[0]
        seq_len = x.shape[1]

        # Split the embedding into num_heads different pieces
        x = x.reshape(batch_size, seq_len, self.num_heads, self.head_dim)

        values = self.values(x)
        keys = self.keys(x)
        queries = self.queries(x)

        # Scaled dot-product attention
        attention = torch.einsum("bqhd,bkhd->bhqk", [queries, keys])
        attention = attention / (self.embed_size ** (1/2))

        if mask is not None:
            attention = attention.masked_fill(mask == 0, float("-1e20"))

        # Apply softmax
        attention = F.softmax(attention, dim=-1)

        out = torch.einsum("bhql,blhd->bqhd", [attention, values])
        out = out.reshape(batch_size, seq_len, self.embed_size)

        return self.fc_out(out)

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, num_heads, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, num_heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size)
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attention = self.attention(x, mask)
        x = self.norm1(attention + x)
        x = self.dropout(x)

        forward = self.feed_forward(x)
        x = self.norm2(forward + x)
        x = self.dropout(x)

        return x

class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, block_size):
        super().__init__()
        self.block_size = block_size
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.pos_embedding = nn.Embedding(block_size, embed_size)

        self.layers = nn.ModuleList([
            TransformerBlock(embed_size, num_heads) for _ in range(num_layers)
        ])

        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, x, targets=None):
        batch_size, seq_len = x.shape
        positions = torch.arange(0, seq_len, device=x.device).unsqueeze(0).repeat(batch_size, 1)

        x = self.embedding(x) + self.pos_embedding(positions)

        # Create a causal mask
        mask = torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len).to(x.device)

        for layer in self.layers:
            x = layer(x, mask)

        logits = self.fc_out(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits_flat = logits.view(B*T, C)
        targets_flat = targets.view(-1)
        loss = F.cross_entropy(logits_flat, targets_flat)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        self.eval()
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop idx to the last block_size tokens
                idx_cond = idx[:, -self.block_size:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # Get the last time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        return idx



In [6]:
## hyperparams:
block_size= 32
batch_size = 32
embed_size=64
num_heads=4
num_layers=4
learning_rate=3e-4


tokenized_data = torch.tensor(encode(text), dtype=torch.long, device=device)
n = len(tokenized_data)
train_data = tokenized_data[:int(n*0.9)]
val_data = tokenized_data[int(n*0.9):]

train_dataset = CharDataset(train_data.cpu(), block_size = block_size)  # Keep data on CPU for DataLoader
val_dataset = CharDataset(val_data.cpu(), block_size = block_size)

# Create data loaders

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


# Create model and move to device
# Initialize model
model = SimpleTransformer(
    vocab_size=vocab_size,
    embed_size=embed_size,
    num_heads=num_heads,
    num_layers=num_layers,
    block_size=block_size
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [7]:

# Training
num_epochs = 2
print("Starting Training: ")
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        _, loss = model(x_batch, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for x_batch, y_batch in val_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            _, loss = model(x_batch, y_batch)
            val_loss += loss.item()

    print(f'Epoch {epoch+1}',
          f'Train Loss: {train_loss/len(train_loader)}',
          f'Val Loss: {val_loss/len(val_loader)}')

# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=100)
print(decode(generated[0].cpu().tolist()))  # Move back to CPU for decoding

Starting Training: 
Epoch 1 Train Loss: 2.1601704824719414 Val Loss: 1.9464015779057398
Epoch 2 Train Loss: 1.9116269736598546 Val Loss: 1.844886560522159


AAPUTOS:
An my I foully that for breamber.
What dreign, he the but appet about jurght,
But me as of


In [9]:
# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
print(decode(generated[0].cpu().tolist()))  # Move back to CPU for decoding


Mederperath that you hour. O milthering; I am appOn doonous his all thou me; matt away they the quatsher fay.

Fill:

SICINIO:
Fir have shall poing till Meciady at him: shop that we can voble on these imp
Had me comewe to young, brotherings that tirte
to stroubful forth un this Hengmy and the lady.

UCKINGHAM:
I
Tild sring in the have godgam!

ROMEO:
Give, to--tell. Now, sir, her fath in father at thee breath!
What to the joyint; stamor: I you wear,
For And beam, me stonsict all have cale boy longs Nay.

man RICHARD III:
as I tenter heare with did your Rickion.
Have resthy may vartue back me is your nobt,
For the reeds, my and too gon agood the geen are the host,
Not is, my sould do save: nay, that comme sheet we beford
will me this be him our all argase:
Fake teech or see riete!
Ang Pher hour strason, rothought?

First Cusizent then this have a be dive entlelt flay.

QUEEN MARGARE:
Be of mave and yearted.

For you.

SicINIUS:
For thy of thee, my no our dids;
I he seeptal cronseds yet