In [1]:
import argparse
import os
import math
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from torch.nn import LayerNorm
from torch.utils.data import Dataset, DataLoader
from urllib.request import urlretrieve

print('all the nuccesary libraries and packages are imported')

all the nuccesary libraries and packages are imported


In [2]:
import urllib.request

DATA_FILE = "tiny_shakespeare.txt"
DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

print("Dataset already exists:", DATA_FILE)


Dataset already exists: tiny_shakespeare.txt


In [3]:
@dataclass
class HParams:
    block_size: int = 64
    n_layers: int = 2
    n_heads: int = 4
    d_model: int = 128
    d_ff: int = 512
    dropout: float = 0.1
    emb_dropout: float = 0.0
    lr: float = 3e-4
    weight_decay: float = 0.1
    betas: tuple = (0.9, 0.95)
    batch_size: int = 16
    max_iters: int = 2000
    eval_interval: int = 500
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
    seed: int = 42
    grad_clip: float = 1.0
    top_k: int = 40
    temperature: float = 1.0

hps = HParams()
print("Using device:", hps.device)


Using device: cpu


In [4]:
def set_seed(seed):
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def build_vocab(text):
    chars = sorted(list(set(text)))
    stoi = {ch:i for i,ch in enumerate(chars)}
    itos = {i:ch for ch,i in stoi.items()}
    return stoi, itos

class CharDataset(Dataset):
    def __init__(self, data, block_size):
        self.data = data
        self.block_size = block_size

    def __len__(self):
        return len(self.data) - self.block_size - 1

    def __getitem__(self, idx):
        x = self.data[idx:idx + self.block_size]
        y = self.data[idx + 1:idx + 1 + self.block_size]
        return x, y


In [5]:
set_seed(hps.seed)

with open(DATA_FILE, "r", encoding="utf-8") as f:
    text = f.read()

stoi, itos = build_vocab(text)
vocab_size = len(stoi)

data = torch.tensor([stoi[c] for c in text], dtype=torch.long)

split = int(0.9 * len(data))
train_data = data[:split]
val_data = data[split:]

train_ds = CharDataset(train_data, hps.block_size)
val_ds = CharDataset(val_data, hps.block_size)

train_loader = DataLoader(train_ds, batch_size=hps.batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=hps.batch_size)

print("Vocab size:", vocab_size)


Vocab size: 65


In [6]:
class CausalSelfAttention(nn.Module):
    def __init__(self, d_model, n_heads, block_size, dropout):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.scale = self.head_dim ** -0.5

        self.qkv = nn.Linear(d_model, 3 * d_model)
        self.proj = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

        self.register_buffer(
            "mask",
            torch.tril(torch.ones(block_size, block_size))
        )

    def forward(self, x):
        B, T, D = x.size()
        qkv = self.qkv(x)
        qkv = qkv.view(B, T, 3, self.n_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv

        att = (q @ k.transpose(-2, -1)) * self.scale
        att = att.masked_fill(self.mask[:T, :T] == 0, float("-inf"))
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)

        out = att @ v
        out = out.transpose(1, 2).contiguous().view(B, T, D)
        return self.proj(out)

class Block(nn.Module):
    def __init__(self, hps):
        super().__init__()
        self.ln1 = LayerNorm(hps.d_model)
        self.attn = CausalSelfAttention(hps.d_model, hps.n_heads, hps.block_size, hps.dropout)
        self.ln2 = LayerNorm(hps.d_model)
        self.ff = nn.Sequential(
            nn.Linear(hps.d_model, hps.d_ff),
            nn.GELU(),
            nn.Linear(hps.d_ff, hps.d_model),
            nn.Dropout(hps.dropout)
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x

class TinyGPT(nn.Module):
    def __init__(self, vocab_size, hps):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, hps.d_model)
        self.pos_emb = nn.Embedding(hps.block_size, hps.d_model)
        self.blocks = nn.Sequential(*[Block(hps) for _ in range(hps.n_layers)])
        self.ln = LayerNorm(hps.d_model)
        self.head = nn.Linear(hps.d_model, vocab_size)

    def forward(self, x, targets=None):
        B, T = x.size()
        pos = torch.arange(T, device=x.device)
        x = self.tok_emb(x) + self.pos_emb(pos)
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.head(x)

        if targets is None:
            return logits

        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


In [7]:
device = torch.device(hps.device)
model = TinyGPT(vocab_size, hps).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=hps.lr)

print("Total parameters:", sum(p.numel() for p in model.parameters()))

model.train()
step = 0

for epoch in range(1, 4):
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)

        logits, loss = model(x, y)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), hps.grad_clip)
        optimizer.step()

        if step % 100 == 0:
            print(f"Epoch {epoch} | Step {step} | Loss {loss.item():.4f}")

        step += 1
        if step >= hps.max_iters:
            break
    if step >= hps.max_iters:
        break

print("Training finished ✅")


Total parameters: 421697
Epoch 1 | Step 0 | Loss 4.3895
Epoch 1 | Step 100 | Loss 2.6942
Epoch 1 | Step 200 | Loss 2.6674
Epoch 1 | Step 300 | Loss 2.5395
Epoch 1 | Step 400 | Loss 2.5116
Epoch 1 | Step 500 | Loss 2.4569
Epoch 1 | Step 600 | Loss 2.4013
Epoch 1 | Step 700 | Loss 2.4101
Epoch 1 | Step 800 | Loss 2.2895
Epoch 1 | Step 900 | Loss 2.2200
Epoch 1 | Step 1000 | Loss 2.2228
Epoch 1 | Step 1100 | Loss 2.2287
Epoch 1 | Step 1200 | Loss 2.2798
Epoch 1 | Step 1300 | Loss 2.1802
Epoch 1 | Step 1400 | Loss 2.1679
Epoch 1 | Step 1500 | Loss 2.1066
Epoch 1 | Step 1600 | Loss 2.1370
Epoch 1 | Step 1700 | Loss 2.0892
Epoch 1 | Step 1800 | Loss 2.0576
Epoch 1 | Step 1900 | Loss 2.0118
Training finished ✅
