In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F


In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
# Load the entire training corpus from disk into one in-memory string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [4]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids (position in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: idx for idx, ch in itos.items()}


def encode(s):
    """Map a string to the list of integer ids of its characters."""
    return [stoi[ch] for ch in s]


def decode(l):
    """Map a list of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)


In [5]:
# Encode the whole corpus once, then keep the first 90% of the ids for
# training and hold out the remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [None]:
# Batching configuration. get_batchsplit looks these names up at call time, so
# the values assigned in the later hyperparameter cell are the ones in effect
# once that cell has run.
block_size = 16   # number of tokens in each example
batch_size = 32   # number of examples per batch
eval_iters = 100


def get_batchsplit(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A pair ``(x, y)`` of tensors, each of shape (batch_size, block_size)
        and moved to ``device``. ``y`` is ``x`` shifted one position to the
        right in the source sequence, i.e. the next-token targets.
    """
    source = train_data if split == 'train' else val_data
    # Start offsets are drawn so that the shifted target window stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)


In [7]:
class Linear:
    """Affine map ``x @ w (+ b)`` with hand-managed parameter tensors.

    ``w`` has shape (_in, _out) and is drawn from a standard normal scaled by
    ``_in ** -0.5``; ``b`` is a zero vector of length ``_out`` when ``bias`` is
    true and ``None`` otherwise.
    """

    def __init__(self, _in, _out, bias=False):
        scale = _in ** -0.5
        self.w = torch.randn(_in, _out) * scale
        self.b = torch.zeros(_out) if bias else None

    def forward(self, x):
        """Apply the projection to the last dimension of ``x``."""
        projected = x @ self.w
        if self.b is None:
            return projected
        projected += self.b
        return projected

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the trainable tensors: ``[w]`` or ``[w, b]``."""
        params = [self.w]
        if self.b is not None:
            params.append(self.b)
        return params
#-----------------------------------------------------------------------------

class MultiHeadAttention:
    """Causal multi-head self-attention with a fused Q/K/V projection.

    Args:
        embed_dim: width C of the input and output features.
        num_heads: number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide
            ``embed_dim`` (the per-head reshape in ``forward`` would otherwise
            fail with an opaque shape error).
    """

    def __init__(self, embed_dim, num_heads):
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.head_size = embed_dim // num_heads

        # One big projection producing Q, K and V for all heads at once.
        self.qkv = Linear(embed_dim, 3 * embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a (B, T, C) tensor.

        Position t only attends to positions <= t (lower-triangular mask).
        """
        B, T, C = x.shape

        # Step 1: project and split into q, k, v (C = num_heads * head_size).
        qkv = self.qkv(x)  # (B, T, 3*C)
        qkv = qkv.reshape(B, T, 3, self.num_heads, self.head_size)
        # Move the q/k/v axis to the front so each can be indexed directly.
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, B, n_heads, T, head_size)
        q, k, v = qkv[0], qkv[1], qkv[2]  # each: (B, n_heads, T, head_size)

        # Step 2: scaled dot-product scores with a causal mask.
        attn_scores = (q @ k.transpose(-1, -2)) / (self.head_size ** 0.5)
        # (T, T) lower-triangular mask; it broadcasts over batch and heads.
        mask = torch.ones(T, T, device=attn_scores.device).tril()
        attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_probs = F.softmax(attn_scores, dim=-1)

        # Step 3: weighted sum of values, then merge the heads back into C.
        out = attn_probs @ v  # (B, n_heads, T, head_size)
        out = out.transpose(1, 2).reshape(B, T, C)

        return self.out_proj(out)

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the tensors of the fused QKV and output projections."""
        return [*self.qkv.parameters(), *self.out_proj.parameters()]

#-----------------------------------------------------------------------------
class LayerNorm:
    """Layer normalisation over the last dimension with learnable scale/shift.

    Each feature vector is normalised to zero mean and unit variance, then
    multiplied by ``gamma`` (initialised to ones) and shifted by ``beta``
    (initialised to zeros). ``dim`` is the size of the last dimension.
    """

    def __init__(self, dim):
        self.eps = 1e-8
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def forward(self, x):
        """Normalise ``x`` along its last axis and apply the affine transform."""
        xmean = x.mean(-1, keepdim=True)
        # Layer norm is defined with the biased (population) variance. The
        # default unbiased estimator divides by (dim - 1), which rescales the
        # output and produces NaN when the normalised dimension has size 1.
        xvar = x.var(-1, keepdim=True, unbiased=False)
        return self.gamma * (x - xmean) / (torch.sqrt(xvar + self.eps)) + self.beta

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return ``[gamma, beta]``."""
        return [self.gamma, self.beta]

#-----------------------------------------------------------------------------

class ReLU:
    """Parameter-free rectifier: replaces negative entries with zero."""

    def forward(self, x):
        """Return ``x`` with every element clamped to be at least 0.0."""
        return x.clamp(min=0.0)

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """This layer has nothing to train, so the list is empty."""
        return list()
#-----------------------------------------------------------------------------


class Sequential:
    """Chain of layers applied one after another.

    The layers are passed as positional arguments and stored, in order, as the
    tuple ``self.layers``. Each layer must be callable and expose
    ``parameters()``.
    """

    def __init__(self, *layers):
        self.layers = layers

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        out = x
        for stage in self.layers:
            out = stage(out)
        return out

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the parameters of all layers, concatenated in layer order."""
        collected = []
        for stage in self.layers:
            collected.extend(stage.parameters())
        return collected

#-----------------------------------------------------------------------------

class FeedForward:
    """Position-wise MLP: expand to ``4 * dim``, apply ReLU, project back to ``dim``."""

    def __init__(self, dim):
        hidden = 4 * dim
        expand = Linear(dim, hidden)
        activation = ReLU()
        contract = Linear(hidden, dim)
        self.net = Sequential(expand, activation, contract)

    def forward(self, x):
        """Run ``x`` through the wrapped network."""
        return self.net(x)

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return the parameters of the wrapped network."""
        return self.net.parameters()

#-----------------------------------------------------------------------------

class TransformerBlock:
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    Args:
        emb_dim: feature width handed to both layer norms, the attention layer
            and the feed-forward network.
        num_heads: number of attention heads.
    """

    def __init__(self, emb_dim, num_heads):
        self.ln1 = LayerNorm(emb_dim)
        self.attn = MultiHeadAttention(emb_dim, num_heads)
        self.ln2 = LayerNorm(emb_dim)
        self.ff = FeedForward(emb_dim)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        # Normalise first, attend, then add back onto the residual stream.
        attended = self.attn(self.ln1(x))
        x = x + attended
        # Same pattern for the feed-forward sub-layer.
        transformed = self.ff(self.ln2(x))
        return x + transformed

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return parameters in the order ln1, attn, ln2, ff."""
        gathered = []
        for part in (self.ln1, self.attn, self.ln2, self.ff):
            gathered.extend(part.parameters())
        return gathered

#-----------------------------------------------------------------------------

class Embedding:
    """Lookup table mapping integer ids to rows of a (num_emb, emb_dim) matrix.

    The table is initialised from a standard normal distribution.
    """

    def __init__(self, num_emb, emb_dim):
        shape = (num_emb, emb_dim)
        self.table = torch.randn(*shape)

    def forward(self, idx):
        """Return the table rows selected by the integer tensor ``idx``."""
        rows = self.table[idx]
        return rows

    def __call__(self, idx):
        return self.forward(idx)

    def parameters(self):
        """Return ``[table]``."""
        return [self.table]

#-----------------------------------------------------------------------------

class PositionalEmbedding:
    """Learned position table of shape (max_len, emb_dim), initialised from N(0, 1)."""

    def __init__(self, max_len, emb_dim):
        shape = (max_len, emb_dim)
        self.pe = torch.randn(*shape)

    def forward(self, x):
        """Return the first T rows of the table for a 2-D input of shape (B, T).

        Only the shape of ``x`` is used; its values are ignored.
        """
        _, seq_len = x.shape
        return self.pe[:seq_len]

    def __call__(self, x):
        return self.forward(x)

    def parameters(self):
        """Return ``[pe]``."""
        return [self.pe]

#-----------------------------------------------------------------------------

class GPT:
    """Decoder-only transformer language model assembled from the layers above.

    Args:
        emb_dim: width of the token/position embeddings and of every block.
        vocab_size: number of distinct token ids; also the size of the logits.
        block_size: maximum sequence length the model can process at once.
        num_heads: attention heads per transformer block.
        num_layers: number of stacked transformer blocks.
    """

    def __init__(self, emb_dim, vocab_size, block_size, num_heads, num_layers):
        self.block_size = block_size
        self.token_embedding = Embedding(vocab_size, emb_dim)
        self.position_embedding = PositionalEmbedding(block_size, emb_dim)
        self.blocks = [TransformerBlock(emb_dim, num_heads) for _ in range(num_layers)]
        self.ln_f = LayerNorm(emb_dim)
        self.lm_head = Linear(emb_dim, vocab_size)

    def forward(self, idx):
        """Return next-token logits for a (B, T) tensor of token ids.

        Returns:
            The output of ``lm_head`` applied at every position; with the
            layers built in ``__init__`` this has shape (B, T, vocab_size).

        Raises:
            ValueError: if T exceeds ``block_size``; there are only
                ``block_size`` position embeddings to add.
        """
        _, T = idx.shape
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )
        x = self.token_embedding(idx) + self.position_embedding(idx)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        return logits

    def __call__(self, idx):
        return self.forward(idx)

    def parameters(self):
        """Return all trainable tensors: embeddings, blocks, final norm, head."""
        return (
            self.token_embedding.parameters() +
            self.position_embedding.parameters() +
            [p for block in self.blocks for p in block.parameters()] +
            self.ln_f.parameters() +
            self.lm_head.parameters()
        )

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` ids and append them to ``idx``.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of sampling steps to run.

        Returns:
            A (B, T + max_new_tokens) tensor of token ids.
        """
        # Sampling never needs gradients; without this every step would build
        # and hold an autograd graph through the whole model.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop the context to the last block_size tokens.
                idx_cond = idx[:, -self.block_size:]
                logits = self(idx_cond)
                logits = logits[:, -1, :]  # keep the last position: (B, C)
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)

        return idx



In [8]:
class Adam:
    """Hand-written Adam optimiser operating on a list of tensors.

    Args:
        parameters: iterable of tensors to update; materialised into a list.
        lr: step size.
        betas: decay rates for the first- and second-moment running averages.
        eps: constant added to the denominator of the update.
    """

    def __init__(self, parameters, lr=1e-3, betas=(0.9, 0.999), eps=1e-8):
        self.parameters = list(parameters)
        self.lr = lr
        self.beta1, self.beta2 = betas[0], betas[1]
        self.eps = eps
        # Per-parameter running averages of the gradient and squared gradient.
        self.m = [torch.zeros_like(p) for p in self.parameters]
        self.v = [torch.zeros_like(p) for p in self.parameters]
        self.t = 0  # number of step() calls so far

    def step(self):
        """Apply one Adam update to every parameter that has a gradient."""
        self.t += 1
        # The bias-correction denominators depend only on the step count.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        for slot, param in enumerate(self.parameters):
            grad = param.grad
            if grad is None:
                continue

            first = self.beta1 * self.m[slot] + (1 - self.beta1) * grad
            second = self.beta2 * self.v[slot] + (1 - self.beta2) * (grad * grad)
            self.m[slot], self.v[slot] = first, second

            m_hat = first / correction1
            v_hat = second / correction2

            # Write through .data so the update itself is not tracked by autograd.
            param.data -= self.lr * m_hat / (v_hat.sqrt() + self.eps)

    def zero_grad(self):
        """Reset every existing gradient to zero in place."""
        for param in self.parameters:
            grad = param.grad
            if grad is not None:
                grad.zero_()


In [9]:
# --- Model size ---
emb_dim = 128       # embedding / feature width passed to GPT
num_heads = 8       # attention heads per transformer block
num_layers = 8      # number of transformer blocks

# --- Batching ---
block_size = 512    # context length; overrides the earlier value of 16
batch_size = 32

# --- Loop settings ---
# NOTE: the visible training cell does not read these three names; it
# hard-codes 10000 iterations and prints every 200 steps.
max_iters = 100
eval_interval = 500
eval_iters = 100



In [10]:
model = GPT(
    emb_dim=emb_dim,
    vocab_size=vocab_size,
    block_size=block_size,
    num_heads=num_heads,
    num_layers=num_layers,
)

# The hand-written layers create their tensors without a device argument or
# requires_grad, so each parameter is flagged for autograd and its storage is
# moved to the target device here.
for param in model.parameters():
    param.requires_grad_()
    param.data = param.data.to(device)


def count_parameters(params):
    """Return the total number of scalar elements across the tensors in ``params``."""
    total = 0
    for tensor in params:
        total += tensor.numel()
    return total


total_params = count_parameters(model.parameters())
print(total_params)

# Loss history; the training cell appends one entry per iteration.
losses = []

1659392


In [20]:


optimizer = Adam(model.parameters(), lr=3e-4)

# `step` rather than `iter`, so the builtin iter() is not shadowed for the rest
# of the notebook.
for step in range(10000):
    # Sample a batch of inputs and next-token targets, each of shape [B, T].
    x, y = get_batchsplit('train')

    # Forward pass: flatten batch and time so cross_entropy sees one
    # prediction per row.
    logits = model(x)
    B, T, C = logits.shape
    logits = logits.view(B * T, C)
    y = y.view(B * T)
    loss = F.cross_entropy(logits, y)

    # Store a detached copy. Appending `loss` itself would keep every
    # iteration's autograd graph reachable for as long as the list lives.
    losses.append(loss.detach())

    # Backward pass and parameter update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 200 == 0:
        print(f"Iter {step}, Loss: {loss.item():.4f}")


Iter 0, Loss: 2.5068
Iter 200, Loss: 2.4745
Iter 400, Loss: 2.3945
Iter 600, Loss: 2.2893
Iter 800, Loss: 2.1912
Iter 1000, Loss: 2.0636
Iter 1200, Loss: 1.9769
Iter 1400, Loss: 1.8761
Iter 1600, Loss: 1.8469
Iter 1800, Loss: 1.7397
Iter 2000, Loss: 1.6389
Iter 2200, Loss: 1.6444
Iter 2400, Loss: 1.5863
Iter 2600, Loss: 1.5659
Iter 2800, Loss: 1.5383
Iter 3000, Loss: 1.5020
Iter 3200, Loss: 1.4986
Iter 3400, Loss: 1.4500
Iter 3600, Loss: 1.4509
Iter 3800, Loss: 1.4364
Iter 4000, Loss: 1.4685
Iter 4200, Loss: 1.4185
Iter 4400, Loss: 1.3815
Iter 4600, Loss: 1.3893
Iter 4800, Loss: 1.3916
Iter 5000, Loss: 1.3976
Iter 5200, Loss: 1.3412
Iter 5400, Loss: 1.3700
Iter 5600, Loss: 1.3192
Iter 5800, Loss: 1.3399
Iter 6000, Loss: 1.2879
Iter 6200, Loss: 1.2491
Iter 6400, Loss: 1.2903
Iter 6600, Loss: 1.2709
Iter 6800, Loss: 1.2697
Iter 7000, Loss: 1.3162
Iter 7200, Loss: 1.2563
Iter 7400, Loss: 1.2304
Iter 7600, Loss: 1.2557
Iter 7800, Loss: 1.2609
Iter 8000, Loss: 1.2458
Iter 8200, Loss: 1.2561

In [29]:
# Start from a single token with id 0, sample 500 further tokens, and print
# the decoded text of the first (only) sequence in the batch.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist()))



If I detem way: but it not yet,
Though and mah outral of mile even my height.

LEONTES:
From heart and her honour more!
I' do up't: but your into would ado,
Mistress to accompy we where these fairies
Which have the usurper knew of all little not.
Seek how I heard thee use 'timonsted's, prevented
Making the patricious for a facest
Must of nice pursue of himself; if these prime in
This enough numberfullness, is dishinings till
To this billout and my noble sole out
The moves bloody of the traitor o
