# Bigram Language Model
### inspired by Andrej Karpathy's tutorial & the paper "Attention Is All You Need"

In [107]:
import torch 
import numpy as np
import time
import os
device='cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [64]:
start_time=time.time()
#operations related to matrices
zeroes=torch.zeros(1,1)
end_time=time.time()
end_time-start_time

0.0

In [65]:
with open('wiz_of_oz.txt','r',encoding='utf-8') as f:
    text=f.read()

In [66]:
chars=sorted(list(set(text)))
vocab_size=len(chars)
print(''.join(chars))
vocab_size


 !"&'(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZabcdefghijklmnopqrstuvwxyz﻿


76

In [100]:
n_embed=32

In [68]:
str_to_int={ch:i for i,ch in enumerate(chars)}
int_to_str={i:ch for i,ch in enumerate(chars)}
encode= lambda s: [str_to_int[c] for c in s]
decode= lambda l: ''.join([int_to_str[i]for i in l])

print(decode(encode('hello')))

hello


In [69]:
data=torch.tensor(encode(text),dtype=torch.long)
print(data)

tensor([75, 39, 66,  ..., 69, 61, 10])


In [70]:
n= int(0.9* len(data))
train_data= data[:n]
val_data=data[n:]

In [71]:
block_size=8
train_data[:block_size+1]

tensor([75, 39, 66, 63, 52, 69, 51, 53, 52])

In [72]:
x=train_data[:block_size]
y=train_data[1:block_size+1]
for t in range(block_size):
    context=x[:t+1]
    target=y[t]
    print(f'when i/p is {context} the target is {target}')

when i/p is tensor([75]) the target is 39
when i/p is tensor([75, 39]) the target is 66
when i/p is tensor([75, 39, 66]) the target is 63
when i/p is tensor([75, 39, 66, 63]) the target is 52
when i/p is tensor([75, 39, 66, 63, 52]) the target is 69
when i/p is tensor([75, 39, 66, 63, 52, 69]) the target is 51
when i/p is tensor([75, 39, 66, 63, 52, 69, 51]) the target is 53
when i/p is tensor([75, 39, 66, 63, 52, 69, 51, 53]) the target is 52


In [73]:
torch.manual_seed(1337)
batch_size=4 #for parallel processing by gpu
block_size=8 # maximum context length
def get_batch(split):
    #generate small batch of data of inputs x and targets y
    data= train_data if split == 'train' else val_data
    ix=torch.randint(len(data)-block_size,(batch_size,))
    x= torch.stack([data[i:i+block_size]for i in ix])
    y=torch.stack([data[i+1:i+block_size+1]for i in ix])
    return x,y


xb,yb= get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

for b in range(batch_size):# batch dimension
    for t in range(block_size):# time aspect
        context= xb[b, :t+1]
        target = yb[b,t]
        print(f'when input is{context.tolist()} the target: {target}')
        

inputs:
torch.Size([4, 8])
tensor([[52,  1, 56, 57, 67,  1, 56, 53],
        [ 1, 51, 49, 62,  5, 68,  1, 59],
        [51, 63, 70, 53, 66, 53, 52,  1],
        [56, 53, 66, 53,  1, 57, 67,  1]])
targets:
torch.Size([4, 8])
tensor([[ 1, 56, 57, 67,  1, 56, 53, 49],
        [51, 49, 62,  5, 68,  1, 59, 53],
        [63, 70, 53, 66, 53, 52,  1, 56],
        [53, 66, 53,  1, 57, 67,  1, 49]])
when input is[52] the target: 1
when input is[52, 1] the target: 56
when input is[52, 1, 56] the target: 57
when input is[52, 1, 56, 57] the target: 67
when input is[52, 1, 56, 57, 67] the target: 1
when input is[52, 1, 56, 57, 67, 1] the target: 56
when input is[52, 1, 56, 57, 67, 1, 56] the target: 53
when input is[52, 1, 56, 57, 67, 1, 56, 53] the target: 49
when input is[1] the target: 51
when input is[1, 51] the target: 49
when input is[1, 51, 49] the target: 62
when input is[1, 51, 49, 62] the target: 5
when input is[1, 51, 49, 62, 5] the target: 68
when input is[1, 51, 49, 62, 5, 68] the targe

In [74]:
print(xb)#i/p

tensor([[52,  1, 56, 57, 67,  1, 56, 53],
        [ 1, 51, 49, 62,  5, 68,  1, 59],
        [51, 63, 70, 53, 66, 53, 52,  1],
        [56, 53, 66, 53,  1, 57, 67,  1]])


In [112]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Each token reads off the logits for the next token from the lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        self.position_embedding_table = nn.Embedding(100, vocab_size)
        self.lm_head = nn.Linear(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):
        idx = idx.to(device).long()
        tok_emb = self.token_embedding_table(idx)  # (Batch, Time, Channel)
        B, T, C = tok_emb.shape
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (Time, Channel)
        x = tok_emb + pos_emb  # (B, T, C)
        logits = self.lm_head(x)  # (Batch, Time, Vocab_size)
        
        if targets is not None:
            targets = targets.to(device).view(B * T)
            logits = logits.view(B * T, C)
            loss = F.cross_entropy(logits, targets)
            return logits, loss
        
        return logits

    def generate(self, idx, max_new_tokens):
        idx = idx.to(device).long()
        for _ in range(max_new_tokens):
            logits = self(idx)  # (B, T, C)
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # Append to the sequence
        return idx

# Sample input data
#vocab_size = 100
#B, T = 4, 8
#xb = torch.randint(0, vocab_size, (B, T)).to(device)
#yb = torch.randint(0, vocab_size, (B, T)).to(device)

# Initialize the model
m = BigramLanguageModel(vocab_size).to(device)

# Forward pass
logits, loss = m(xb, yb)
print(f"Logits shape: {logits.shape}")  # Should print: torch.Size([32, 100])
print(f"Loss: {loss}")  # Should print the loss value

# Generate new tokens
idx = torch.zeros(1, 1, dtype=torch.long).to(device)
generated = m.generate(idx, max_new_tokens=100)
print(f"Generated sequence: {generated}")


Logits shape: torch.Size([32, 100])
Loss: 5.012180805206299
Generated sequence: tensor([[ 0, 67,  9, 75, 73, 62, 66, 16, 49, 30,  7, 90, 26, 61, 41, 53, 61, 78,
          6, 38, 67, 68, 27, 24, 66, 90,  6, 62, 20, 58, 23,  0, 48, 10, 94, 19,
         69, 73, 88, 90, 14, 18, 55, 13, 92, 87, 10, 36, 76, 31, 60, 45, 27,  7,
          9, 72, 31, 25,  5, 90, 12, 92, 24,  9, 39, 33, 46, 57, 40, 37, 88, 69,
         77, 58, 54, 80, 26, 26, 96, 68, 55, 41, 14,  6,  4, 60,  8, 58, 71, 72,
         56,  1, 42, 36,  9,  2, 16, 20, 39, 66, 43]], device='cuda:0')


In [76]:
#training the model
#pytorch optimizer
optimizer= torch. optim.AdamW(m.parameters(), lr=1e-3)

In [77]:
batch_size=32
for steps in range (100000):
    # sample a batch of data
    xb,yb = get_batch('train')

    # eval of loss
    logits, loss= m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.4316916465759277


In [79]:
#for personal reference to check performance
print("\nfor personal reference:")
#creating a 1X1 tensor and generating 400 tokens
print(decode(m.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_tokens=400)[0].tolist()))



for personal reference:

pthent thored, onoorex uld ens t. s
the ttyeere ndrod an cees, l bimepe be ce adother anten thowad ofo o a he aree rs aithy
" ispe tereno


oue s m.

"I'mbarea prno sly or licley jonid end y, hespoct He rd id-ho drothelamerke blthe pat ings


warairerdornt ppilvan Fiss. tinser epleas.
" k rde oved tiza evererithar be u je-Buapeace

usand spempsisemplled hethe arimare be t,"Wimeeahonesthan's pls ld


## Math trick for self-attention


In [80]:
#set up a means for tokens to communicate
torch.manual_seed(1337)

B,T,C = 4,8,2 # batch time and channels
x = torch.randn(B,T,C)
x.shape


torch.Size([4, 8, 2])

In [81]:
# to get x[b,t] = avg(mean {i<=t} x[b,i])

xbow = torch.zeros((B,T,C))     #bag of words|| avg
for b in range(B):
    for t in range(T):
        xprev= x[b,:t+1] #(t,C)
        xbow[b,t]= torch.mean(xprev,0)

In [82]:
#using matrix multiplication
torch.manual_seed(1337)
a= torch.tril(torch.ones(3,3))
a= a/ torch.sum(a,1,keepdim=True)
b=torch.randint(0,10,(3,2)).float()
c=a@b
print(f"a:\n{a}\n--")
print(f"b:\n{b}\n--")
print(f"c:\n{c}\n--")

a:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b:
tensor([[5., 7.],
        [2., 0.],
        [5., 3.]])
--
c:
tensor([[5.0000, 7.0000],
        [3.5000, 3.5000],
        [4.0000, 3.3333]])
--


In [95]:
#version 2:

wei= torch.tril(torch.ones(T,T))
wei = wei/ wei.sum(1, keepdim=True)
xbow2= wei @ x # (T,T) @ (B,T,C)---->(b,t,c)
#------>torch.allclose(xbow,xbow2)

In [96]:
#version 3: use softmax

tril=torch.tril(torch.ones(T,T))
wei= torch.zeros((T,T))
wei= wei.masked_fill(tril==0, float('inf'))
wei=F.softmax(wei,dim=-1)
xbow3=wei@x

In [122]:
# version 4: self-attention
torch.manual_seed(1337)

B,T,C=4,8,32 # batch, time and channels
x=torch.randn(B,T,C)

#a single head's performance in self attention
head_size=16
key= nn.Linear(C,head_size, bias=False)
query= nn.Linear(C,head_size, bias=False)
value= nn.Linear(C,head_size, bias=False)
k=key(x) #(B,T,16)
q=query(x) #(B,T,16)

wei= q@ k.transpose(-2,-1) #-->(B,T,16) @ (B,16,T)-->(B,T,T))


tril=torch.tril(torch.ones(T,T))
#wei=torch.zeros((T,T))
wei=wei.masked_fill(tril==0, float('-inf'))
wei=F.softmax(wei,dim=-1)
v=value(x)
out=wei@v
#out= wei@ x
out. shape

torch.Size([4, 8, 16])

In [120]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [121]:
wei

tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089

In [123]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [124]:
k.var()

tensor(1.0449)

In [125]:
q.var()

tensor(1.0700)

In [126]:
wei.var()

tensor(1.0918)

In [127]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [128]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [129]:
class LayerNorm1d: # implementing Layernorm

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) #mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalise to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [130]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [131]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))

# FINAL CODE 
## [the code you are meant to run]

In [136]:

import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
batch_size = 16
block_size = 32
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
embedding_dim = 64
num_heads = 4
num_layers = 4
dropout_rate = 0.0

# Set the random seed for reproducibility
torch.manual_seed(1337)

# Load dataset
with open('wiz_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character mappings
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

# Prepare train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
train_size = int(0.9 * len(data))
train_data = data[:train_size]
val_data = data[train_size:]

# Batch generation function
def get_batch(split):
    data = train_data if split == 'train' else val_data
    indices = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in indices])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in indices])
    return x.to(device), y.to(device)

# Estimate loss function
@torch.no_grad()
def estimate_loss():
    model.eval()
    losses = {'train': torch.zeros(eval_iters), 'val': torch.zeros(eval_iters)}
    for split in ['train', 'val']:
        for i in range(eval_iters):
            x, y = get_batch(split)
            logits, loss = model(x, y)
            losses[split][i] = loss.item()
    model.train()
    return {split: losses[split].mean() for split in losses}

# Self-Attention Head
class SelfAttentionHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(embedding_dim, head_size, bias=False)
        self.query = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)
        self.tril = torch.tril(torch.ones(block_size, block_size)).to(device)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        batch_size, time_steps, channels = x.shape
        k = self.key(x)
        q = self.query(x)
        weights = (q @ k.transpose(-2, -1)) * (channels ** -0.5)
        weights = weights.masked_fill(self.tril[:time_steps, :time_steps] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)
        v = self.value(x)
        return weights @ v

# Multi-Head Self-Attention
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([SelfAttentionHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(embedding_dim, embedding_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        return self.dropout(self.proj(out))

# Feed-Forward Network
class FeedForwardNetwork(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
            nn.Dropout(dropout_rate)
        )

    def forward(self, x):
        return self.net(x)

# Transformer Block
class TransformerBlock(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        head_size = embedding_dim // num_heads
        self.self_attention = MultiHeadSelfAttention(num_heads, head_size)
        self.feed_forward = FeedForwardNetwork(embedding_dim)
        self.layer_norm1 = nn.LayerNorm(embedding_dim)
        self.layer_norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.self_attention(self.layer_norm1(x))
        x = x + self.feed_forward(self.layer_norm2(x))
        return x

# Bigram Language Model
class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dim)
        self.position_embedding_table = nn.Embedding(block_size, embedding_dim)
        self.transformer_blocks = nn.Sequential(*[TransformerBlock(embedding_dim, num_heads) for _ in range(num_layers)])
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.language_model_head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, idx, targets=None):
        batch_size, time_steps = idx.shape
        token_embeddings = self.token_embedding_table(idx)
        position_embeddings = self.position_embedding_table(torch.arange(time_steps, device=device))
        x = token_embeddings + position_embeddings
        x = self.transformer_blocks(x)
        x = self.layer_norm(x)
        logits = self.language_model_head(x)

        if targets is None:
            return logits, None
        else:
            batch_size, time_steps, vocab_size = logits.shape
            logits = logits.view(batch_size * time_steps, vocab_size)
            targets = targets.view(batch_size * time_steps)
            loss = F.cross_entropy(logits, targets)
            return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

# Initialize the model and optimizer
model = BigramLanguageModel().to(device)
print(f'{sum(p.numel() for p in model.parameters()) / 1e6:.2f}M parameters')
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop
for iteration in range(max_iters):
    if iteration % eval_interval == 0 or iteration == max_iters - 1:
        losses = estimate_loss()
        print(f"Step {iteration}: Train loss {losses['train']:.4f}, Val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Generate text
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = decode(model.generate(context, max_new_tokens=2000)[0].tolist())
print(generated_text)


0.21M parameters
Step 0: Train loss 4.4973, Val loss 4.4888
Step 100: Train loss 2.6073, Val loss 2.6630
Step 200: Train loss 2.4463, Val loss 2.5023
Step 300: Train loss 2.3445, Val loss 2.3964
Step 400: Train loss 2.2577, Val loss 2.3099
Step 500: Train loss 2.1838, Val loss 2.2463
Step 600: Train loss 2.1262, Val loss 2.2024
Step 700: Train loss 2.0722, Val loss 2.1367
Step 800: Train loss 2.0028, Val loss 2.0807
Step 900: Train loss 1.9794, Val loss 2.0561
Step 1000: Train loss 1.9372, Val loss 2.0013
Step 1100: Train loss 1.8832, Val loss 1.9698
Step 1200: Train loss 1.8532, Val loss 1.9302
Step 1300: Train loss 1.8295, Val loss 1.9204
Step 1400: Train loss 1.8140, Val loss 1.8986
Step 1500: Train loss 1.7878, Val loss 1.8772
Step 1600: Train loss 1.7385, Val loss 1.8421
Step 1700: Train loss 1.7317, Val loss 1.8360
Step 1800: Train loss 1.7152, Val loss 1.8261
Step 1900: Train loss 1.6907, Val loss 1.7994
Step 2000: Train loss 1.6811, Val loss 1.7834
Step 2100: Train loss 1.6706,

## the text is gonna be nonsensical to read, don't worry