In [1]:
with open("../data/input.txt", "r") as f:
    text = f.read()

print(f"Total number of characters in the input text: {len(text)}")
print("The first 100 characters are:")
print(text[:100])

Total number of characters in the input text: 1115394
The first 100 characters are:
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [2]:
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print("Vocab size:", vocab_size)
print("Vocab:", vocab)

Vocab size: 65
Vocab: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [3]:
# Building a "tokenizer"
char_to_ind = { v:i for i,v in enumerate(vocab) }
ind_to_char = {i:v for i,v in enumerate(vocab) }

#print(char_to_ind)
#print(ind_to_char)

def encode(text: str) -> list[int]:
    return [char_to_ind[c] for c in text]

def decode(tokens: list[int]) -> str:
    return "".join([ind_to_char[i] for i in tokens])

test_phrase="Hallo World"
assert decode(encode(test_phrase)) == test_phrase

In [4]:
tokens = encode(text)

In [5]:
import torch
data = torch.tensor(tokens, dtype=torch.long)
train_data = data[:int(0.9*len(data))]
val_data = data[int(0.9*len(data)):]

assert len(data) == len(train_data) + len(val_data)

  data = torch.tensor(tokens, dtype=torch.long)


In [23]:
batch_size = 32
context_size = 8

def get_batch(split:str):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data)-batch_size,(batch_size,))
    x = torch.stack([data[i:i+context_size] for i in ix])
    y = torch.stack([data[i+1:i+context_size+1] for i in ix])
    return x,y

In [29]:
x, y = get_batch("train")
print(x.shape, y.shape)

for i in range(x.shape[1]):
    print(decode(x[0,0:i+1].tolist()) + " -> " + decode([y[0,i].item()]))

torch.Size([32, 8]) torch.Size([32, 8])
u -> n
un -> g
ung ->  
ung  -> w
ung w -> a
ung wa -> v
ung wav -> e
ung wave -> r


In [18]:
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigrammModel(nn.Module):
    def __init__(self, vocab_size: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, tokens, targets=None):
        logits = self.embedding(tokens)
        loss = None
        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets=targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, tokens, max_new_tokens):

        for _ in range(max_new_tokens):
            logits, loss = self(tokens)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs,num_samples=1)
            tokens = torch.cat((tokens, next_token), dim=1)
        return tokens



In [39]:
m = BigrammModel(vocab_size)

x,y = get_batch('train')

print("x:", x.shape)
print("y:", y.shape)

y_hat, loss = m(x,y)
print("y_hat:", y_hat.shape)
print("loss:", loss)

x: torch.Size([32, 8])
y: torch.Size([32, 8])
y_hat: torch.Size([256, 65])
loss: tensor(4.8180, grad_fn=<NllLossBackward0>)


In [40]:
tokens = m.generate(torch.zeros(1,1,dtype=torch.long), 10)
print(decode(tokens.tolist()[0]))


XnWY,bNmZp


In [41]:
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

In [43]:
for steps in range(10000):
    x,y = get_batch("train")
    y_hat, loss = m(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 1000 == 0:
        print(f"loss at step {steps} is {loss.item()}")
print(loss.item())

loss at step 0 is 3.824690818786621
loss at step 1000 is 3.085151195526123
loss at step 2000 is 2.7088260650634766
loss at step 3000 is 2.7192509174346924
loss at step 4000 is 2.536932945251465
loss at step 5000 is 2.5350654125213623
loss at step 6000 is 2.5420680046081543
loss at step 7000 is 2.6122944355010986
loss at step 8000 is 2.5074551105499268
loss at step 9000 is 2.424015760421753
2.3745944499969482


In [45]:
print(decode(m.generate(tokens=torch.zeros([1,1],dtype=torch.long), max_new_tokens=300)[0].tolist()))



e bu r t n caondo t sers allet.

Fint,
Frest; tofo youn wans mar KI ws, t ILamanitizils CHO:
SThe Pers? mefea!
HUSO lita h ly.

Th mo gomaveld cand;
Fllou cowhou tok d Fokima hon ASOPushay t Wh, Plonole s forengme ngha r alarn yos tue s
Whatoe o:
LII Inqunoryou
in, sthermeeyorcant o,
Ore at t ld, b


In [49]:
class BigrammModel2(nn.Module):
    def __init__(self, vocab_size: int, n_embed: int):
        super().__init__()
        self.vocab_size = vocab_size
        self.n_embed = n_embed
        self.embedding = nn.Embedding(vocab_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, tokens, targets=None):
        token_emb = self.embedding(tokens)
        logits = self.lm_head(token_emb)
        loss = None
        if targets is not None:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets=targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, tokens, max_new_tokens):

        for _ in range(max_new_tokens):
            logits, loss = self(tokens)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs,num_samples=1)
            tokens = torch.cat((tokens, next_token), dim=1)
        return tokens



In [50]:
m = BigrammModel2(vocab_size, 32)
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

for steps in range(10000):
    x,y = get_batch("train")
    y_hat, loss = m(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 1000 == 0:
        print(f"loss at step {steps} is {loss.item()}")
print("Final loss:", loss.item())

print(decode(m.generate(tokens=torch.zeros([1,1],dtype=torch.long), max_new_tokens=300)[0].tolist()))

loss at step 0 is 4.3424906730651855
loss at step 1000 is 2.510611057281494
loss at step 2000 is 2.580935001373291
loss at step 3000 is 2.460279941558838
loss at step 4000 is 2.39436411857605
loss at step 5000 is 2.4070651531219482
loss at step 6000 is 2.4641172885894775
loss at step 7000 is 2.5510239601135254
loss at step 8000 is 2.4255409240722656
loss at step 9000 is 2.580385208129883
Final loss: 2.551116704940796

NAnd de, pe itithecurerang,
TES:

TRCHAn enssir hiererardar I nd Cofomputrog thiousthavim te uenoustitha all, aye t es, Sadesopesshetechehe
EY lldanveehiomamou, h ard ys,
AUSeothith d?
ULIABue brth'd hindg y
Geaifl itent bllor hin urot m.
S:
CHef t y merd inecthow, t ' ke Plf lm at, se te l's, tspll
