In [1]:
with open ('input.txt') as f:
    text = f.read()

In [None]:
# Word Embedding
vocab_size = len(set(text))
string_to_integers = {char:i for i,char in enumerate(set(text))}
integers_to_string = {i:char for i,char in enumerate(set(text))}

encode = lambda x: [string_to_integers[char] for char in x]
decode = lambda x: ''.join([integers_to_string[i] for i in x])

[37, 42, 55, 55, 54]
Hello


In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

data = torch.tensor(encode(text))
data

tensor([10, 12, 57,  ...,  2, 17, 16])

In [18]:
split = int(0.9 * len(data))
train_data, val_data = data[:split], data[split:]

In [20]:
batch_size = 4
block_size = 8
def get_batch(split):
    data = train_data if split == "train" else val_data
    idx = torch.randint(len(data)-block_size, (batch_size,))

    x = torch.stack([data[i : i+block_size] for i in idx])
    y = torch.stack([data[i+1 : i+block_size+1] for i in idx])
    return x, y

xb,yb = get_batch("train")
xb, yb


(tensor([[ 1, 42, 55, 45, 39, 16,  4, 42],
         [15, 40, 50,  3, 15, 53, 58, 40],
         [40, 50, 15, 50, 12,  1, 15, 52],
         [50, 58, 55, 55, 15, 53, 42, 15]]),
 tensor([[42, 55, 45, 39, 16,  4, 42, 40],
         [40, 50,  3, 15, 53, 58, 40, 40],
         [50, 15, 50, 12,  1, 15, 52, 50],
         [58, 55, 55, 15, 53, 42, 15, 54]]))

In [57]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None): 
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
        
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print("Logits -->", logits.shape)
print("Loss -->", loss.data)


idx = torch.zeros((1, 1), dtype=torch.long)
print('\nGenerated Text -->',decode(model.generate(idx, 100)[0].tolist()))

Logits --> torch.Size([32, 65])
Loss --> tensor(4.2446)

Generated Text --> YlIhbjU3ux
QgFVLIjIdwzPeLIlXk:NS.wmjMx:''R:$TE,Mw-DACoelQ,QCoTBWZbEs?pmIE'ha.-au?GULI,h;NaXN.
Q&wXpEk


In [60]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

batch_size = 32

for i in range(10001):
    xb,yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i % 1000 == 0:
        print(f"Loss at {i} step: {loss.item()}")

Loss at 0 step: 2.4603488445281982
Loss at 1000 step: 2.461993455886841
Loss at 2000 step: 2.468071699142456
Loss at 3000 step: 2.438849687576294
Loss at 4000 step: 2.7277064323425293
Loss at 5000 step: 2.525634765625
Loss at 6000 step: 2.4847307205200195
Loss at 7000 step: 2.4815597534179688
Loss at 8000 step: 2.513474702835083
Loss at 9000 step: 2.43355131149292
Loss at 10000 step: 2.547548532485962


In [62]:
print('\nGenerated Text -->',decode(model.generate(idx, 1000)[0].tolist()))


Generated Text --> YOLOLAns k ond t bo prerts cuno-me MELIS:

NELL: mive, won hacame I d wnyoy.
Te mem thoong G meyor?
Thanestoorarg tlow t htis moorere? baskeind in t es, ES:
Fore hest by, mespef we.
Bokse. y llittrgan mpen spumafficand ailinowine, tsunonarssbrt me an ns ter he thencongory more Hat oto h, eathod; intheeanounat I iso t m Je.
I mor thiracunghithyspotit GHOf' a mereyer ovore
d gule,
MICapotes s bent
Pomy by ire,
Th ry 's.

INouralis ce the indy bauncthelk my, he wer
Thakioro VIst fad o, r w, ct.
My't ofed ss ve nd II:
IOFote vespe, maindrf the g aterdell m tomasen vitonou f t therore w lasar?
taint s CUMEL:

ARor g wese BE atof plelealrt dy NCUCK:
TABr.

AR:
Lane, y momy umanoowand, ONCE a m th, ad thet, are my,
Thatomeano,-br.

Fatht, batouof atharsay.
t

GLEOut awe.
GA enererveid dstharond wns t nds, mbormyours st qunds t iave s;
Ay:
SO:
Hos h toug rinon ghacaletind air veacr be
ABYos ethee'seeast whio y il, y ais veno ter chan d sexe thathondinld athy, pre fall irowa

In [69]:
eval_iters = 200
@torch.no_grad()

def estimateloss():
    output = {}
    for split in ('train', 'val'):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch(split)
            logits, loss = model(x, y)
            losses[k] = loss.item()
        output[split] = losses.mean()
    return output

In [73]:
n_embed = 32
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.q = nn.Linear(n_embed, head_size, bias=False) 
        self.k = nn.Linear(n_embed, head_size, bias=False) 
        self.v = nn.Linear(n_embed, head_size, bias=False) 
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B,T,C = x.shape
        q = self.q(x)
        k = self.k(x)
        weight = q @ k.transpose(-2,-1)/ (C**0.5)
        weight = weight.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        weight = F.softmax(weight, dim=-1)
        v = self.v(x)
        output = weight @ v
        return output
 

In [76]:
# Using Self - Attention

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.sa_head = Head(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)
    
    def forward(self, idx, targets=None): 
        B,T = idx.shape 
        token_embed = self.token_embedding_table(idx)
        position_embed = self.position_embedding_table(torch.arange(T))
        x = token_embed + position_embed
        x = self.sa_head(x)
    
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
        
model = BigramLanguageModel(vocab_size)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for iter in range(6001):
    if iter % 500 == 0:
        losses = estimateloss()   
        print(f"iter: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
    
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

context = torch.zeros((1,1), dtype=torch.long)
print('\nGenerated Text -->')
print(decode(model.generate(context, max_new_tokens=1000)[0].tolist()))

iter: 0, train loss: 4.144351959228516, val loss: 4.1462721824646
iter: 500, train loss: 2.670701026916504, val loss: 2.6704397201538086
iter: 1000, train loss: 2.5157458782196045, val loss: 2.5134007930755615
iter: 1500, train loss: 2.465735912322998, val loss: 2.488044261932373
iter: 2000, train loss: 2.4485418796539307, val loss: 2.450505256652832
iter: 2500, train loss: 2.417445421218872, val loss: 2.4401233196258545
iter: 3000, train loss: 2.4144177436828613, val loss: 2.4353299140930176
iter: 3500, train loss: 2.3961381912231445, val loss: 2.410214424133301
iter: 4000, train loss: 2.38267183303833, val loss: 2.4129457473754883
iter: 4500, train loss: 2.3866395950317383, val loss: 2.406050205230713
iter: 5000, train loss: 2.384495973587036, val loss: 2.3933897018432617
iter: 5500, train loss: 2.3681063652038574, val loss: 2.398341655731201
iter: 6000, train loss: 2.382218599319458, val loss: 2.3885114192962646

Generated Text -->
Yos,
Bousr yeee owit-3-hy orof se panto chind ste w

In [78]:
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)

In [79]:
# Using MultiHead Attention
# Using Self - Attention

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.sa_head = MultiHeadAttention(4, n_embed//4)
        self.lm_head = nn.Linear(n_embed, vocab_size)
    
    def forward(self, idx, targets=None): 
        B,T = idx.shape 
        token_embed = self.token_embedding_table(idx)
        position_embed = self.position_embedding_table(torch.arange(T))
        x = token_embed + position_embed
        x = self.sa_head(x)
    
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
        
model = BigramLanguageModel(vocab_size)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for iter in range(6001):
    if iter % 500 == 0:
        losses = estimateloss()   
        print(f"iter: {iter}, train loss: {losses['train']}, val loss: {losses['val']}")
    
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

context = torch.zeros((1,1), dtype=torch.long)
print('\nGenerated Text -->')
print(decode(model.generate(context, max_new_tokens=1000)[0].tolist()))

iter: 0, train loss: 4.264521598815918, val loss: 4.258363723754883
iter: 500, train loss: 2.6452715396881104, val loss: 2.6466567516326904
iter: 1000, train loss: 2.4621636867523193, val loss: 2.456190586090088
iter: 1500, train loss: 2.3988797664642334, val loss: 2.4059195518493652
iter: 2000, train loss: 2.3525640964508057, val loss: 2.371034860610962
iter: 2500, train loss: 2.3161797523498535, val loss: 2.334740400314331
iter: 3000, train loss: 2.294144630432129, val loss: 2.316192388534546
iter: 3500, train loss: 2.271512031555176, val loss: 2.2918736934661865
iter: 4000, train loss: 2.258976697921753, val loss: 2.286926031112671
iter: 4500, train loss: 2.2420597076416016, val loss: 2.290806770324707
iter: 5000, train loss: 2.2474329471588135, val loss: 2.2719106674194336
iter: 5500, train loss: 2.235119581222534, val loss: 2.2514097690582275
iter: 6000, train loss: 2.2345893383026123, val loss: 2.265702724456787

Generated Text -->
YGlor he ist, the heple, the beffor onve,
Thar n