In [50]:
import torch
import torch.nn as nn 
import torch.nn.functional as F 

In [51]:
torch.__version__


'2.3.1'

In [52]:
#hypyer parameters 
batch_size = 64
block_size = 256
max_iters = 5000

eval_interval = 500
learning_rate = 3e-4
device ='cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

n_embd = 384
n_head = 6 
n_layer = 6
dropout = 0.2

In [53]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7f7c2754eb10>

In [54]:
with open('../input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [55]:
# crate vocabulary list (sorted)
char = sorted(list(set(text)))
vocab_size = len(char)
vocab_size

65

In [56]:
stoi = {char: i for i, char in enumerate(char)}
itos = {i: char for i, char in enumerate(char)}


In [57]:
encoding = lambda word: [stoi[i] for i in word]
# use '' to concatenate str 
decoding = lambda number: ''.join(itos[i] for i in number)

In [58]:
#training ans tesing split 
data = torch.tensor(encoding(text), dtype=torch.int64)
n = int(len(data) * 0.9)
train_data = data[:n]
val_data = data[n:]

In [59]:
# Data loading 
# Batch in training and validation
def get_batch(set_name):
    data = (train_data if set_name == 'train' else  val_data)
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    x = torch.stack([data[i: i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    x, y = x.to(device), y.to(device)
    
    return x, y

In [60]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out 
        

In [61]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        B, T, C = x.shape
#         X W + b 
        k = self.key(x)
        q = self.query(x)
        # attention score 
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # aiming to control it between 0 - 1 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        # weighted sum ---? weights @ value (x)
        out = wei @ v # (B, T, hs)
        return out

In [62]:
#multihead attention
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init_()
        # define a group of heads (nn.ModuleList including multiple head instance)
        self.heads = nn.ModuleList([Head(head_size)for _ in range(num_heads)])
        
        self.proj = nn.Linear(head_size * num_heads, n_embd) 
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        out = torcg.cat([h(x) for h in self.heads], dim=-1)
        # here's how we make num_heads * head_size to -->>>>> n_embd 
        out = self.dropout(selfl.proj(out))

In [63]:
# FeedForward 
class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropoutp)
        )
    def forward(self, x):
        return self.net(x)


In [64]:
#leogo multihead + feedforward  --> one complete transformer block! 
class Block(nn.Module):
    #transformer block 
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [65]:
class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # unpacking then  Sequential can eat all of it!
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_emb)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        self.apply(self._ini_weights)
        
    def _ini_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstancens(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, idx, target=None):
        B, T = idx.shape 
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
         x = tok_emb + pos_emb
            
        
        
        

SyntaxError: incomplete input (1465324524.py, line 15)

In [104]:
a = torch.empty(2, 3, 4)
b = torch.empty(3, 4)
torch.nn.init.uniform_(a, 0, 1)
torch.nn.init.uniform_(b, 0, 1)
print(a, '\n'*3, b, '\n'*3, a+b, '\n'*3, (a+b).shape)

tensor([[[0.4133, 0.0743, 0.4261, 0.0756],
         [0.4052, 0.8919, 0.2820, 0.1234],
         [0.0215, 0.5440, 0.1508, 0.9835]],

        [[0.5190, 0.9077, 0.6784, 0.6193],
         [0.6125, 0.4988, 0.8030, 0.7649],
         [0.2671, 0.7841, 0.3953, 0.7965]]]) 


 tensor([[0.0754, 0.9921, 0.4319, 0.4778],
        [0.2967, 0.1423, 0.1599, 0.1825],
        [0.5124, 0.7816, 0.0497, 0.8778]]) 


 tensor([[[0.4888, 1.0663, 0.8580, 0.5534],
         [0.7019, 1.0343, 0.4419, 0.3059],
         [0.5339, 1.3256, 0.2005, 1.8613]],

        [[0.5944, 1.8998, 1.1103, 1.0971],
         [0.9092, 0.6412, 0.9629, 0.9474],
         [0.7795, 1.5657, 0.4449, 1.6742]]]) 


 torch.Size([2, 3, 4])


In [105]:
a = torch.ones(2, 3, 4)
b = torch.ones(3, 4)
print(a, '\n'*3, b, '\n'*3, a+b, '\n'*3, (a+b).shape)

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]]) 


 tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]) 


 tensor([[[2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.]],

        [[2., 2., 2., 2.],
         [2., 2., 2., 2.],
         [2., 2., 2., 2.]]]) 


 torch.Size([2, 3, 4])


In [92]:
jem = nn.Embedding(4, 5)  # T, emb_length
T = 4
jynxzz_pos = jem(torch.arange(T))
print(jynxzz_pos.shape)
print(jynxzz_pos)

torch.Size([4, 5])
tensor([[ 1.5377,  0.0980,  1.1364,  0.9033,  0.7364],
        [-0.8446, -0.5388, -0.4207,  0.2035,  2.5754],
        [-0.3707, -1.8258,  1.3161,  0.5602, -1.1064],
        [-0.9140, -0.6755, -0.4562,  0.4358, -1.3870]],
       grad_fn=<EmbeddingBackward0>)


In [93]:
nn.Embedding(4, 5)(torch.arange(T))

tensor([[-1.7444,  0.1363,  0.9441, -2.5530, -0.5702],
        [ 0.9616,  0.3674, -0.9939, -2.2958,  1.3728],
        [-1.6306, -1.3637, -1.3968, -1.0523, -1.0860],
        [-0.2111,  2.0888, -0.2270, -1.0792, -0.7351]],
       grad_fn=<EmbeddingBackward0>)

In [97]:
jynxzz_pos.shape #(T, C)

torch.Size([4, 5])

In [46]:
# embedding x --> B, T, C
