In [8]:
# some important imports
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Read the whole corpus into memory as one string.
with open("dataset.txt", "r", encoding='utf-8') as file:
    text = file.read()

# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

1115393


In [None]:
def tokenize(string):
    """Encode `string` as a list of integer ids, one per character.

    Each id is the character's position in the module-level `chars` vocabulary.

    Args:
        string: text to encode; every character must appear in `chars`.

    Returns:
        A list of ints with the same length as `string`.

    Raises:
        ValueError: if a character of `string` is not in `chars`.
    """
    # Build the char -> id table once per call instead of running a linear
    # `chars.index(c)` scan for every character. Rebuilding it here (rather than
    # caching it at module level) keeps it in sync if `chars` is redefined.
    # `chars` is expected to hold unique characters, so each one has a single id.
    char_to_id = {c: i for i, c in enumerate(chars)}
    try:
        return [char_to_id[c] for c in string]
    except KeyError as err:
        # Keep the exception type that `list.index` raised for unknown characters.
        raise ValueError(f"{err.args[0]!r} is not in the vocabulary") from None
def decode(l):
    """Turn an iterable of token ids back into text.

    Each id is used as an index into the module-level `chars` vocabulary and
    the resulting characters are concatenated in order.
    """
    pieces = []
    for token_id in l:
        pieces.append(chars[token_id])
    return "".join(pieces)

# Encode the full corpus with tokenize() and store the ids as a torch.long tensor.
data = torch.tensor(tokenize(text), dtype=torch.long)
print(data.shape, data.dtype)
# Preview the first 1000 token ids.
print(data[:1000])

In [11]:
# Contiguous split of the token stream: the first 90% goes to train_data,
# the remaining 10% to val_data. Both sides use the same cut index.
train_data = data[:int(0.9*len(data))]
val_data = data[int(0.9*len(data)):]

In [12]:
# Module-level sampling sizes. get_batch() takes parameters with these same
# names, which shadow these globals inside the function.
seq_len = 8
batch_size = 8

def get_batch(batch_size, seq_len, split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        batch_size: number of windows to draw.
        seq_len: length of each window.
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of stacked windows, each with `batch_size` rows of
        `seq_len` tokens. `y` is `x` shifted one position to the right in the
        source sequence, i.e. the next token for every position of `x`.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # The exclusive upper bound leaves room for the one-step-shifted target window.
    starts = torch.randint(len(source) - seq_len, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + seq_len])
        targets.append(source[start + 1:start + seq_len + 1])
    return torch.stack(inputs), torch.stack(targets)

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# Fix the RNG seed so the random toy tensor below is reproducible.
torch.manual_seed(1337)
# Toy dimensions; they are used directly as the shape of x.
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)

In [35]:
# Lower-triangular matrix of ones: row t has ones in columns 0..t.
wei = torch.tril(torch.ones(T, T))
# Normalize each row to sum to 1, so row t is a uniform average over positions 0..t.
wei = wei / wei.sum(1, keepdim=True)
# (T, T) @ (B, T, C) broadcasts over the leading dimension of x: each position
# becomes the mean of itself and all earlier positions.
xbow2 = wei @ x

In [36]:
# Same averaging as a masked softmax.
tril = torch.tril(torch.ones(T, T))
# Start from all-zero scores.
wei = torch.zeros(T, T)
# Entries above the diagonal become -inf so softmax gives them zero weight.
wei = wei.masked_fill(tril == 0, float('-inf'))
# Softmax over a row of equal (zero) scores is a uniform average over the unmasked positions.
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

In [37]:
# Check that the normalized-tril and masked-softmax results agree numerically.
torch.allclose(xbow2, xbow3)

True

In [None]:
class SelfAttention(nn.Module):
    """A single head of causal (masked) scaled dot-product self-attention.

    Args:
        head_size: output width of the key, query and value projections.
        n_embd: width of the last dimension of the input.
        seq_len: side length of the causal-mask buffer. forward() slices the
            mask to the actual sequence length T, so inputs need T <= seq_len.
    """

    def __init__(self, head_size, n_embd, seq_len):
        super().__init__()

        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular ones, stored as a buffer (not a parameter) so it
        # follows the module across devices without being trained.
        self.register_buffer('tril', torch.tril(torch.ones(seq_len, seq_len)))

    def forward(self, x):
        """Apply causal self-attention to `x`.

        Args:
            x: tensor of shape (B, T, n_embd).

        Returns:
            Tensor of shape (B, T, head_size). Position t is a weighted sum of
            the value vectors at positions 0..t.
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # Scale by 1/sqrt(d_k), where d_k is the key/query width (head_size).
        # Using the input embedding width here under-scales the scores whenever
        # head_size != n_embd, which is always the case inside multi-head attention.
        scale = k.shape[-1] ** -0.5
        affinities = q @ k.transpose(-2, -1) * scale  # (B, T, T)

        # Causal mask: a position must not attend to later positions.
        affinities = affinities.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        affinities = F.softmax(affinities, dim=-1)

        out = affinities @ v
        return out

class MultiHeadAttention(nn.Module):
    """Run several SelfAttention heads side by side and concatenate their outputs.

    Each head gets width `n_embd // n_heads`, so the concatenated output has
    width `n_heads * (n_embd // n_heads)`. That equals `n_embd` only when
    `n_embd` is divisible by `n_heads`.
    """

    def __init__(self, n_embd, seq_len, n_heads):
        super().__init__()
        per_head = n_embd // n_heads
        heads = []
        for _ in range(n_heads):
            heads.append(SelfAttention(per_head, n_embd, seq_len))
        self.heads = nn.ModuleList(heads)

    def forward(self, x):
        """Apply every head to `x` and join the results along the last dimension."""
        head_outputs = [head(x) for head in self.heads]
        return torch.cat(head_outputs, dim=-1)

class MLP(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear -> Dropout.

    Args:
        n_embd: input and output width; the hidden layer is 4 * n_embd wide.
        dropout: drop probability for the final Dropout layer.
    """

    def __init__(self, n_embd, dropout):
        super().__init__()
        hidden_dim = 4 * n_embd
        expand = nn.Linear(n_embd, hidden_dim)
        contract = nn.Linear(hidden_dim, n_embd)
        self.net = nn.Sequential(expand, nn.ReLU(), contract, nn.Dropout(dropout))

    def forward(self, x):
        """Run `x` through the feed-forward stack; the last dimension stays n_embd."""
        return self.net(x)