## Import Libraries and load data

In [41]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import plotly.express as px
import plotly.graph_objects as go
import math

In [42]:
text_dataset_path = '/Users/alijanatiidr/Desktop/Prog/Projects/mini_gpt/ML_mini_gpt_data.md'
text_dataset = open(text_dataset_path, 'r').read()

text_dataset

"Principal Component Analysis (PCA) is a widely used technique in the field of statistics and machine learning for dimensionality reduction and data visualization. It is particularly useful when dealing with high-dimensional data, where the number of features or variables is large. PCA works by transforming the original features of the data into a new set of orthogonal (uncorrelated) features called principal components, which are linear combinations of the original features. These principal components capture the most significant patterns in the data.\n\nHow does PCA work?\n\nStep 1: Standardize the Data\nBefore applying PCA, it is essential to standardize the data by subtracting the mean and dividing by the standard deviation of each feature. Standardization ensures that all features have the same scale, which is a prerequisite for PCA.\n\nWhy is standardization necessary before performing PCA?\n\nStep 2: Compute the Covariance Matrix\nNext, PCA calculates the covariance matrix of th

## Exploratory data analysis

In [43]:
# get number of words in the dataset
words = text_dataset.split()
print("Number of words in the dataset:", len(words))

# get number of characters in the dataset
characters = list(text_dataset)
print("Number of characters in the dataset:", len(characters))

Number of words in the dataset: 1774
Number of characters in the dataset: 11676


In [44]:
# getting unique characters in the dataset
set_characters = set(characters)
set_characters

{'\n',
 ' ',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [45]:
len(set_characters)

64

## Data preprocessing

In [46]:
# Tokenizing characters
def tokenize(text):
    return list(text)

In [47]:
tokenized_text = tokenize(text_dataset)

In [48]:
# getting vocabulary characters
def vocabulary(text):
    return set(tokenize(text))

In [49]:
vocabulary = vocabulary(text_dataset)

In [50]:
#defining char2ind dictionnary
def char2ind(vocabulary):
    dict = {}
    dict['<pad>'] = 0
    dict['<unk>'] = 1
    i = 2
    for char in vocabulary:
        dict[char] = i
        i += 1
    return dict

In [51]:
char2ind = char2ind(vocabulary)

In [52]:
len(char2ind)

66

In [53]:
vocab_size = len(char2ind)

In [54]:
# defining ind2char dictionnary as the inverse of char2ind dictionnary
def ind2char(char2ind):
    dict = {}
    for key, value in char2ind.items():
        dict[value] = key
    return dict

In [55]:
ind2char = ind2char(char2ind)

In [56]:
# defining encoding function
def encode(text, char2ind):
    return [char2ind[char] for char in text]

# defining decoding function
def decode(encoded_text, ind2char):
    return ''.join([ind2char[ind] for ind in encoded_text])

In [57]:
encode('hii my name is Ali', char2ind)

[49, 11, 11, 65, 51, 13, 65, 3, 46, 51, 6, 65, 11, 64, 65, 61, 62, 11]

In [58]:
decode([4, 58, 58, 55, 48, 42, 55, 51, 40, 48, 6, 55, 58, 37, 55, 50, 12, 58], ind2char)

"xcc?u(?m4ue?c'?vbc"

In [59]:
data = torch.tensor(encode(text_dataset, char2ind), dtype=torch.long)

In [60]:
n = len(data)
train_data = data[0:int(n*0.9)]
val_data = data[int(n*0.9):]

In [61]:
torch.manual_seed(42)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [62]:
xb, yb = get_batch('train')
print('xb:', xb)
print('yb:', yb)

xb: tensor([[20, 65, 53, 49,  6, 13, 65, 38],
        [65, 46, 29, 29, 62, 11, 58, 46],
        [ 3, 59, 65, 46, 62, 59, 21, 31],
        [21, 62, 62, 21, 38, 64, 33,  8]])
yb: tensor([[65, 53, 49,  6, 13, 65, 38, 21],
        [46, 29, 29, 62, 11, 58, 46, 41],
        [59, 65, 46, 62, 59, 21, 31, 11],
        [62, 62, 21, 38, 64, 33,  8,  8]])


## Modelization

### Baseline: Bigram Language Model:

In [63]:
class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    
    def forward(self, idx, targets=None):
        
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size*block_size)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    
    def generate(self, idx, max_new_tokens):
        
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        
        return idx

In [64]:
m = BigramLanguageModel(vocab_size)

output, loss = m(xb, yb)

output, loss

(tensor([[ 0.4425,  0.1942, -0.5000,  ..., -1.3185,  1.1874,  0.0276],
         [-1.3393, -0.4502, -0.1935,  ..., -0.4996,  1.1848, -1.1461],
         [ 0.6526, -0.2256,  0.8545,  ...,  1.2022,  0.5476, -0.5097],
         ...,
         [-0.7873, -0.1165,  1.8994,  ..., -0.8241,  0.5042, -1.0075],
         [ 0.3109, -0.1064, -1.2389,  ...,  0.3660, -0.6754, -0.3209],
         [ 0.3533, -2.6475, -1.4575,  ...,  0.7114, -0.9086,  0.3130]],
        grad_fn=<ViewBackward0>),
 tensor(4.6860, grad_fn=<NllLossBackward0>))

In [65]:
idx = torch.zeros((1, 1), dtype=torch.long)

decode(m.generate(idx, 10).numpy().tolist()[0], ind2char)

'<pad>T?LPn.nOb '

## Training model

In [66]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
steps = 20000

losses = []

for step in range(steps):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

In [67]:
# Plotting the loss
fig = px.line(x=np.arange(steps), y=losses, title='Loss over epochs')

fig.show()

In [68]:
# Doing prediction on new data point
input = 'what is a vector?'
input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
input_encoded = input_encoded.unsqueeze(0)

decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char)

'what is a vector?\n\n\nLEit rpresignvess'

## Adding fully connected layer to the baseline Bigram Language Model to get logits:

In [69]:
vocab_size = len(char2ind)
dim_embed = 32
class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size): # Getting logits as the input of a linear layer instead of looking them up in a logits embedding table.
        super().__init__() 
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed) 
        self.lm_head = nn.Linear(dim_embed, vocab_size)

    
    def forward(self, idx, targets=None):
        
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (batch_size, block_size, dim_embed)
        x = tok_emb # (batch_size, block_size, dim_embed)
        logits = self.lm_head(x) # (batch_size, block_size, vocab_size)

        if targets is None:
            loss = None
        
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size*block_size)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [70]:
# training the model
m = BigramLanguageModel(vocab_size)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
batch_size = 32
steps = 20000

losses = []

for step in range(steps):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

# Plotting the loss
fig = px.line(x=np.arange(steps), y=losses, title='Loss over epochs')

fig.show()

In [71]:
# Doing prediction on new data point
input = 'what is a vector?'

input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
input_encoded = input_encoded.unsqueeze(0)

decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char)

'what is a vector?\n\n\nWhonicon onder ci'

## Transformer decoder using single self attention:

In [72]:
head_size = 32

class Head(nn.Module):
    """One head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(dim_embed, head_size, bias=False)
        self.query = nn.Linear(dim_embed, head_size, bias=False)
        self.value = nn.Linear(dim_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

    def forward(self, x):
        batch_size, block_size, dim_embed = x.shape
        k = self.key(x) # (batch_size, block_size, head_size)
        q = self.query(x) # (batch_size, block_size, head_size)
        v = self.value(x) # (batch_size, block_size, head_size)
        w = q @ k.transpose(-2, -1) # (batch_size, block_size, block_size)
        w = w / math.sqrt(head_size)
        w = w.masked_fill(self.tril[:block_size, :block_size]==0, float('-inf'))
        w = F.softmax(w, dim=-1)
        y = w @ v # (batch_size, block_size, head_size)
        return y

class SingleHeadDecoder(nn.Module):
    """Decoder with a single layer of masked self-attention"""

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed)
        self.position_embedding_table = nn.Embedding(block_size, dim_embed)
        self.sa_head = Head(dim_embed)
        self.lm_head = nn.Linear(dim_embed, vocab_size)

    def forward(self, idx, targets=None):
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx) # (batch_size, block_size, dim_embed)
        pos_emb = self.position_embedding_table(torch.arange(block_size))  # (block_size, dim_embed)
        x = tok_emb + pos_emb # (batch_size, block_size, dim_embed)
        x = self.sa_head(x) # (batch_size, block_size, dim_embed)
        logits = self.lm_head(x) # (batch_size, block_size, vocab_size)

        if targets is None:
            loss = None
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size * block_size)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:] 
            logits, loss = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [73]:
input_encoded = torch.tensor([encode(input, char2ind)], dtype=torch.long)
input_encoded

tensor([[38, 49, 46, 41, 65, 11, 64, 65, 46, 65, 50,  6, 58, 41, 21, 31, 55]])

In [74]:
input_encoded[:, -block_size:]

tensor([[65, 50,  6, 58, 41, 21, 31, 55]])

In [75]:
position_embedding_table = nn.Embedding(block_size, dim_embed)
torch.arange(8)

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [76]:
position_embedding_table(torch.tensor([1])).shape

torch.Size([1, 32])

In [77]:
batch_size = 32
block_size = 8
max_iters = 5000
learning_rate = 1e-3
eval_interval = 500
eval_iters = 200
dim_embed = 32
head_size = dim_embed

m = SingleHeadDecoder()
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

losses = []
for step in range(max_iters):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if step % eval_interval == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
    if step % eval_iters == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
        input = 'what is a vector?'
        input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
        input_encoded = input_encoded.unsqueeze(0)
        print(decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char))

# Plotting the loss
fig = px.line(x=np.arange(max_iters), y=losses, title='Loss over epochs')

fig.show()

step: 0, loss: 4.1099934577941895
step: 0, loss: 4.1099934577941895
what is a vector?F;ypn;seopukM)'nTo
'
step: 200, loss: 2.941740036010742
what is a vector?s saliprs me 
H adea
step: 400, loss: 2.6006782054901123
what is a vector? s elre e -tcad bige
step: 500, loss: 2.64255428314209
step: 600, loss: 2.627239227294922
what is a vector?

' dretabrensa ya.

step: 800, loss: 2.472869396209717
what is a vector?

Wh en cal res toca
step: 1000, loss: 2.3977949619293213
step: 1000, loss: 2.3977949619293213
what is a vector?

PCCAe?
A cheant or
step: 1200, loss: 2.2406671047210693
what is a vector?


Nen Axfatren thet
step: 1400, loss: 2.2397966384887695
what is a vector?

StmoncAn, ly ces.

step: 1500, loss: 2.357679843902588
step: 1600, loss: 2.2306034564971924
what is a vector?

Us al. Theciompriv
step: 1800, loss: 2.2273221015930176
what is a vector?

Itres ion reofm. a
step: 2000, loss: 2.259688377380371
step: 2000, loss: 2.259688377380371
what is a vector?

Ad d ign?
CAdad mo
step: 220

In [195]:
test_string = "What is gradient descent?"
test_string_encoded = torch.tensor(encode(test_string, char2ind), dtype=torch.long)
test_string_encoded = test_string_encoded.unsqueeze(0)

decode(m.generate(test_string_encoded, 30).numpy().tolist()[0], ind2char)

'What is gradient descent?\nUplianuteptampts apathianduro'

## Transformer decoder using multi head self attention

In [78]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention"""
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) 

    def forward(self, x):
        y = torch.cat([head(x) for head in self.heads], dim=-1) # (batch_size, block_size, num_heads * head_size)
        return y

class FeedForward(nn.Module):
    """Feed Forward Layer"""

    def __init__(self, dim_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim_embed, dim_embed),
            nn.ReLU(),
        )

    def forward(self, x):
        return self.net(x)




class MultiHeadDecoder(nn.Module):
    """Decoder with multiple layers of masked self-attention"""

    def __init__(self, num_heads):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed) 
        self.position_embedding_table = nn.Embedding(block_size, dim_embed) 
        self.sa_head = MultiHeadAttention(num_heads, dim_embed) 
        self.ff_head = FeedForward(num_heads*dim_embed) 
        self.lm_head = nn.Linear(num_heads * dim_embed, vocab_size)

    def forward(self, idx, targets=None):
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx) # (batch_size, block_size, dim_embed)
        pos_emb = self.position_embedding_table(torch.arange(block_size)) # (block_size, dim_embed)
        x = tok_emb + pos_emb # (batch_size, block_size, dim_embed)
        x = self.sa_head(x) # (batch_size, block_size, num_heads * dim_embed)
        x = self.ff_head(x) # (batch_size, block_size, num_heads * dim_embed)
        logits = self.lm_head(x) # (batch_size, block_size, vocab_size)

        if targets is None:
            loss = None
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size * block_size)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [79]:
num_heads = 20
max_iters = 50000
m = MultiHeadDecoder(num_heads)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

losses = []

for step in range(max_iters):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if step % eval_interval == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
    if step % eval_iters == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
        input = 'what is a vector?'
        input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
        input_encoded = input_encoded.unsqueeze(0)
        print(decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char))

# Plotting the loss
fig = px.line(x=np.arange(max_iters), y=losses, title='Loss over epochs')

fig.show()

step: 0, loss: 4.234764575958252
step: 0, loss: 4.234764575958252
what is a vector?hiI obc?VxdUuCfRo.Nn
step: 200, loss: 2.0900380611419678
what is a vector? in ts armla ba)es.

step: 400, loss: 1.5220463275909424
what is a vector?ons.

Adam dospoles 
step: 500, loss: 1.6201518774032593
step: 600, loss: 1.539300799369812
what is a vector?

The for the aw dat
step: 800, loss: 1.4823873043060303
what is a vector?

Hows ca chesentil 
step: 1000, loss: 1.4612046480178833
step: 1000, loss: 1.4612046480178833
what is a vector?

What into acvely r
step: 1200, loss: 1.3517048358917236
what is a vector?ing are the formace 
step: 1400, loss: 1.172753095626831
what is a vector?

Adam (selecting a 
step: 1500, loss: 1.2027772665023804
step: 1600, loss: 1.1136609315872192
what is a vector?

Step 4: Seled eapp
step: 1800, loss: 1.0897972583770752
what is a vector?

Step 4: Selontinue
step: 2000, loss: 1.0532292127609253
step: 2000, loss: 1.0532292127609253
what is a vector?

Adam Optive opput 
step:

In [80]:
# validation loss
xb, yb = get_batch('val')
logits, val_loss = m(xb, yb)
val_loss

tensor(1.9697, grad_fn=<NllLossBackward0>)

In [81]:
test_string = "what is gradient boosting?"
test_string_encoded = torch.tensor(encode(test_string, char2ind), dtype=torch.long)

test_string_encoded = test_string_encoded.unsqueeze(0)

x = decode(m.generate(test_string_encoded, 30).numpy().tolist()[0], ind2char)
prompt = test_string 
completion = x[len(prompt):]

print('Prompt: {}'.format(prompt))

print('Completion: {}'.format(completion))

Prompt: what is gradient boosting?
Completion:  principal Components captured


In [113]:
batch_size = 32
block_size = 10
max_iters = 50000
learning_rate = 1e-3
eval_interval = 500
eval_iters = 200
dim_embed = 32
head_size = dim_embed
num_heads = 8
dropout = 0.1

class LayerNorm1d:
    """Layer Normalization"""

    def __init__(self, dim, eps=1e-5):
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        self.eps = eps

    def __call__(self, x):
        mean = x.mean(1, keepdim=True)
        std = x.var(1, keepdim=True)
        return self.gamma * (x - mean) / (std + self.eps) + self.beta
    
    def parameters(self):
        return [self.gamma, self.beta]

class Head(nn.Module):
    """One head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(dim_embed, head_size, bias=False)
        self.query = nn.Linear(dim_embed, head_size, bias=False)
        self.value = nn.Linear(dim_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

    def forward(self, x):
        batch_size, block_size, dim_embed = x.shape
        k = self.key(x) # (batch_size, block_size, head_size)
        q = self.query(x) # (batch_size, block_size, head_size)
        v = self.value(x) # (batch_size, block_size, head_size)
        w = q @ k.transpose(-2, -1) # (batch_size, block_size, block_size)
        w = w / math.sqrt(head_size)
        w = w.masked_fill(self.tril[:block_size, :block_size]==0, float('-inf'))
        w = F.softmax(w, dim=-1)
        y = w @ v # (batch_size, block_size, head_size) = (batch_size, block_size, dim_embed) because head_size = dim_embed
        return y

class MultiHeadAttention(nn.Module):
    """Multi-head attention"""
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)]) 

    def forward(self, x):
        y = torch.cat([head(x) for head in self.heads], dim=-1) # (batch_size, block_size, num_heads * head_size) = (batch_size, block_size, num_heads * dim_embed)
        return y

class FeedForward(nn.Module):
    """Feed Forward Layer"""

    def __init__(self, dim_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim_embed, dim_embed),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Transformer Block"""

    def __init__(self, dim_embed, num_heads):
        super().__init__()
        self.sa = MultiHeadAttention(num_heads, dim_embed//num_heads)
        self.ff = FeedForward(dim_embed)
        self.ln1 = nn.LayerNorm(dim_embed)
        self.ln2 = nn.LayerNorm(dim_embed)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # (batch_size, block_size, dim_embed)
        x = x + self.ff(self.ln2(x)) # (batch_size, block_size, dim_embed)
        return x

class TransformerBlocksDecoder(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed)
        self.position_embedding_table = nn.Embedding(block_size, dim_embed)
        self.blocks = nn.Sequential(
            Block(dim_embed, num_heads),
            Block(dim_embed, num_heads),
            Block(dim_embed, num_heads),
            Block(dim_embed, num_heads),
            nn.LayerNorm(dim_embed),
        )

        self.lm_head = nn.Linear(dim_embed, vocab_size)

    def forward(self, idx, targets=None):
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx) # (batch_size, block_size, dim_embed)
        pos_emb = self.position_embedding_table(torch.arange(block_size)) # (block_size, dim_embed)
        x = tok_emb + pos_emb # (batch_size, block_size, dim_embed)
        x = self.blocks(x) # (batch_size, block_size, dim_embed)
        logits = self.lm_head(x) # (batch_size, block_size, vocab_size)

        if targets is None:
            loss = None
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size * block_size)
            loss = F.cross_entropy(logits, targets)

        return logits, loss


    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

## Model with blocks of multi-head self attention + feedforward

In [114]:
m = TransformerBlocksDecoder()
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

losses = []

for step in range(max_iters):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if step % eval_interval == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
    if step % eval_iters == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
        input = 'what is a vector?'
        input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
        input_encoded = input_encoded.unsqueeze(0)
        print(decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char))

# Plotting the loss
fig = px.line(x=np.arange(max_iters), y=losses, title='Loss over epochs')

fig.show()

step: 0, loss: 4.296077728271484
step: 0, loss: 4.296077728271484
what is a vector?4Ter( 0ybiGuD ,,uCR;
step: 200, loss: 2.650601625442505
what is a vector?R,iyioc on ru
U
keof
step: 400, loss: 2.4217638969421387
what is a vector?zVt eserebre totabre
step: 500, loss: 2.292729139328003
step: 600, loss: 2.1181447505950928
what is a vector?umindse btesou. tigo
step: 800, loss: 2.171861410140991
what is a vector?
Staptaion of  to Th
step: 1000, loss: 1.9693801403045654
step: 1000, loss: 1.9693801403045654
what is a vector?onaspons canunt core
step: 1200, loss: 2.0822787284851074
what is a vector?imation mapubmof ear
step: 1400, loss: 1.7072126865386963
what is a vector?

Peand (ASced Prtea
step: 1500, loss: 1.8652311563491821
step: 1600, loss: 1.773660659790039
what is a vector?

Bangterte?

Whatas
step: 1800, loss: 1.820784568786621
what is a vector?on Prese-pines the s
step: 2000, loss: 1.8420097827911377
step: 2000, loss: 1.8420097827911377
what is a vector?

Teredect andd eabu
step: 22

In [118]:
# get latest loss and get the validation loss
xb, yb = get_batch('val')
logits, val_loss = m(xb, yb)

val_loss, losses[-1]

(tensor(1.8518, grad_fn=<NllLossBackward0>), 0.8350008726119995)

In [122]:
test_string = "Why is it important to sort eigenvalues?"
test_string_encoded = torch.tensor(encode(test_string, char2ind), dtype=torch.long)
test_string_encoded = test_string_encoded.unsqueeze(0)

decode(m.generate(test_string_encoded, 50).numpy().tolist()[0], ind2char)

'Why is it important to sort eigenvalues?\n\nUnderstanding the bias orthogon algorithm the co'

In [123]:
# number of parameters
sum([p.numel() for p in m.parameters()])

21698