## Import Libraries and load data

In [224]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import plotly.express as px
import plotly.graph_objects as go
import math


In [225]:
text_dataset_path = '/Users/alijanatiidr/Desktop/ML_mini_gpt_data.md'
text_dataset = open(text_dataset_path, 'r').read()

text_dataset

"Principal Component Analysis (PCA) is a widely used technique in the field of statistics and machine learning for dimensionality reduction and data visualization. It is particularly useful when dealing with high-dimensional data, where the number of features or variables is large. PCA works by transforming the original features of the data into a new set of orthogonal (uncorrelated) features called principal components, which are linear combinations of the original features. These principal components capture the most significant patterns in the data.\n\nHow does PCA work?\n\nStep 1: Standardize the Data\nBefore applying PCA, it is essential to standardize the data by subtracting the mean and dividing by the standard deviation of each feature. Standardization ensures that all features have the same scale, which is a prerequisite for PCA.\n\nWhy is standardization necessary before performing PCA?\n\nStep 2: Compute the Covariance Matrix\nNext, PCA calculates the covariance matrix of th

## Exploratory data analysis

In [226]:
# get number of words in the dataset
words = text_dataset.split()
print("Number of words in the dataset:", len(words))

# get number of characters in the dataset
characters = list(text_dataset)
print("Number of characters in the dataset:", len(characters))

Number of words in the dataset: 1774
Number of characters in the dataset: 11676


In [227]:
# getting unique characters in the dataset
set_characters = set(characters)
set_characters

{'\n',
 ' ',
 "'",
 '(',
 ')',
 ',',
 '-',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'Y',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [228]:
len(set_characters)

64

## Data preprocessing

In [229]:
# Tokenizing characters
def tokenize(text):
    return list(text)

In [230]:
tokenized_text = tokenize(text_dataset)

In [231]:
# getting vocabulary characters
def vocabulary(text):
    return set(tokenize(text))

In [232]:
vocabulary = vocabulary(text_dataset)

In [233]:
#defining char2ind dictionnary
def char2ind(vocabulary):
    dict = {}
    dict['<pad>'] = 0
    dict['<unk>'] = 1
    i = 2
    for char in vocabulary:
        dict[char] = i
        i += 1
    return dict

In [234]:
char2ind = char2ind(vocabulary)

In [235]:
len(char2ind)

66

In [236]:
vocab_size = len(char2ind)

In [237]:
# defining ind2char dictionnary as the inverse of char2ind dictionnary
def ind2char(char2ind):
    dict = {}
    for key, value in char2ind.items():
        dict[value] = key
    return dict

In [238]:
ind2char = ind2char(char2ind)

In [239]:
# defining encoding function
def encode(text, char2ind):
    return [char2ind[char] for char in text]

# defining decoding function
def decode(encoded_text, ind2char):
    return ''.join([ind2char[ind] for ind in encoded_text])

In [240]:
encode('hii my name is Ali', char2ind)

[4, 58, 58, 55, 48, 42, 55, 51, 40, 48, 6, 55, 58, 37, 55, 50, 12, 58]

In [241]:
decode([4, 58, 58, 55, 48, 42, 55, 51, 40, 48, 6, 55, 58, 37, 55, 50, 12, 58], ind2char)

'hii my name is Ali'

In [242]:
data = torch.tensor(encode(text_dataset, char2ind), dtype=torch.long)

In [243]:
n = len(data)
train_data = data[0:int(n*0.9)]
val_data = data[int(n*0.9):]

In [244]:
torch.manual_seed(42)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [245]:
xb, yb = get_batch('train')
print('xb:', xb)
print('yb:', yb)

xb: tensor([[54, 55, 27,  4,  6, 42, 55, 43],
        [55, 40, 62, 62, 12, 58, 25, 40],
        [51, 10, 55, 40, 12, 10,  3, 34],
        [ 3, 12, 12,  3, 43, 37, 28, 38]])
yb: tensor([[55, 27,  4,  6, 42, 55, 43,  3],
        [40, 62, 62, 12, 58, 25, 40, 64],
        [10, 55, 40, 12, 10,  3, 34, 58],
        [12, 12,  3, 43, 37, 28, 38, 38]])


## Modelization

### Baseline: Bigram Language Model:

In [246]:
class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    
    def forward(self, idx, targets=None):
        
        logits = self.token_embedding_table(idx)

        if targets is None:
            loss = None
        
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size*block_size)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    
    def generate(self, idx, max_new_tokens):
        
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        
        return idx

In [247]:
m = BigramLanguageModel(vocab_size)

output, loss = m(xb, yb)

output, loss

(tensor([[-1.8309e+00, -9.7959e-01,  1.5799e+00,  ...,  9.0619e-01,
          -1.4567e+00,  1.2727e+00],
         [-4.5077e-01,  5.0381e-01, -2.0755e+00,  ...,  4.3554e-01,
           1.2247e+00,  5.1886e-01],
         [-1.4647e+00, -3.0163e-02,  3.9969e-01,  ...,  4.4142e-01,
           1.6581e+00,  2.5313e-01],
         ...,
         [-9.9675e-01, -6.0219e-01,  3.2189e-01,  ...,  6.1104e-01,
           6.6490e-01, -2.4996e-01],
         [-1.0176e-01, -9.6634e-01, -1.4232e+00,  ..., -9.3004e-01,
          -2.8298e-01,  6.6000e-01],
         [ 2.1618e-01, -6.1230e-01, -2.5462e-01,  ...,  1.7409e+00,
           4.6923e-05,  5.6878e-01]], grad_fn=<ViewBackward0>),
 tensor(4.6203, grad_fn=<NllLossBackward0>))

In [248]:
idx = torch.zeros((1, 1), dtype=torch.long)

decode(m.generate(idx, 10).numpy().tolist()[0], ind2char)

'<pad>L F.oDoYlH'

## Training model

In [249]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
steps = 20000

losses = []

for step in range(steps):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

In [250]:
# Plotting the loss
fig = px.line(x=np.arange(steps), y=losses, title='Loss over epochs')

fig.show()

In [112]:
# Doing prediction on new data point
input = 'what is a vector?'
input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
input_encoded = input_encoded.unsqueeze(0)

decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char)

'what is a vector?\nUngo ak pthe hes ue'

## Self attention

In [114]:
batch_size = 4
block_size = 8
vocab_size = 2

x = torch.randn((batch_size, block_size, vocab_size))
x.shape

torch.Size([4, 8, 2])

In [123]:
x

tensor([[[-0.0470,  1.1231],
         [-0.3980,  0.4761],
         [ 0.7145, -0.0154],
         [ 1.5968, -0.3504],
         [ 0.1245,  1.4490],
         [-1.1434, -0.1307],
         [-0.9633,  1.4046],
         [-0.9711,  2.0414]],

        [[-0.3421, -0.8216],
         [-1.0037,  0.7118],
         [ 0.8081, -0.5612],
         [ 1.3075,  2.0661],
         [ 0.3007,  0.1488],
         [ 1.0561, -0.0376],
         [-1.8802,  0.8287],
         [ 0.5677, -0.0795]],

        [[-0.7784, -0.5206],
         [-0.0158, -1.1109],
         [-2.3689, -0.7339],
         [-3.7241,  2.1162],
         [-0.2957, -1.3169],
         [ 0.8085, -0.3373],
         [ 0.4601, -0.1192],
         [-0.5604,  0.2683]],

        [[-0.2962, -0.2525],
         [ 0.6893, -0.2628],
         [-0.3736,  0.7189],
         [-1.2256, -1.2476],
         [ 0.1402, -0.3484],
         [-1.5517,  0.3469],
         [ 0.4018, -0.5623],
         [-1.4200,  1.6965]]])

In [128]:
# In each batch, we average each token embedding with the previous token embeddings
xbow = torch.zeros((batch_size, block_size, vocab_size))
for n_batch in range(batch_size):
    for token in range(block_size):
        x_prev = x[n_batch, :token+1, :]
        xbow[n_batch, token, :] = x_prev.mean(dim=0)
print(xbow)

# Efficient way to do the same thing
wei = torch.tril(torch.ones((block_size, block_size)))
wei = wei/wei.sum(dim=1, keepdim=True)
xbow = wei @ x
print(xbow)

tensor([[[-0.0470,  1.1231],
         [-0.2225,  0.7996],
         [ 0.0898,  0.5279],
         [ 0.4666,  0.3083],
         [ 0.3981,  0.5365],
         [ 0.1412,  0.4253],
         [-0.0166,  0.5652],
         [-0.1359,  0.7497]],

        [[-0.3421, -0.8216],
         [-0.6729, -0.0549],
         [-0.1793, -0.2237],
         [ 0.1924,  0.3488],
         [ 0.2141,  0.3088],
         [ 0.3544,  0.2510],
         [ 0.0352,  0.3336],
         [ 0.1018,  0.2819]],

        [[-0.7784, -0.5206],
         [-0.3971, -0.8158],
         [-1.0544, -0.7885],
         [-1.7218, -0.0623],
         [-1.4366, -0.3132],
         [-1.0624, -0.3172],
         [-0.8449, -0.2890],
         [-0.8093, -0.2193]],

        [[-0.2962, -0.2525],
         [ 0.1966, -0.2576],
         [ 0.0065,  0.0679],
         [-0.3015, -0.2610],
         [-0.2132, -0.2785],
         [-0.4363, -0.1742],
         [-0.3165, -0.2297],
         [-0.4545,  0.0111]]])
tensor([[[-0.0470,  1.1231],
         [-0.2225,  0.7996],
      

## Adding fully connected layer to the baseline Bigram Language Model to get logits:

In [255]:
vocab_size = len(char2ind)
dim_embed = 32
class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size): # Getting logits as the input of a linear layer instead of looking them up in a logits embedding table.
        super().__init__() 
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed) 
        self.lm_head = nn.Linear(dim_embed, vocab_size)

    
    def forward(self, idx, targets=None):
        
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx)  # (batch_size, block_size, dim_embed)
        x = tok_emb #+ pos_emb # (batch_size, block_size, dim_embed)
        logits = self.lm_head(x) # (batch_size, block_size, vocab_size)

        if targets is None:
            loss = None
        
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size*block_size)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss

    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [256]:
# training the model
m = BigramLanguageModel(vocab_size)
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
batch_size = 32
steps = 20000

losses = []

for step in range(steps):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())

# Plotting the loss
fig = px.line(x=np.arange(steps), y=losses, title='Loss over epochs')

fig.show()

In [258]:
# Doing prediction on new data point
input = 'what is a vector?'

input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
input_encoded = input_encoded.unsqueeze(0)

decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char)

'what is a vector?\n\nIfe che ring t te '

In [None]:
tril = torch.tril(torch.ones((block_size, block_size)))
wei = torch.zeros((block_size, block_size))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

## Transformer decoder using single self attention:

In [184]:
class Head(nn.Module):
    """One head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(dim_embed, head_size, bias=False)
        self.query = nn.Linear(dim_embed, head_size, bias=False)
        self.value = nn.Linear(dim_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))

    def forward(self, x):
        batch_size, block_size, dim_embed = x.shape
        k = self.key(x) # (batch_size, block_size, head_size)
        q = self.query(x) # (batch_size, block_size, head_size)
        v = self.value(x) # (batch_size, block_size, head_size)
        w = q @ k.transpose(-2, -1) # (batch_size, block_size, block_size)
        w = w / math.sqrt(head_size)
        w = w.masked_fill(self.tril[:block_size, :block_size]==0, float('-inf'))
        w = F.softmax(w, dim=-1)
        y = w @ v
        return y

class SingleHeadDecoder(nn.Module):
    """Decoder with a single layer of masked self-attention"""

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed)
        self.position_embedding_table = nn.Embedding(block_size, dim_embed)
        self.sa_head = Head(dim_embed)
        self.lm_head = nn.Linear(dim_embed, vocab_size)

    def forward(self, idx, targets=None):
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(block_size))
        x = tok_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size * block_size)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [186]:
batch_size = 32
block_size = 8
max_iters = 5000
learning_rate = 1e-3
eval_interval = 500
eval_iters = 200
dim_embed = 32
head_size = dim_embed

m = SingleHeadDecoder()
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

losses = []
for step in range(max_iters):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if step % eval_interval == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
    if step % eval_iters == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
        input = 'what is a vector?'
        input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
        input_encoded = input_encoded.unsqueeze(0)
        print(decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char))

# Plotting the loss
fig = px.line(x=np.arange(max_iters), y=losses, title='Loss over epochs')

fig.show()

step: 0, loss: 4.213343620300293
step: 0, loss: 4.213343620300293
what is a vector?aCFEBqVuf'wq)qVrk?<unk>V
step: 200, loss: 2.91680908203125
what is a vector?tiutlsaubn fmyac fiu
step: 400, loss: 2.7141788005828857
what is a vector?3inmht alfer pentsab
step: 500, loss: 2.646226644515991
step: 600, loss: 2.5328330993652344
what is a vector??yyhesuvedogion s th
step: 800, loss: 2.3713560104370117
what is a vector?

Wyi afersme prabim
step: 1000, loss: 2.375694513320923
step: 1000, loss: 2.375694513320923
what is a vector?

nte thet cit f nfp
step: 1200, loss: 2.315241575241089
what is a vector?

W Ahesimatheus, to
step: 1400, loss: 2.3304522037506104
what is a vector?




Al:
'cis atend 
step: 1500, loss: 2.3942909240722656
step: 1600, loss: 2.278327703475952
what is a vector?

The prerionture or
step: 1800, loss: 2.2478461265563965
what is a vector?

Whhy Hou printisep
step: 2000, loss: 2.2064731121063232
step: 2000, loss: 2.2064731121063232
what is a vector?

Tuh ePAt is.

Con 
step: 

In [195]:
test_string = "What is gradient descent?"
test_string_encoded = torch.tensor(encode(test_string, char2ind), dtype=torch.long)
test_string_encoded = test_string_encoded.unsqueeze(0)

decode(m.generate(test_string_encoded, 30).numpy().tolist()[0], ind2char)

'What is gradient descent?\nUplianuteptampts apathianduro'

## Transformer decoder using multi head self attention

In [197]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention"""
    
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

    def forward(self, x):
        y = torch.cat([head(x) for head in self.heads], dim=-1)
        return y

class MultiHeadDecoder(nn.Module):
    """Decoder with multiple layers of masked self-attention"""

    def __init__(self, num_heads):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, dim_embed)
        self.position_embedding_table = nn.Embedding(block_size, dim_embed)
        self.sa_head = MultiHeadAttention(num_heads, dim_embed)
        self.lm_head = nn.Linear(num_heads * dim_embed, vocab_size)

    def forward(self, idx, targets=None):
        batch_size, block_size = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(block_size))
        x = tok_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batch_size, block_size, vocab_size = logits.shape
            logits = logits.view(batch_size * block_size, vocab_size)
            targets = targets.view(batch_size * block_size)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [214]:
num_heads = 20
max_iters = 50000
m = MultiHeadDecoder(num_heads)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

losses = []

for step in range(max_iters):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if step % eval_interval == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
    if step % eval_iters == 0:
        print('step: {}, loss: {}'.format(step, loss.item()))
        input = 'what is a vector?'
        input_encoded = torch.tensor(encode(input, char2ind), dtype=torch.long)
        input_encoded = input_encoded.unsqueeze(0)
        print(decode(m.generate(input_encoded, 20).numpy().tolist()[0], ind2char))

# Plotting the loss
fig = px.line(x=np.arange(max_iters), y=losses, title='Loss over epochs')

fig.show()

step: 0, loss: 4.281468868255615
step: 0, loss: 4.281468868255615
what is a vector?5gwFt<unk>yu,ly1wpl)d3Ol
step: 200, loss: 2.072388172149658
what is a vector?renpgrandis. athe .e
step: 400, loss: 1.98736572265625
what is a vector?mationt ho sutpreanc
step: 500, loss: 1.8767958879470825
step: 600, loss: 1.7356830835342407
what is a vector?


Ahe coGe the dugd
step: 800, loss: 1.7489986419677734
what is a vector?

Ssp, the ut Re byw
step: 1000, loss: 1.7035306692123413
step: 1000, loss: 1.7035306692123413
what is a vector?

Gate for mache opt
step: 1200, loss: 1.4908819198608398
what is a vector?ired ction is comati
step: 1400, loss: 1.6026664972305298
what is a vector?

St:
A
For the data
step: 1500, loss: 1.3257548809051514
step: 1600, loss: 1.4411872625350952
what is a vector?

Sbludatm feature p
step: 1800, loss: 1.5492795705795288
what is a vector?

As bleased for For
step: 2000, loss: 1.4378161430358887
step: 2000, loss: 1.4378161430358887
what is a vector?

Inedicenal paramet
st

In [221]:
test_string = "What's the best metric for imbalanced data when doing classification?"
test_string_encoded = torch.tensor(encode(test_string, char2ind), dtype=torch.long)

test_string_encoded = test_string_encoded.unsqueeze(0)

x = decode(m.generate(test_string_encoded, 30).numpy().tolist()[0], ind2char)
prompt = test_string 
completion = x[len(prompt):]

print('Prompt: {}'.format(prompt))

print('Completion: {}'.format(completion))

Prompt: What's the best metric for imbalanced data when doing classification?
Completion: 

Summation operates or No. Th
