In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

import lightning as L
from torch.utils.data import TensorDataset, DataLoader

In [3]:
# Vocabulary: every token the model can read or emit, mapped to its integer id.
token_to_id = {
    'what':0,
    'is':1,
    'StatQuest':2,
    'Awesome':3,
    '<EOS>':4,
}

# Reverse lookup (id -> token) for turning predictions back into text.
id_to_token = {token_id: token for token, token_id in token_to_id.items()}

# Training prompts followed by the expected answer token. A decoder-only model
# is trained on next-token prediction, so each input row must have exactly as
# many positions as its label row below.
inputs = torch.tensor([
    [
        token_to_id['what'],
        token_to_id['is'],
        token_to_id['StatQuest'],
        token_to_id['<EOS>'],
        token_to_id['Awesome'],
    ],
    [
        token_to_id['StatQuest'],
        token_to_id['is'],
        token_to_id['what'],
        token_to_id['<EOS>'],
        token_to_id['Awesome'],
    ]
])

# Labels: the input rows shifted left by one position, closed with a final <EOS>.
outputs = torch.tensor([
    [
        token_to_id['is'],
        token_to_id['StatQuest'],
        token_to_id['<EOS>'],
        token_to_id['Awesome'],
        token_to_id['<EOS>'],
    ],
    [
        token_to_id['is'],
        token_to_id['what'],
        token_to_id['<EOS>'],
        token_to_id['Awesome'],
        token_to_id['<EOS>'],
    ]
])

dataset = TensorDataset(inputs, outputs)
dataloader = DataLoader(dataset)

In [29]:
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to word embeddings.

    A ``(max_len, d_model)`` table is pre-computed once and stored as a
    non-trainable buffer named ``pe``. Even columns hold
    ``sin(pos / 10000^(2i / d_model))`` and odd columns hold the matching
    ``cos`` term, where ``2i`` is the even column index.

    Args:
        max_len: Maximum number of positions the table covers.
        d_model: Width of each embedding vector.
    """

    def __init__(self, max_len=6, d_model=2):
        super().__init__()

        pe = torch.zeros(max_len, d_model)

        # Column vector of positions so it broadcasts against the frequency row.
        position = torch.arange(0, max_len, 1).float().unsqueeze(1)
        # One frequency per sin/cos column pair, hence the step of 2.
        embedding_index = torch.arange(0, d_model, 2).float()

        div_term = 1 / torch.tensor(10000.0) ** (embedding_index / d_model)

        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer cos column than sin columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, word_embeddings):
        """Return ``word_embeddings`` plus the encoding for its leading positions.

        The table is sliced to ``word_embeddings.size(0)`` rows, so the first
        dimension of the input is treated as the position axis and must not
        exceed ``max_len``.
        """
        return word_embeddings + self.pe[:word_embeddings.size(0), :]
    

$$\text{Attention}(Q, K, V) = \text{SoftMax}\left(\frac{QK^T}{\sqrt{d_k}} + M\right)V$$

In [36]:
class Attention(nn.Module):
    """Single-head scaled dot-product attention with optional masking.

    Queries, keys and values are produced by three bias-free linear layers of
    size ``d_model x d_model``. The implementation is written for unbatched
    2-D inputs: dimension 0 indexes tokens (rows) and dimension 1 indexes
    embedding features (columns).

    Args:
        d_model: Width of the incoming encodings and of the produced output.
    """

    def __init__(self, d_model=2):
        super().__init__()

        self.W_q = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_k = nn.Linear(in_features=d_model, out_features=d_model, bias=False)
        self.W_v = nn.Linear(in_features=d_model, out_features=d_model, bias=False)

        self.row_dim = 0
        self.col_dim = 1

    def forward(self, encodings_for_q, encodings_for_k, encodings_for_v, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            encodings_for_q: Encodings projected into queries.
            encodings_for_k: Encodings projected into keys.
            encodings_for_v: Encodings projected into values.
            mask: Optional boolean tensor; positions that are ``True`` are
                filled with ``-1e9`` before the softmax so they receive
                (effectively) zero attention weight.

        Returns:
            The attention-weighted combination of the value vectors, one row
            per query.
        """
        q = self.W_q(encodings_for_q)
        k = self.W_k(encodings_for_k)
        v = self.W_v(encodings_for_v)

        # torch's transpose requires the two dimensions to swap explicitly.
        sims = torch.matmul(q, k.transpose(dim0=self.row_dim, dim1=self.col_dim))
        scaled_sims = sims / (k.size(self.col_dim) ** 0.5)

        if mask is not None:
            scaled_sims = scaled_sims.masked_fill(mask=mask, value=-1e9)

        # Normalise across keys so each query's weights sum to 1.
        attention_weights = F.softmax(scaled_sims, dim=self.col_dim)
        attention_output = torch.matmul(attention_weights, v)

        return attention_output



In [41]:
class DecoderOnlyTransformer(L.LightningModule):
    """Minimal decoder-only transformer: embedding, positional encoding, one
    masked self-attention layer with a residual connection, and a linear
    output layer.

    Args:
        seq_len: Despite the name, this is the number of distinct token ids.
            It sizes both the embedding table and the output layer, so every
            token id passed to ``forward`` must be smaller than this value.
            NOTE(review): the notebook's vocabulary has 5 tokens (ids 0-4),
            so callers need ``seq_len=len(token_to_id)``; the default of 4
            cannot embed ``<EOS>``.
        max_len: Maximum number of positions supported by the positional
            encoding.
        d_model: Width of the token embeddings.
    """

    def __init__(self, seq_len=4, max_len=6, d_model=2):
        super().__init__()

        self.we = nn.Embedding(num_embeddings=seq_len, embedding_dim=d_model)
        self.pe = PositionalEncoding(max_len=max_len, d_model=d_model)
        self.self_attention = Attention(d_model=d_model)
        self.fc_layer = nn.Linear(in_features=d_model, out_features=seq_len)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, token_ids):
        """Return one row of unnormalised scores per input token.

        Args:
            token_ids: 1-D tensor of token ids (a single, unbatched sequence).

        Returns:
            Tensor with ``token_ids.size(0)`` rows and ``seq_len`` columns.
        """
        word_embeddings = self.we(token_ids)
        position_encoded = self.pe(word_embeddings)

        # Causal mask: True above the diagonal, i.e. on the positions a token
        # must not attend to. It is created on the input's device so that
        # masked_fill does not hit a CPU/accelerator mismatch during training.
        n_tokens = token_ids.size(dim=0)
        lower_triangle = torch.tril(
            torch.ones(n_tokens, n_tokens, device=token_ids.device)
        )
        mask = lower_triangle == 0

        self_attention_values = self.self_attention(
            position_encoded, position_encoded, position_encoded, mask=mask
        )
        residual_connections = self_attention_values + position_encoded
        fc_output_layer = self.fc_layer(residual_connections)
        return fc_output_layer

    def configure_optimizers(self):
        """Optimise all parameters with Adam at a learning rate of 0.1."""
        return Adam(self.parameters(), lr=0.1)

    def training_step(self, batch):
        """Return the cross-entropy loss for the first sequence in ``batch``.

        ``batch`` is an ``(input_tokens, labels)`` pair; only index 0 of each
        is used, so any additional sequences in the batch are ignored.
        """
        input_tokens, labels = batch
        output = self.forward(input_tokens[0])
        loss = self.loss(output, labels[0])
        return loss

In [None]:
# Size the embedding table and output layer from the vocabulary so that every
# token id (including <EOS>) is a valid index.
model = DecoderOnlyTransformer(seq_len=len(token_to_id))
model_input = torch.tensor([
    token_to_id['what'],
    token_to_id['is'],
    token_to_id['StatQuest'],
    token_to_id['<EOS>'],
])

input_length = model_input.size(0)
predictions = model(model_input)
# The last row holds the scores for the token that follows the prompt; the
# highest-scoring column is the predicted token id.
predicted_id = torch.argmax(predictions[-1, :])

In [None]:
# Scratch check: build the sinusoidal positional-encoding table outside the
# class and inspect it.
max_len = 6
d_model=2

pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, 1).float().unsqueeze(1)
# One frequency per sin/cos column pair, hence the step of 2.
embedding = torch.arange(0, d_model, 2).float()
div_term = 1/torch.tensor(10000.0) ** (embedding / d_model)
pe[:, ::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
print("Positional Encoding returning: \n", pe)

mask = torch.ones(5,5)
# Same vocabulary as the data cell, repeated so this cell runs on its own.
token_to_id = {
    'what':0,
    'is':1,
    'StatQuest':2,
    'Awesome':3,
    '<EOS>':4,
}
model_input = torch.tensor([
    token_to_id['what'],
    token_to_id['is'],
    token_to_id['StatQuest'],
    token_to_id['<EOS>'],
])

input_length = model_input.size(0)
print(input_length)

Positional Encoding returning: 
 tensor([[ 0.0000,  1.0000],
        [ 0.8415,  0.5403],
        [ 0.9093, -0.4161],
        [ 0.1411, -0.9900],
        [-0.7568, -0.6536],
        [-0.9589,  0.2837]])
4


In [49]:
# Print every integer from 4 up to, but not including, 10 on its own line.
first, past_last = 4, 10
for i in range(first, past_last):
    print(i)

4
5
6
7
8
9
