## Transformer from scratch: a learning-oriented implementation (work in progress)

In [5]:
import torch
from math import sqrt
from pathlib import Path

# Decided to use the tokenizers library for BPE tokenization to understand how tokenization works under the hood
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

In [22]:
class Embedding():
    """Byte-level BPE tokenizer paired with learnable token-embedding tables.

    The tokenizer is created untrained; call ``train_tokenizer`` before
    encoding. Embedding tables are created lazily by ``embed`` and cached, so
    repeated calls map the same token id to the same vector.
    """

    def __init__(self):
        self.bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        self.bpe_tokenizer.pre_tokenizer = ByteLevel()
        self.bpe_tokenizer.decoder = ByteLevelDecoder()
        self.trainer = BpeTrainer(
            vocab_size=50_000,
            min_frequency=2,
            special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
            initial_alphabet=ByteLevel.alphabet(),
        )
        # Cache of embedding tables keyed by (vocab_size, d_model).
        self._embedding_layers = {}

    def train_tokenizer(self, sequences_files: list[str],
                        save_path: str = "./tokenizer.json"):
        """Train the BPE tokenizer on text files and save it to disk.

        Args:
            sequences_files: Paths of the text files to train on.
            save_path: Where the trained tokenizer is written as JSON.
        """
        self.bpe_tokenizer.train(sequences_files, self.trainer)
        self.bpe_tokenizer.save(save_path)

    def encode_word(self, word: str):
        """Tokenize ``word`` and return the tokenizer's encoding object."""
        return self.bpe_tokenizer.encode(word)

    def embed(self, tokenized_sequence, d_model: int) -> torch.Tensor:
        """Look up a ``d_model``-dimensional vector for every token id.

        Args:
            tokenized_sequence: Object exposing the token ids as ``.ids``
                (as returned by ``encode_word``).
            d_model: Size of each embedding vector.

        Returns:
            Tensor of shape ``(len(ids), d_model)``.
        """
        vocab_size = self.bpe_tokenizer.get_vocab_size()
        key = (vocab_size, d_model)
        layer = self._embedding_layers.get(key)
        if layer is None:
            # Build the table once; re-creating it on every call would hand
            # out fresh random vectors each time and nothing could be learned.
            layer = torch.nn.Embedding(vocab_size, d_model)
            self._embedding_layers[key] = layer
        # Explicit integer dtype: an empty id list would otherwise become a
        # float tensor, which nn.Embedding rejects.
        ids = torch.tensor(tokenized_sequence.ids, dtype=torch.long)
        return layer(ids)
        
# Train the tokenizer on the sample corpus, then tokenize and embed a short phrase.
embedding = Embedding()
embedding.train_tokenizer(["./input_text.txt"])
embedded = embedding.embed(embedding.encode_word("hello world!"), 512)


In [24]:
embedded.shape

torch.Size([11, 512])

In [38]:
# Create every tensor with the input's dtype (and device) explicitly, to avoid implicit promotion/casts (e.g. to float64) that would slow things down.

def positional_encoding(embedded):
    """Add sinusoidal positional encodings to a sequence of embeddings.

    For position ``pos`` and even feature index ``2k``:

        PE[pos, 2k]     = sin(pos / 10000 ** (2k / d_model))
        PE[pos, 2k + 1] = cos(pos / 10000 ** (2k / d_model))

    Args:
        embedded: Floating-point tensor of shape ``(..., seq_length, d_model)``.
            Leading dimensions, if any, are broadcast over.

    Returns:
        A new tensor with the same shape as ``embedded``, equal to
        ``embedded + PE``.
    """
    import math

    seq_length, d_model = embedded.shape[-2], embedded.shape[-1]
    device, dtype = embedded.device, embedded.dtype

    pos = torch.arange(seq_length, device=device, dtype=dtype).unsqueeze(1)  # (seq_length, 1)
    even_dims = torch.arange(0, d_model, 2, device=device, dtype=dtype)  # (ceil(d_model/2),)

    # inv_freq = 10000 ** (-2k / d_model). The angle is pos * inv_freq;
    # dividing by it would invert the frequencies.
    inv_freq = torch.exp(even_dims * (-math.log(10_000.0) / d_model))
    angles = pos * inv_freq  # (seq_length, ceil(d_model/2))

    encoding = torch.zeros(seq_length, d_model, device=device, dtype=dtype)
    encoding[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns.
    encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return embedded + encoding
    

In [39]:
# Add the sine/cosine position terms to the token embeddings.
encoded_input = positional_encoding(embedded)

In [40]:
encoded_input

tensor([[ 1.2551e+00,  8.7679e-01, -4.5609e-01,  ...,  1.6302e+00,
         -2.2152e+00,  8.0549e-01],
        [ 2.4942e-01,  1.0974e+00,  4.1486e-01,  ..., -2.0961e+00,
          2.0311e+00, -9.3400e-05],
        [ 1.8573e+00, -1.2435e-01,  1.9125e+00,  ...,  2.5392e-01,
         -2.4414e+00, -9.7617e-01],
        ...,
        [ 1.2422e+00,  3.3732e-02,  1.5159e+00,  ..., -7.9701e-01,
          3.2946e-01, -1.6072e-01],
        [ 2.6369e+00, -8.2525e-01,  4.0937e-01,  ...,  8.7939e-02,
         -8.4811e-02,  5.5790e-01],
        [-2.2704e+00, -1.4055e+00, -6.3212e-01,  ..., -2.3478e+00,
          2.0572e+00,  1.7999e+00]], grad_fn=<AddBackward0>)

In [None]:
class attention_layer(torch.nn.Module):
    """Single-head scaled dot-product self-attention.

    Projects the input into queries, keys and values with learnable matrices
    and returns ``softmax(Q K^T / sqrt(dim_k)) V``.

    Args:
        dim_k: Key projection size.
        dim_q: Query projection size; must equal ``dim_k`` so ``Q K^T`` is defined.
        dim_v: Value projection size (also the output feature size).
        dim_d: Input feature size (model dimension).

    Raises:
        ValueError: If ``dim_q`` and ``dim_k`` differ.
    """

    def __init__(self, dim_k, dim_q, dim_v, dim_d):
        super().__init__()
        if dim_q != dim_k:
            raise ValueError(
                f"dim_q ({dim_q}) must equal dim_k ({dim_k}) to compute Q @ K^T"
            )
        # Wrapped in nn.Parameter so the weights are registered with the
        # module: they show up in .parameters(), receive gradients and follow
        # .to(device).
        self.W_Q = torch.nn.Parameter(torch.randn(dim_d, dim_q))
        self.W_K = torch.nn.Parameter(torch.randn(dim_d, dim_k))
        self.W_V = torch.nn.Parameter(torch.randn(dim_d, dim_v))
        self.dim_d = dim_d
        self.dim_k = dim_k

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(..., seq_length, dim_d)``.

        Returns:
            Tensor of shape ``(..., seq_length, dim_v)``.
        """
        Q = x @ self.W_Q
        K = x @ self.W_K
        V = x @ self.W_V
        # Swap only the last two axes so batched inputs work as well, and
        # scale by sqrt(d_k), the dimension the dot products are taken over.
        scores = Q @ K.transpose(-2, -1) / sqrt(self.dim_k)
        # Normalise over the key axis so each query's weights sum to 1.
        attention_scores = torch.softmax(scores, dim=-1)
        return attention_scores @ V