## This notebook is an implementation of transformers from scratch for learning purposes - Work in progress 

In [81]:
import torch
from math import sqrt
from pathlib import Path
import torch.nn as nn
# Decided to use the tokenizers library for BPE tokenization to understand how tokenization works under the hood
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

### Embedding with Byte pair encoding

In [None]:
class BpeTokenizer():
    def __init__(self, vocab_size = 32_000):
        self.bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))  # With byte-level BPE, [UNK] should be rare
        self.bpe_tokenizer.pre_tokenizer = ByteLevel()
        self.bpe_tokenizer.decoder = ByteLevelDecoder()

        self.trainer = BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=2,
            special_tokens=["[PAD]", "[UNK]", "[BOS]", "[EOS]"],
            initial_alphabet=ByteLevel.alphabet(),
        )
        self.pad_id = None
        self.unk_id = None
        self.bos_id = None
        self.eos_id = None

    def train(self, input_text_paths, output_tokenizer_path):
        self.bpe_tokenizer.train(input_text_paths, self.trainer)
        self.bpe_tokenizer.save(output_tokenizer_path)
        self.cache_special_tokens()

    def load(self, tokenizer_path):
        self.bpe_tokenizer = Tokenizer.from_file(tokenizer_path)
        self.cache_special_tokens()
    
    def cache_special_tokens(self):
        self.pad_id = self.bpe_tokenizer.token_to_id("[PAD]")
        self.unk_id = self.bpe_tokenizer.token_to_id("[UNK]")
        self.bos_id = self.bpe_tokenizer.token_to_id("[BOS]")
        self.eos_id = self.bpe_tokenizer.token_to_id("[EOS]")

    def encode_input(self, text):
        return self.bpe_tokenizer.encode(text).ids
    def encode_encoder_input(self, text):
        encoding = self.bpe_tokenizer.encode(text).ids + [self.EOS_ID]
        return encoding
    def encode_decoder_input(self, text):
        encoding = [self.BOS_ID] + self.bpe_tokenizer.encode(text).ids 
        return encoding
    def encode_target(self, text):
        encoding = self.bpe_tokenizer.encode(text).ids + [self.EOS_ID]
        return encoding
    
    def decode(self, ids):
        return self.bpe_tokenizer.decode(ids, skip_special_tokens=True)
    

class TokenEmbedding(nn.Module):
    def __init__(self, d_model, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)  

    def forward(self, ids):
        return self.embedding(ids)


In [None]:
# Just testing here
train_bpe_tokenizer("./input_text.txt", "./tokenizer.json", vocab_size=32_000)

emb = TokenEmbedding("./tokenizer.json", d_model=512)
ids = emb.encode("hello world!")
x = emb(ids)  
print(ids.shape)

torch.Size([11])


In [109]:
ids = emb.encode("hello world!")
ids

tensor([224,  75, 269,  79,  82, 224,  90, 262,  79,  71,   4])

### Sinusoidal Positional encoding to add position info to tokens

In [84]:
# Here we make sure to specify the dtype when creating the ten  sors to avoid casting to float64 and having everything go slower
# Will add max_len_seq later to compare speed
class PositionalEncoding(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.d_model = d_model

    def forward(self, embedded):
        d_model = embedded.shape[1]
        seq_length = embedded.shape[0]
        pos = torch.arange(0, seq_length, device=embedded.device, dtype=embedded.dtype).unsqueeze(1) # (seq_length, 1)
        i = torch.arange(0, d_model, 2, device=embedded.device, dtype=embedded.dtype) # (d_model/2,)
        positional_encoding = torch.zeros_like(embedded)
        div_term = torch.exp(-(torch.log(torch.tensor(10_000.0, device=embedded.device, dtype=embedded.dtype)) * (i/d_model)))    
        positional_encoding[:, 0::2] = torch.sin(pos/div_term)
        positional_encoding[:, 1::2] = torch.cos(pos/div_term)
        return embedded + positional_encoding

In [85]:
encoded_input = positional_encoding(embedded)

In [86]:
encoded_input.shape

torch.Size([11, 512])

### Simple attention layer (To understand the concept)

In [87]:
class AttentionLayer(torch.nn.Module):
    def __init__(self, dim_k, dim_q, dim_v, dim_d):
        super().__init__()
        self.W_Q = torch.nn.Parameter(torch.randn(dim_d, dim_q))
        self.W_K = torch.nn.Parameter(torch.randn(dim_d, dim_k))
        self.W_V = torch.nn.Parameter(torch.randn(dim_d, dim_v))
        self.dim_d = dim_d
    def forward(self, x):
        Q = x @self.W_Q
        K = x @self.W_K
        V = x @self.W_V
        attention_scores = torch.softmax(Q@K.T/sqrt(self.dim_d))
        attention = attention_scores @ V
        return attention

### Upping the level here, added multi head

In [88]:
class MultiHeadAttentionLayer(torch.nn.Module):
    def __init__(self, dim_k, dim_q, dim_v, dim_d, num_heads):
        super().__init__()
        self.W_Q = nn.Parameter(torch.randn(num_heads, dim_d, dim_q))
        self.W_K = nn.Parameter(torch.randn(num_heads, dim_d, dim_k))
        self.W_V = nn.Parameter(torch.randn(num_heads, dim_d, dim_v))
        self.dim_d = dim_d
        self.scale = sqrt(dim_d)
    def forward(self, x):
        Q = x @self.W_Q
        K = x @self.W_K
        V = x @self.W_V
        attention_scores = torch.softmax(Q@K.transpose(-2, -1)/self.scale, dim=-1)
        attention = attention_scores @ V # (num_heads, seq_length, dim_v)
        attention = attention.transpose(0, 1) # (seq_length, num_heads, dim_v)
        flattened_attention = attention.reshape(attention.shape[0], -1) 
        projected_attention = flattened_attention @ nn.Parameter(torch.randn(flattened_attention.shape[1], 512))
        return projected_attention, K, V

In [89]:

multi_head_attention_layer = MultiHeadAttentionLayer(dim_k=512, dim_q=512, dim_v=512, dim_d=512, num_heads=8)
output_layer = multi_head_attention_layer.forward(encoded_input)

In [90]:
print(output_layer.shape)

AttributeError: 'tuple' object has no attribute 'shape'

### Add & Norm Layer definition

In [None]:
class ResidualLayer(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self, sublayer_output, previous_layer_output):
        return sublayer_output + previous_layer_output



# Post Normalization is used here, like in the original Transformer paper
class LayerAddAndNormLayer(nn.Module):
    def __init__(self, dim_d, e = 1e-6):
        super().__init__()
        self.epsilon = e
        self.dim_d = dim_d
        self.gamma = torch.nn.Parameter(torch.ones(self.dim_d))
        self.beta = torch.nn.Parameter(torch.zeros(self.dim_d))
        self.residual = ResidualLayer()
    def normalization(self, x):
        mean = torch.mean(x, dim = -1, keepdim=True)
        variance = torch.var(x, dim = -1, unbiased=False, keepdim=True)
        normalized_x = (x-mean)/(torch.sqrt(variance+self.epsilon))
        normalized_x = self.gamma * normalized_x + self.beta
        return normalized_x 
    def forward(self, sublayer_output, previous_layer_output):
        return self.normalization(self.residual(previous_layer_output, sublayer_output))


### Position-wise fully connected layer

In [None]:
class FullyConnectedLayer(nn.Module):
    def __init__(self, in_dim, hidden_dims, out_dim):
        super().__init__()
        self.in_dim = in_dim
        self.hidden_dims = hidden_dims
        self.out_dim = out_dim
        layers = []
        prev = in_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.ReLU())
            prev = h
        layers.append(nn.Linear(prev, out_dim))
        self.layers = nn.Sequential(*layers)
    def forward(self, x):
        return self.layers(x)

### Well.. we need a Masked MHA and a cross-attention MHA now for the decoder

In [None]:
class MaskedMultiHeadAttentionLayer(torch.nn.Module):
    def __init__(self, dim_d, num_heads):
        super().__init__()
        self.dim_k = self.dim_v = self.dim_q = dim_d // num_heads
        self.W_Q = nn.Parameter(torch.randn(num_heads, dim_d, self.dim_q))
        self.W_K = nn.Parameter(torch.randn(num_heads, dim_d, self.dim_k))
        self.W_V = nn.Parameter(torch.randn(num_heads, dim_d, self.dim_v))
        self.dim_d = dim_d
        self.scale = sqrt(dim_d)
        self.W = nn.Parameter(torch.randn(num_heads * self.dim_v, 512))
    def forward(self, x):
        Q = x @self.W_Q
        K = x @self.W_K
        V = x @self.W_V
        temp = Q@K.transpose(-2, -1)/self.scale
        j = torch.arange(0, x.shape[0], device=x.device).unsqueeze(0)
        i = torch.arange(0, x.shape[0], device=x.device).unsqueeze(1)
        temp = temp.masked_fill(i<j, float('-inf'))
        attention_scores = torch.softmax(temp, dim=-1)
        attention = attention_scores @ V # (num_heads, seq_length, dim_v)
        attention = attention.transpose(0, 1) # (seq_length, num_heads, dim_v)
        flattened_attention = attention.reshape(attention.shape[0], -1) 
        projected_attention = flattened_attention @ self.W
        return projected_attention
    

class MultiHeadCrossAttentionLayer(nn.Module):
    def __init__(self, dim_d=512, num_heads=8):
        super().__init__()
        self.dim_k = self.dim_v = self.dim_q = dim_d // num_heads
        self.W_Q = nn.Parameter(torch.randn(num_heads, dim_d, self.dim_q))
        self.W_K = nn.Parameter(torch.randn(num_heads, dim_d, self.dim_k))
        self.W_V = nn.Parameter(torch.randn(num_heads, dim_d, self.dim_v))
        self.dim_d = dim_d
        self.scale = sqrt(dim_d)
        self.W = nn.Parameter(torch.randn(num_heads * self.dim_v, 512))
    def forward(self, x, encoder_output):
        Q = x @self.W_Q
        K = encoder_output @self.W_K
        V = encoder_output @self.W_V
        attention_scores = torch.softmax(Q@K.transpose(-2,-1)/self.scale, dim=-1)
        attention = attention_scores @ V # (num_heads, seq_length, dim_v)
        attention = attention.transpose(0, 1) # (seq_length, num_heads, dim_v)
        flattened_attention = attention.reshape(attention.shape[0], -1)
        projected_attention = flattened_attention @ self.W
        return projected_attention

In [None]:
# self.embedding = Embedding()
# self.positional_encoding = PositionalEncoding(d_model=dim_d)
class EncoderBlock(nn.Module):
    def __init__(self, dim_d = 512, num_heads=8):
        super().__init__()
        self.multi_head_attention = MultiHeadAttentionLayer(dim_d=dim_d, num_heads=num_heads)
        self.fully_connected_layer = FullyConnectedLayer(in_dim = dim_d, hidden_dims = [2048], out_dim = dim_d)
        self.residual = ResidualLayer()
        self.layer_add_and_norm_1 = LayerAddAndNormLayer(dim_d=dim_d)
        self.layer_add_and_norm_2 = LayerAddAndNormLayer(dim_d=dim_d)
    def forward(self, x):
        attention_output, _, _ = self.multi_head_attention(x)
        x = self.layer_add_and_norm_1(attention_output, x)
        fc_output = self.fully_connected_layer(x)
        x = self.layer_add_and_norm_2(fc_output, x)
        return x
    
class DecoderBlock(nn.Module):
    def __init__(self, dim_d = 512, num_heads=8):
        super().__init__()
        self.masked_multi_head_attention = MaskedMultiHeadAttentionLayer(dim_d=dim_d, num_heads=num_heads)
        self.layer_add_and_norm_1 = LayerAddAndNormLayer(dim_d=dim_d)
        self.layer_add_and_norm_2 = LayerAddAndNormLayer(dim_d=dim_d)
        self.layer_add_and_norm_3 = LayerAddAndNormLayer(dim_d=dim_d)
        self.multi_head_cross_attention = MultiHeadCrossAttentionLayer( dim_d=dim_d, num_heads=num_heads)
        self.fully_connected_layer = FullyConnectedLayer(in_dim = dim_d, hidden_dims = [2048], out_dim = dim_d)
    def forward(self, x, encoder_output):
        masked_attention_output = self.masked_multi_head_attention(x)
        x = self.layer_add_and_norm_1(masked_attention_output, x)
        cross_attention_output = self.multi_head_cross_attention(x, encoder_output)
        x = self.layer_add_and_norm_2(cross_attention_output, x)
        fc_output = self.fully_connected_layer(x)
        x = self.layer_add_and_norm_3(fc_output, x)
        return x
        
 

### Full transformer architecture

In [None]:
class Transformer(nn.Module):
    def __init__(self, num_encoder_layers, num_heads, dim_d, num_decoder_layers, vocab_size):
        super().__init__()
        self.dim_d = dim_d
        self.embedding = Embedding()
        self.position_encoding = PositionalEncoding(d_model=dim_d)
        self.encoder_layers = nn.Sequential(*[EncoderBlock(dim_d=dim_d, num_heads=num_heads) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([DecoderBlock(dim_d=dim_d, num_heads=num_heads) for _ in range(num_decoder_layers)])
        self.linear_layer = nn.Linear(in_features=dim_d, out_features=vocab_size)
        self.softmax_layer = nn.Softmax(dim = -1)
    def forward(self, input_sequence, target_sequence):
        embedded_input = self.embedding.embed(input_sequence, self.dim_d)
        encoded_input = self.position_encoding(embedded_input)
        
        embedded_output = self.embedding.embed(target_sequence, self.dim_d)
        encoded_target = self.position_encoding(embedded_output)

        encoder_output = self.encoder_layers(encoded_input)

        decoder_output  = encoded_target
        for layer in self.decoder_layers:
            decoder_output = layer(decoder_output , encoder_output)

        output_linear = self.linear_layer(decoder_output)
        final_output = self.softmax_layer(output_linear)
        return final_output

### Now we prepare our data for training

In [100]:
datasets_path = "./dataset"
french_dataset_path = "./dataset/train_fr.txt"
english_dataset_path = "./dataset/train_en.txt"


with open(french_dataset_path, 'r', encoding='utf-8') as f:
    french_lines = f.readlines()
train_size = int(len(french_lines) * 0.8)
french_lines_train = french_lines[:train_size]
french_lines_val = french_lines[train_size:]

with open(english_dataset_path, 'r', encoding='utf-8') as f:
    english_lines = f.readlines()
english_lines_train = english_lines[:train_size]
english_lines_val = english_lines[train_size:]


all_train_sentences = french_lines_train + english_lines_train

# Creating our vocab 
train_bpe_tokenizer([french_dataset_path, english_dataset_path], "./tokenizer.json", vocab_size=32_000)

# emb = TokenEmbedding("./tokenizer.json", d_model=512)
# ids = emb.encode("hello world!")
# x = emb(ids)  
# print(ids.shape)



Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"[PAD]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":1, "content":"[UNK]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":2, "content":"[BOS]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}, {"id":3, "content":"[EOS]", "single_word":False, "lstrip":False, "rstrip":False, "normalized":False, "special":True}], normalizer=None, pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=None, decoder=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), model=BPE(dropout=None, unk_token="[UNK]", continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={"[PAD]":0, "[UNK]":1, "[BOS]":2, "[EOS]":3, "!":4, """:5, "#":6, "$":7, "%":8, "&":9, "'":10, 

In [106]:
emb.EOS_ID

AttributeError: 'TokenEmbedding' object has no attribute 'EOS_ID'

tensor([224,  75, 269,  79,  82, 224,  90, 262,  79,  71,   4])

In [107]:
class IWSLTDataset(torch.utils.data.Dataset):
    def __init__(self, french_sentences, english_sentences, tokenizer_path):
        self.french_sentences = french_sentences
        self.english_sentences = english_sentences
        self.tokenizer = TokenEmbedding.from_file(tokenizer_path)

    def __len__(self):
        return len(self.french_sentences)
    
    def __getitem__(self, idx):
        french_sentence = self.french_sentences[idx]
        english_sentence = self.english_sentences[idx]
        
        # We're translating from french to english, so the encoder input is going to be french
        french_sentence = french_sentence + "[EOS]" 
        french_ids = self.tokenizer.encode(french_sentence)
        french_embeddings = self.tokenizer.forward(french_ids)

        english_sentence_decoder_input = "[BOS]" + english_sentence
        english_sentence_target = english_sentence + "[EOS]"

        english_ids_decoder_input = self.tokenizer.encode(english_sentence_decoder_input)
        english_ids_target = self.tokenizer.encode(english_sentence_target)
        
        english_embeddings_decoder_input = self.tokenizer.forward(english_ids_decoder_input)    

        
        return french_embeddings, english_embeddings_decoder_input, english_ids_target