In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

import sentencepiece as spm

In [2]:
# Model hyperparameters for the translation transformer.
EMBEDDING_SIZE = 128  # presumably the embedding width (`embed_size` of the model classes) — confirm
CONTEXT_SIZE = 256  # presumably the maximum sequence length (`context_size`) — confirm
ENCODER_TRANSFORMER_COUNT = 6  # presumably the number of stacked encoder blocks — confirm
DECODER_TRANSFORMER_COUNT = 6  # presumably the number of stacked decoder blocks — confirm
NUM_HEADS = 8  # presumably the number of attention heads per block — confirm

In [3]:
# Load the English/French sentence-pair CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")

# Preview the first rows (the cell's last expression is rendered as its output).
df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [4]:
def _clean_column(column_name):
    """Return one column of `df` as a list of cleaned, lower-cased strings.

    Every value is cast to `str`, narrow no-break spaces (U+202F) are removed,
    and the result is lower-cased.
    """
    raw_values = df[column_name].values.astype(str)
    return [text.replace("\u202f", "").lower() for text in raw_values]


english_all = _clean_column("English words/sentences")
french_all = _clean_column("French words/sentences")

# Sanity check: show the first five cleaned sentences of each language.
print(english_all[:5])
print(french_all[:5])

['hi.', 'run!', 'run!', 'who?', 'wow!']
['salut!', 'cours!', 'courez!', 'qui ?', 'ça alors!']


In [5]:
# Load the pre-trained SentencePiece BPE model for English.
sp_en = spm.SentencePieceProcessor()
sp_en.load("/kaggle/input/french-english-tokenization/other/100embed/1/en.wiki.bpe.vs10000.model")

# Load the pre-trained SentencePiece BPE model for French.
sp_fr = spm.SentencePieceProcessor()
sp_fr.load("/kaggle/input/french-english-tokenization/other/100embed/1/fr.wiki.bpe.vs10000.model")

# Number of pieces (distinct token ids) in each vocabulary.
en_vocab = sp_en.get_piece_size()
fr_vocab = sp_fr.get_piece_size()

# NOTE(review): the recorded run printed -1 for both pad ids. In SentencePiece
# that means the model defines no <pad> piece, and -1 is not a valid embedding
# index — a separate padding id will be needed before batching; confirm.
padding_token_id_en = sp_en.pad_id()
print("Padding token ID for english:", padding_token_id_en)

padding_token_id_fr = sp_fr.pad_id()
print("Padding token ID for french:", padding_token_id_fr)


print("EN Vocab Size: ", en_vocab)
print("FR Vocab Size: ", fr_vocab)

Padding token ID for english: -1
Padding token ID for frencb: -1
EN Vocab Size:  10000
FR Vocab Size:  10000


In [6]:
def _write_vocab_listing(processor, vocab_size, path):
    """Dump a tokenizer vocabulary to `path`, one "<id> : <piece>" line per id."""
    with open(path, "w", encoding="utf-8") as listing:
        for token_id in range(vocab_size):
            piece = processor.id_to_piece(token_id)
            listing.write(f"{token_id} : {piece}\n")


# Human-readable id -> piece listings for both tokenizers.
_write_vocab_listing(sp_en, en_vocab, "/kaggle/working/en_vocab.text")
_write_vocab_listing(sp_fr, fr_vocab, "/kaggle/working/fr_vocab.text")

In [7]:
# Tokenise every cleaned sentence into a list of SentencePiece ids,
# each language with its own tokenizer.
english_data = list(map(sp_en.encode_as_ids, english_all))
french_data = list(map(sp_fr.encode_as_ids, french_all))

# Show the first five encoded sentences of each language.
print(english_data[:5])
print(french_data[:5])

[[40, 9916, 9935], [888, 9960], [888, 9960], [305, 9967], [15, 90, 9960]]
[[1021, 138, 9977], [926, 9977], [844, 6607, 9977], [135, 2340], [9558, 494, 9977]]


In [8]:
# Single Attention Head
class SingleHeadedAttention(nn.Module):
    """One scaled dot-product self-attention head with an optional causal mask.

    Args:
        emb_size: size of the last dimension of the input tensor.
        head_size: output size of the key, query and value projections.
        context_size: side length of the pre-built lower-triangular mask, i.e.
            the longest sequence the causal mask can cover.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, emb_size, head_size, context_size:int, dropout=0.2):
        super().__init__()
        self.key = nn.Linear(emb_size, head_size, bias=False)
        self.query = nn.Linear(emb_size, head_size, bias=False)
        self.value = nn.Linear(emb_size, head_size, bias=False)
        # Registered as a buffer so it follows the module across devices and is
        # stored with it, without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, masked=True):
        """Apply self-attention to `x`.

        Args:
            x: tensor of shape (B, T, emb_size).
            masked: when True, position t only attends to positions <= t;
                T must then not exceed the `context_size` the mask was built for.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        # The three-way unpack also rejects inputs that are not 3-dimensional.
        _, T, _ = x.shape

        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Scaled dot-product scores: q @ k^T / sqrt(head_size) -> (B, T, T)
        weight = q @ k.transpose(-2, -1) * (k.shape[-1] ** -0.5)
        if masked:
            # Entries above the diagonal become -inf so softmax gives them weight 0.
            weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)

        # Project the values exactly once, right where they are consumed.
        v = self.value(x)  # (B, T, head_size)

        return weight @ v

In [9]:
class MultiHeadedAttention(nn.Module):
    """Several attention heads run in parallel, concatenated and projected back.

    Args:
        embed_size: size of the last dimension of the input and of the output.
        num_heads: number of `SingleHeadedAttention` heads to create.
        head_size: output size of each individual head.
        context_size: forwarded to every head (size of its causal mask).
        dropout: dropout probability, used inside each head and on the output.
    """

    def __init__(self, embed_size, num_heads, head_size, context_size, dropout=0.2):
        super().__init__()
        # Use the constructor's own `embed_size` argument; the previous body
        # referred to an undefined name `emb_size`.
        self.heads = nn.ModuleList(
            [SingleHeadedAttention(embed_size, head_size, context_size, dropout) for _ in range(num_heads)]
        )
        # Maps the concatenated head outputs back to the embedding width.
        self.linear = nn.Linear(head_size * num_heads, embed_size)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on `x`, concatenate on the last axis, project, drop out.

        Heads are called with their default arguments, so each one applies its
        causal mask.
        """
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.linear(out)
        out = self.dropout(out)
        return out

In [10]:
class FeedForward(nn.Module):
    """Two-layer MLP: widen to 4x `emb_size`, ReLU, narrow back, then dropout."""

    def __init__(self, emb_size, dropout=0.2):
        super().__init__()
        hidden_size = emb_size * 4
        # Layers are created in application order so parameter initialisation
        # draws from the RNG in the same sequence.
        widen = nn.Linear(emb_size, hidden_size)
        narrow = nn.Linear(hidden_size, emb_size)
        self.net = nn.Sequential(widen, nn.ReLU(), narrow, nn.Dropout(dropout))

    def forward(self, x):
        """Apply the MLP along the last dimension of `x`."""
        out = self.net(x)
        return out

In [11]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a residual.

    The size of each attention head is `embed_size // num_heads`.
    """

    def __init__(self, embed_size, num_heads, context_size, dropout=0.2):
        super().__init__()
        per_head = embed_size // num_heads
        self.norm1 = nn.LayerNorm(embed_size)
        self.attention = MultiHeadedAttention(embed_size, num_heads, per_head, context_size, dropout)
        self.norm2 = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, dropout)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        # LayerNorm is applied to the input of each sub-layer, not to its output.
        attended = x + self.attention(self.norm1(x))
        return attended + self.ff(self.norm2(attended))
        

In [12]:
class Encoder(nn.Module):
    """Token + learned positional embeddings followed by a stack of transformer blocks.

    Args:
        transformer_count: number of `TransformerBlock`s to stack.
        vocab_size: number of rows in the token embedding table.
        embed_size: embedding width, also the width of every block.
        context_size: number of positions in the positional embedding table;
            input sequences must not be longer than this.
        dropout: dropout probability forwarded to every block.
        num_heads: attention heads per block. When None, the notebook-level
            `NUM_HEADS` constant is used.
    """

    def __init__(self, transformer_count, vocab_size, embed_size, context_size, dropout=0.2, num_heads=None):
        super().__init__()

        # Resolved at call time so existing positional call sites keep working.
        if num_heads is None:
            num_heads = NUM_HEADS

        self.token_embed = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = nn.Embedding(context_size, embed_size)
        # Forward the caller's dropout instead of a hard-coded 0.2.
        self.transformers = nn.Sequential(
            *[TransformerBlock(embed_size, num_heads, context_size, dropout=dropout) for _ in range(transformer_count)]
        )

    def forward(self, x):
        """Embed token ids `x` of shape (B, T) and run the block stack.

        Returns:
            Tensor of shape (B, T, embed_size).
        """
        _, T = x.shape

        tk_emb = self.token_embed(x)  # (B, T, embed_size)
        # Positions are 0..T-1, not token ids: indexing the positional table
        # with token ids is wrong and overflows it for ids >= context_size.
        positions = torch.arange(T, device=x.device)
        pos_emb = self.pos_embed(positions)  # (T, embed_size), broadcast over B

        x = tk_emb + pos_emb
        x = self.transformers(x)

        return x

In [13]:
class FrenchEnglishTransformer(nn.Module):
    
    def __init__(self, transformer_count , vocab_size, embed_size, context_size, dropout=0.2):
        super().__init__()
        self.encoder = Encoder(transformer_count , vocab_size, embed_size, context_size, dropout)
        self.decoder = Decoder()