In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F

import sentencepiece as spm

In [2]:
EMBEDDING_SIZE = 128
CONTEXT_SIZE = 256
TRANSFORMER_COUNT = 6
NUM_HEADS = 8
DROPOUT = 0.2
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

print(DEVICE)

cpu


In [3]:
df = pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")

df.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [4]:
english_all = df["English words/sentences"].values.astype(str)
french_all = df["French words/sentences"].values.astype(str)

english_all = [seq.replace("\u202f", "") for seq in english_all]
french_all = [seq.replace("\u202f", "") for seq in french_all]

english_all = [seq.lower() for seq in english_all]
french_all = [seq.lower() for seq in french_all]

print(english_all[0:5])
print(french_all[0:5])

['hi.', 'run!', 'run!', 'who?', 'wow!']
['salut!', 'cours!', 'courez!', 'qui ?', 'ça alors!']


In [5]:
sp_en = spm.SentencePieceProcessor()
sp_en.load("/kaggle/input/french-english-tokenization/other/100embed/1/en.wiki.bpe.vs10000.model")

sp_fr = spm.SentencePieceProcessor()
sp_fr.load("/kaggle/input/french-english-tokenization/other/100embed/1/fr.wiki.bpe.vs10000.model")

en_vocab = sp_en.get_piece_size()
fr_vocab = sp_fr.get_piece_size()

padding_token_id_en = sp_en.pad_id()
print("Padding token ID for english:", padding_token_id_en)

padding_token_id_fr = sp_fr.pad_id()
print("Padding token ID for frencb:", padding_token_id_fr)


print("EN Vocab Size: ", en_vocab)
print("FR Vocab Size: ", fr_vocab)


Padding token ID for english: -1
Padding token ID for frencb: -1
EN Vocab Size:  10000
FR Vocab Size:  10000


In [6]:
english_data = [sp_en.encode_as_ids(seq) for seq in english_all]
french_data = [sp_fr.encode_as_ids(seq) for seq in french_all]

english_data = [[sp_en.piece_to_id("<s>")] + seq + [sp_en.piece_to_id("</s>")] for seq in english_data]
french_data = [[sp_fr.piece_to_id("<s>")] + seq + [sp_fr.piece_to_id("</s>")] for seq in french_data]

print(english_data[0:5])
print(french_data[0:5])

[[1, 40, 9916, 9935, 2], [1, 888, 9960, 2], [1, 888, 9960, 2], [1, 305, 9967, 2], [1, 15, 90, 9960, 2]]
[[1, 1021, 138, 9977, 2], [1, 926, 9977, 2], [1, 844, 6607, 9977, 2], [1, 135, 2340, 2], [1, 9558, 494, 9977, 2]]


In [7]:
# Single Attention Head
class SingleHeadedAttention(nn.Module):
    
    def __init__(self, emb_size, head_size, context_size:int, dropout=0.2):
        super().__init__()
        self.key = nn.Linear(emb_size, head_size, bias=False)
        self.query = nn.Linear(emb_size, head_size, bias=False)
        self.value = nn.Linear(emb_size, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, masked=True, cross=None):
        B,T, C = x.shape
        
        if cross is None:
            k = self.key(x)   # Size (B, T, head_size)
            v = self.value(x) # same thing..
        else:
            k = cross.detach().clone()
            v = cross.detach().clone()
            
        
        q = self.query(x) # same thing..
        
        weight = q @ k.transpose(-2, -1) * (k.shape[-1]**-0.5) # this equation is defined in the original paper and the multiplication part is normalization over each Time Serie in the batch.
        if(masked): weight = weight.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T) # This will mask the upper triangle of zeros and turn it into -inf for the softmax func
        weight = F.softmax(weight, dim=-1)
        weight = self.dropout(weight)

        v = self.value(x)
        
        out = weight @ v
        
        return out

In [8]:
class MultiHeadedAttention(nn.Module):
    
    def __init__(self, embed_size, num_heads, head_size, context_size, dropout=0.2):
        super().__init__()
        self.heads = nn.ModuleList([SingleHeadedAttention(embed_size,head_size, context_size, dropout) for _ in range(num_heads)])
        self.linear = nn.Linear(head_size*num_heads, embed_size)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, cross=None):
        if cross is not None:
            B, T, _ = cross.shape
            cross = cross.view(B, T, self.num_heads, self.head_size).transpose(1, 2)
        
        out = torch.cat([h(x, masked, cross[:, i] if cross is not None else None) for i, h in enumerate(self.heads)], dim=-1)

        out = self.linear(out)
        out = self.dropout(out)
        return out

In [9]:
class FeedForward(nn.Module):
    
    def __init__(self, emb_size, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
        nn.Linear(emb_size, emb_size*4),
        nn.ReLU(),
        nn.Linear(emb_size*4, emb_size),
        nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.net(x)

In [10]:
class EncoderTransformerBlock(nn.Module):
    
    def __init__(self, embed_size, num_heads, context_size, dropout=0.2):
        super().__init__()
        head_size = embed_size // num_heads
        self.norm1 = nn.LayerNorm(embed_size)
        self.attention = MultiHeadedAttention(embed_size, num_heads, head_size, context_size, dropout)
        self.norm2 = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, dropout)
        
    def forward(self, x):
        
        x_norm1 = self.norm1(x)
        x = x + self.attention(x_norm1, masked=True)
        x_norm2 = self.norm2(x)
        x = x + self.ff(x_norm2)
        
        return x
        

In [11]:
class DecoderTransformerBlock(nn.Module):
    
    def __init__(self,embed_size, num_heads, context_size, dropout=0.2):
        super().__init__()
        
        head_size = embed_size // num_heads
        self.norm1 = nn.LayerNorm(embed_size)
        self.self_attention = MultiHeadedAttention(embed_size, num_heads, head_size, context_size, dropout)
        self.norm2 = nn.LayerNorm(embed_size)
        self.cross_attention = MultiHeadedAttention(embed_size, num_heads, head_size, context_size, dropout)
        self.norm3 = nn.LayerNorm(embed_size)
        self.ff = FeedForward(embed_size, dropout)
        
    def forward(self, decoder_input, encoder_output):
        
        dec_norm1 = self.norm1(decoder_input)
        self_atten_out = self.self_attention(self.dec_norm1)
        dec_out_1 = decoder_input + self_atten_out
        
        dec_norm_2 = self.norm2(dec_out_1)
        cross_atten_out = self.cross_attention(dec_norm_2, cross=encoder_output)
        dec_out_2 = dec_out_1 + cross_atten_out
        
        dec_out = dec_out_2 + self.ff(dec_out_2)
        
        return dec_out


In [12]:
class Encoder(nn.Module):
    
    def __init__(self,transformer_count , vocab_size, embed_size, num_heads, context_size, dropout=0.2):
        super().__init__()
        
        self.token_embed = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = nn.Embedding(context_size, embed_size)
        self.transformers = nn.Sequential(*[EncoderTransformerBlock(embed_size, num_heads, context_size, dropout=0.2) for _ in range(transformer_count)])
        
    def forward(self, x):
        B, T = x.shape
        
        tk_emb = self.token_embed(x)
        pos_emb = self.pos_embed(x)
        
        x = tk_emb + pos_emb
        x = self.transformers(x)
        
        return x

In [13]:
class Decoder(nn.Module):
    
    def __init__(self, transformer_count, vocab_size, embed_size, num_heads, context_size, dropout=0.2):
        
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = nn.Embedding(context_size, embed_size)
        self.transformers = nn.Sequential(*[DecoderTransformerBlock(embed_size, num_heads, context_size, dropout)])
    
    def forward(self, decoder_input, encoder_output):
        
        B, T = x.shape
        
        tk_emb = self.token_embed(decoder_input)
        pos_emb = self.pos_embed(decoder_input)
        
        x = tk_emb + pos_emb
        
        x = self.transformers(x, encoder_output)
        
        return x
        

In [14]:
class FrenchEnglishTransformer(nn.Module):
    
    def __init__(self, transformer_count , vocab_size_en, vocab_size_fr, embed_size, num_heads, context_size, dropout=0.2):
        super().__init__()
        self.encoder = Encoder(transformer_count, vocab_size_fr, embed_size, num_heads, context_size, dropout)
        self.decoder = Decoder(transformer_count, vocab_size_en, embed_size, num_heads, context_size, dropout)
        self.norm_final = nn.LayerNorm(embed_size)
        self.linear_final = nn.Linear(embed_size, vocab_size_en)
        
    def forward(self, french, english, target_english=None):
        
        encoder_out = self.encoder(french)
        decoder_out = self.decoder(english, encoder_out)
        
        out = self.norm_final(decoder_out)
        out = self.linear_final(out)
        
        return out
        
        

In [15]:
model = FrenchEnglishTransformer(TRANSFORMER_COUNT, en_vocab, fr_vocab, EMBEDDING_SIZE, NUM_HEADS, CONTEXT_SIZE, DROPOUT)
m = model.to(DEVICE)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

5.366928 M parameters
