In [38]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [39]:
# Toy parallel example: one English sentence and its French counterpart,
# already split into whitespace-separated tokens.
sentence_en, sentence_fr = "I love AI .", "J' adore l'IA ."

# Token -> integer id lookups; each id is the token's position in the list,
# so "<pad>" gets id 0 in both vocabularies.
word_map_en = {token: idx for idx, token in enumerate(["<pad>", "I", "love", "AI", "."])}
word_map_fr = {token: idx for idx, token in enumerate(["<pad>", "J'", "adore", "l'IA", "."])}


In [40]:
#tokenization
def tokenize(text, word_map):
    """Convert a whitespace-separated sentence into a 1-D tensor of token ids.

    Args:
        text: Sentence whose tokens are separated by whitespace.
        word_map: Mapping from token string to integer id.

    Returns:
        A 1-D ``torch.long`` tensor with one id per token, in order. An empty
        or whitespace-only ``text`` yields an empty ``torch.long`` tensor.

    Raises:
        KeyError: If a token is not present in ``word_map``.
    """
    # dtype is pinned explicitly: torch.tensor([]) would otherwise default to
    # float32, which nn.Embedding rejects as an index tensor.
    return torch.tensor([word_map[word] for word in text.split()], dtype=torch.long)

# Encode the example sentence pair with its language-specific vocabulary;
# the trailing tuple is the cell's displayed value.
input_tensor = tokenize(text=sentence_en, word_map=word_map_en)
target_tensor = tokenize(text=sentence_fr, word_map=word_map_fr)
(input_tensor, target_tensor)

(tensor([1, 2, 3, 4]), tensor([1, 2, 3, 4]))

In [53]:
#positional encoding
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once: even feature
    indices hold ``sin(pos * freq)`` and odd feature indices hold
    ``cos(pos * freq)``, with ``freq = exp(-ln(10000) * i / d_model)`` for
    ``i = 0, 2, 4, ...``.

    Args:
        d_model: Size of the feature dimension of the inputs.
        max_len: Number of positions to precompute.
    """
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(torch.log(torch.tensor(10000.0)) / d_model))
        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so
        # the frequency vector is trimmed to the number of odd columns.
        encoding[:, 1::2] = torch.cos(position * div_term[:d_model // 2])
        # Registered as a buffer so the table follows the module across
        # .to()/.cuda() calls; persistent=False keeps it out of state_dict so
        # checkpoints keep the same set of keys as before.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        ``x`` is indexed along dimension 1 for the sequence length and is
        broadcast against a ``(1, seq_len, d_model)`` slice of the table.
        """
        return x + self.encoding[:, :x.size(1)]

In [112]:
#multi-head attention mechanism
class MHA(nn.Module):
    """Multi-head scaled dot-product attention.

    Inputs are linearly projected to queries, keys and values, split into
    ``num_heads`` heads of width ``d_model // num_heads``, attended per head
    with a softmax over the key positions, then merged and passed through a
    final linear layer.

    Args:
        d_model: Size of the feature (last) dimension of inputs and outputs.
        num_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """
    def __init__(self, d_model, num_heads):
        super(MHA, self).__init__()
        # Fail at construction time instead of with an opaque view() error
        # on the first forward pass.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.d_model = d_model
        self.d_k = d_model // num_heads
        self.d_v = d_model // num_heads
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None, context=None):
        """Attend from ``x`` to ``context`` (or to ``x`` itself).

        Args:
            x: Tensor of shape ``(batch, query_len, d_model)``; source of the
                queries.
            mask: Optional tensor broadcastable to
                ``(batch, num_heads, query_len, key_len)``. Positions where
                ``mask == 0`` are excluded from attention. A query row whose
                keys are all masked yields NaN, because softmax is taken over
                a row of ``-inf``.
            context: Optional tensor of shape ``(batch, key_len, d_model)``
                that supplies keys and values. Defaults to ``x``, which gives
                plain self-attention.

        Returns:
            Tensor of shape ``(batch, query_len, d_model)``.
        """
        batch_size, q_len, _ = x.size()
        source = x if context is None else context
        kv_len = source.size(1)
        q = self.query(x).view(batch_size, q_len, self.num_heads, self.d_k).transpose(1, 2)
        k = self.key(source).view(batch_size, kv_len, self.num_heads, self.d_k).transpose(1, 2)
        v = self.value(source).view(batch_size, kv_len, self.num_heads, self.d_v).transpose(1, 2)
        # Plain Python float scale: avoids allocating a tensor on every call.
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_k ** 0.5)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)
        # (batch, heads, q_len, d_v) -> (batch, q_len, d_model)
        attn_output = torch.matmul(attn_weights, v).transpose(1, 2).contiguous().view(batch_size, q_len, self.d_model)
        return self.fc(attn_output)

In [113]:
#feed forward
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: Linear -> ReLU -> Linear.

    The first layer maps ``d_model`` features to ``d_ff`` features and the
    second maps them back to ``d_model``.

    Args:
        d_model: Size of the last dimension of the input and of the output.
        d_ff: Width of the hidden layer.
    """

    def __init__(self, d_model, d_ff=512):
        super().__init__()
        self.fc1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.fc2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.fc2(activated)
        

In [114]:
#encoder layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer is wrapped in a residual connection followed by layer
    normalisation (normalisation is applied after the residual sum).

    Args:
        d_model: Feature size of inputs and outputs.
        num_heads: Number of attention heads passed to ``MHA``.
        d_ff: Hidden width passed to ``FeedForward``.
    """

    def __init__(self, d_model, num_heads, d_ff=512):
        super().__init__()
        self.mha = MHA(d_model, num_heads)
        self.feedforward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is handed to the attention module unchanged."""
        attended = self.norm1(x + self.mha(x, mask))
        return self.norm2(attended + self.feedforward(attended))

In [115]:
#decoder layer
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is wrapped in a residual connection followed
    by its own layer normalisation (``norm1``, ``norm2``, ``norm3``).

    Args:
        d_model: Feature size of inputs and outputs.
        num_heads: Number of attention heads passed to ``MHA``.
        d_ff: Hidden width passed to ``FeedForward``.
    """
    def __init__(self, d_model, num_heads, d_ff=512):
        super(DecoderLayer, self).__init__()
        self.mha1 = MHA(d_model, num_heads)  # self-attention over the target
        self.mha2 = MHA(d_model, num_heads)  # cross-attention over the encoder output
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.feedforward = FeedForward(d_model, d_ff)

    def _cross_attention(self, x, memory, mask=None):
        """Attend from ``x`` (queries) to ``memory`` (keys/values) with ``mha2``'s weights.

        The attention is computed directly from ``mha2``'s projection layers
        so that queries can come from the decoder state while keys and values
        come from the encoder output, which may have a different length.
        """
        attn = self.mha2
        batch_size, tgt_len, _ = x.size()
        src_len = memory.size(1)
        q = attn.query(x).view(batch_size, tgt_len, attn.num_heads, attn.d_k).transpose(1, 2)
        k = attn.key(memory).view(batch_size, src_len, attn.num_heads, attn.d_k).transpose(1, 2)
        v = attn.value(memory).view(batch_size, src_len, attn.num_heads, attn.d_v).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / (attn.d_k ** 0.5)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        # (batch, heads, tgt_len, d_v) -> (batch, tgt_len, d_model)
        merged = torch.matmul(weights, v).transpose(1, 2).contiguous().view(batch_size, tgt_len, attn.d_model)
        return attn.fc(merged)

    def forward(self, x, encoder_output, tgt_mask=None, src_mask=None):
        """Run the block.

        Args:
            x: Decoder input of shape ``(batch, tgt_len, d_model)``.
            encoder_output: Encoder result of shape ``(batch, src_len, d_model)``.
            tgt_mask: Optional mask for self-attention, broadcastable to
                ``(batch, heads, tgt_len, tgt_len)``; zeros are masked out.
            src_mask: Optional mask for cross-attention, broadcastable to
                ``(batch, heads, tgt_len, src_len)``; zeros are masked out.

        Returns:
            Tensor of shape ``(batch, tgt_len, d_model)``.
        """
        # Self-attention must use the target mask so a causal mask actually
        # hides later positions from earlier ones.
        attn_output1 = self.mha1(x, mask=tgt_mask)
        x = self.norm1(x + attn_output1)
        # Cross-attention: queries from the decoder, keys/values from the encoder.
        attn_output2 = self._cross_attention(x, encoder_output, mask=src_mask)
        x = self.norm2(x + attn_output2)
        ff_outputs = self.feedforward(x)
        x = self.norm3(x + ff_outputs)
        return x

In [116]:
class Transformer(nn.Module):
    """Encoder-decoder model built from ``EncoderLayer`` and ``DecoderLayer`` stacks.

    Source and target token ids go through the same embedding table and the
    same positional encoding; the decoder output is projected to
    ``vocab_size`` scores per position.

    NOTE(review): because one embedding table and one output layer serve both
    sides, ``vocab_size`` has to cover the ids of both the source and the
    target vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores per position.
        d_model: Feature size used throughout the model.
        num_heads: Attention heads per layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_decoder_layers: Number of stacked decoder layers.
        max_len: Number of positions precomputed by the positional encoding.
        d_ff: Hidden width of the feed-forward sub-layers.
    """
    def __init__(self, vocab_size, d_model, num_heads, num_encoder_layers, num_decoder_layers, max_len=5000, d_ff=512):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.encoding = PositionalEncoding(d_model, max_len)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_decoder_layers)])
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, tgt_mask=None, src_mask=None):
        """Compute output scores for every target position.

        Args:
            src: Source token ids, indexed as ``(batch, src_len)``.
            tgt: Target (decoder input) token ids, indexed as ``(batch, tgt_len)``.
            tgt_mask: Optional mask forwarded to every decoder layer as
                ``tgt_mask``.
            src_mask: Optional mask forwarded to every encoder layer as its
                attention mask and to every decoder layer as ``src_mask``.
                Leaving it as ``None`` applies no source masking.

        Returns:
            Tensor of shape ``(batch, tgt_len, vocab_size)`` with unnormalised
            scores.
        """
        src = self.encoding(self.embedding(src))
        tgt = self.encoding(self.embedding(tgt))

        encoder_output = src
        for layer in self.encoder_layers:
            encoder_output = layer(encoder_output, mask=src_mask)

        decoder_output = tgt
        for layer in self.decoder_layers:
            decoder_output = layer(decoder_output, encoder_output, tgt_mask=tgt_mask, src_mask=src_mask)

        output = self.fc(decoder_output)
        return output


In [117]:
def translate(input_sentence, word_map_en, word_map_fr, transformer, max_len=None):
    """Greedily decode a translation of ``input_sentence`` with ``transformer``.

    The decoder starts from a single ``<pad>`` token (the vocabularies have no
    dedicated start symbol) and is extended one token at a time: at every step
    the model is run on everything generated so far under a lower-triangular
    (causal) mask, and the highest-scoring token for the last position is
    appended.

    Args:
        input_sentence: Whitespace-separated source sentence.
        word_map_en: Source token -> id mapping used to encode the input.
        word_map_fr: Target token -> id mapping used to decode the output.
        transformer: Callable invoked as ``transformer(src, tgt, tgt_mask)``
            that returns scores indexed as ``(batch, tgt_len, vocab)``.
        max_len: Number of tokens to generate. Defaults to the number of
            source tokens.

    Returns:
        The generated target tokens joined by single spaces, with ``<pad>``
        tokens omitted. An empty input yields an empty string.

    Raises:
        KeyError: If a source token is missing from ``word_map_en`` or a
            predicted id has no entry in ``word_map_fr``.
    """
    input_tensor = tokenize(input_sentence, word_map_en).long().unsqueeze(0)
    src_len = input_tensor.size(1)
    if src_len == 0:
        return ""
    if max_len is None:
        max_len = src_len

    pad_id = word_map_fr.get("<pad>", 0)
    reverse_word_map_fr = {v: k for k, v in word_map_fr.items()}

    # Decoder input so far; position 0 is the start token and is dropped
    # from the final result.
    generated = torch.full((1, 1), pad_id, dtype=torch.long)
    with torch.no_grad():  # inference only: no autograd graph needed
        for _ in range(max_len):
            cur_len = generated.size(1)
            tgt_mask = torch.tril(torch.ones((cur_len, cur_len))).unsqueeze(0).unsqueeze(0)
            output = transformer(input_tensor, generated, tgt_mask)
            # argmax over raw scores; softmax would not change the winner.
            next_token = output[:, -1].argmax(dim=-1, keepdim=True)
            generated = torch.cat([generated, next_token], dim=1)

    predicted_ids = generated[0, 1:].tolist()
    translated_sentence = [reverse_word_map_fr[idx] for idx in predicted_ids if idx != pad_id]
    return " ".join(translated_sentence)

In [123]:
# Seed the RNG so the randomly initialised (untrained) model, and therefore
# the demo output below, is reproducible across kernel restarts.
SEED = 0
torch.manual_seed(SEED)

vocab_size_en = len(word_map_en)
vocab_size_fr = len(word_map_fr)
d_model = 128
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6

# Transformer embeds source and target ids with one table and emits
# vocab_size scores per position, so it is sized to the larger vocabulary.
transformer = Transformer(max(vocab_size_en, vocab_size_fr), d_model, num_heads, num_encoder_layers, num_decoder_layers)

# The model has not been trained, so the decoded sentence is not a meaningful translation.
outputs = translate(sentence_en, word_map_en, word_map_fr, transformer)
outputs

'adore adore adore adore'