In [3]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm 
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.optim as optim

# DATASET LOADING

In [2]:
!wget https://object.pouta.csc.fi/OPUS-UNPC/v1.0/moses/en-fr.txt.zip

--2024-09-18 04:39:23--  https://object.pouta.csc.fi/OPUS-UNPC/v1.0/moses/en-fr.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2657208676 (2.5G) [application/zip]
Saving to: ‘en-fr.txt.zip.1’

en-fr.txt.zip.1       1%[                    ]  30.81M  7.87MB/s    eta 6m 58s ^C


In [22]:
!unzip en-fr.txt.zip

Archive:  en-fr.txt.zip
  inflating: README                  
  inflating: LICENSE                 
  inflating: UNPC.en-fr.en           
  inflating: UNPC.en-fr.fr           
  inflating: UNPC.en-fr.xml          


In [3]:
# Read the English side of the corpus into a list with one entry per line;
# iterating the file keeps each line's trailing newline.
with open('UNPC.en-fr.en', mode='r', encoding='utf-8') as file:
    english = list(file)

len(english)

30340652

In [4]:
# Read the French side of the corpus into a list with one entry per line;
# iterating the file keeps each line's trailing newline.
with open('UNPC.en-fr.fr', mode='r', encoding='utf-8') as file:
    french = list(file)

len(french)

30340652

In [10]:
# Show the first ten French lines (each entry still ends with '\n').
french[:10]

['NATIONS\n',
 'E\n',
 'Conseil Économique\n',
 'Distr.\n',
 'GÉNÉRALE\n',
 '2 février 1999\n',
 'Original : FRANÇAIS\n',
 'COMMISSION ÉCONOMIQUE POUR L &apos; EUROPE\n',
 'COMITÉ DES TRANSPORTS INTÉRIEURS\n',
 'Groupe de travail de la construction des véhicules\n']

In [14]:
# Two parallel lists, not sentence pairs: combined[0] holds the first 5000
# English lines and combined[1] the first 5000 French lines.
combined = [english[:5000], french[:5000]]


In [4]:
# Show the first ten English lines (each entry still ends with '\n').
english[:10]

['UNITED NATIONS\n',
 'E\n',
 'Economic and Social Council\n',
 'Distr.\n',
 'GENERAL\n',
 '2 February 1999\n',
 'ENGLISH Original: FRENCH\n',
 'ECONOMIC COMMISSION FOR EUROPE\n',
 'INLAND TRANSPORT COMMITTEE\n',
 'Working Party on the Construction of Vehicles\n']

In [5]:
# Show the first ten French lines (each entry still ends with '\n').
french[:10]

['NATIONS\n',
 'E\n',
 'Conseil Économique\n',
 'Distr.\n',
 'GÉNÉRALE\n',
 '2 février 1999\n',
 'Original : FRANÇAIS\n',
 'COMMISSION ÉCONOMIQUE POUR L &apos; EUROPE\n',
 'COMITÉ DES TRANSPORTS INTÉRIEURS\n',
 'Groupe de travail de la construction des véhicules\n']

# Data Processing

In [7]:
pip install sentencepiece


Note: you may need to restart the kernel to use updated packages.


In [8]:
# Train (once) and load a SentencePiece tokenizer shared by both languages,
# then encode the first N_SENTENCES lines of each side into token ids.
#
# Imported here rather than in the top import cell because the package is
# installed by the cell just above.
from pathlib import Path

import sentencepiece as spm

N_SENTENCES = 5000          # number of parallel lines used for this experiment
SPM_MODEL_PREFIX = 'unpc_en_fr'
SPM_VOCAB_SIZE = 8000

# Lines come from iterating the files, so they still carry a trailing newline.
source_sentences = [line.strip() for line in english[:N_SENTENCES]]
target_sentences = [line.strip() for line in french[:N_SENTENCES]]

spm_model_path = Path(f'{SPM_MODEL_PREFIX}.model')
if not spm_model_path.exists():
    spm.SentencePieceTrainer.train(
        sentence_iterator=iter(source_sentences + target_sentences),
        model_prefix=SPM_MODEL_PREFIX,
        vocab_size=SPM_VOCAB_SIZE,
        # Treat vocab_size as a soft limit so a small sample cannot make training fail.
        hard_vocab_limit=False,
    )

# A SentencePieceProcessor can only encode after a model has been loaded.
sp = spm.SentencePieceProcessor()
sp.load(str(spm_model_path))

# Tokenize the source and target sentences
source_tokens = sp.encode(source_sentences, out_type=int)
target_tokens = sp.encode(target_sentences, out_type=int)


# Embedding

In [4]:
class Embeddings(nn.Module):
    """Project token feature vectors to the latent width and add sinusoidal positions.

    Args:
        latent_size: width of the embedding produced for every token.
        input_size: width of each incoming token vector (last dim of ``tokens``).
    """

    def __init__(self, latent_size, input_size ):
        super(Embeddings, self).__init__()
        self.project=nn.Linear(input_size, latent_size)
        self.l_size=latent_size

    def _positional_encoding(self, n, device, dtype):
        """Build the ``(n, latent_size)`` sinusoidal position table.

        Even feature columns hold ``sin(pos / 10000**(2i/d))`` and odd columns
        hold the matching ``cos``; the table is created on ``device``.
        """
        positions = torch.arange(n, device=device, dtype=torch.float32).unsqueeze(1)
        even_idx = torch.arange(0, self.l_size, 2, device=device, dtype=torch.float32)
        inv_freq = 10000.0 ** (-even_idx / self.l_size)
        angles = positions * inv_freq  # (n, ceil(latent_size / 2))

        pos_enc = torch.zeros(n, self.l_size, device=device, dtype=torch.float32)
        pos_enc[:, 0::2] = torch.sin(angles)
        # With an odd latent_size there is one fewer odd column than even ones.
        pos_enc[:, 1::2] = torch.cos(angles[:, : self.l_size // 2])
        return pos_enc.to(dtype)

    def forward(self, tokens):
        """Embed ``tokens`` of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, latent_size)``.
        """
        _, n, _ = tokens.shape
        token_embeddings = self.project(tokens)
        pos_enc = self._positional_encoding(
            n, token_embeddings.device, token_embeddings.dtype
        )
        # Broadcasts over the batch dimension; out-of-place to keep autograd happy.
        return token_embeddings + pos_enc
        

In [5]:
class MaskedSelfAttention(nn.Module):
    """Multi-head causal self-attention: each position attends only to itself and earlier ones.

    Args:
        latent_size: width of the incoming and outgoing embeddings.
        dq: total query projection width (split evenly across heads).
        dv: total value projection width (split evenly across heads).
        dk: total key projection width; must equal ``dq``.
        n_heads: number of attention heads.

    Raises:
        ValueError: if ``dq != dk`` or a projection width is not divisible by ``n_heads``.
    """

    def __init__(self, latent_size, dq=200, dv=200, dk=200, n_heads=4):
        super(MaskedSelfAttention, self).__init__()
        if dq != dk:
            raise ValueError("dq and dk must match to form query-key dot products")
        if dk % n_heads != 0 or dv % n_heads != 0:
            raise ValueError("dq/dk and dv must be divisible by n_heads")
        self.l_size=latent_size
        self.dq=dq
        self.dv=dv
        self.dk=dk
        self.heads=n_heads
        self.head_dim=dk//n_heads          # per-head query/key width
        self.value_head_dim=dv//n_heads    # per-head value width
        self.Wq=nn.Linear(self.l_size, self.dq)
        self.Wv=nn.Linear(self.l_size, self.dv)
        self.Wk=nn.Linear(self.l_size, self.dk)
        # Maps the concatenated heads back to latent_size so the result can be
        # added to the residual stream whatever dv is.
        self.Wo=nn.Linear(self.dv, self.l_size)
        self.softmax=nn.Softmax(dim=-1)

    def forward(self, embeddings):
        """Apply causal attention to ``embeddings`` of shape ``(batch, seq_len, latent_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, latent_size)``.
        """
        b, n, _ = embeddings.shape
        Q=self.Wq(embeddings).view(b, n, self.heads, self.head_dim).transpose(1,2)
        K=self.Wk(embeddings).view(b, n, self.heads, self.head_dim).transpose(1,2)
        V=self.Wv(embeddings).view(b, n, self.heads, self.value_head_dim).transpose(1,2)

        # Scaled dot-product scores, shape (b, heads, n, n).
        A=torch.matmul(Q, K.transpose(-2, -1)) / self.head_dim**0.5

        # True above the diagonal marks future positions; built on the scores'
        # device. Every row keeps its diagonal, so softmax never sees an all -inf row.
        future=torch.triu(
            torch.ones(n, n, dtype=torch.bool, device=A.device), diagonal=1
        )
        A=A.masked_fill(future, float('-inf'))
        Ap=self.softmax(A)

        att_vector=torch.matmul(Ap, V)
        att_vector=att_vector.transpose(1,2).contiguous().view(b, n, self.dv)
        return self.Wo(att_vector)

In [6]:
class SelfAttention(nn.Module):
    """Multi-head (unmasked) self-attention over a sequence of embeddings.

    Args:
        latent_size: width of the incoming and outgoing embeddings.
        dq: total query projection width (split evenly across heads).
        dk: total key projection width; must equal ``dq``.
        dv: total value projection width (split evenly across heads).
        n_heads: number of attention heads.

    Raises:
        ValueError: if ``dq != dk`` or a projection width is not divisible by ``n_heads``.
    """

    def __init__(self, latent_size, dq=200, dk=200, dv=200, n_heads=4):
        super(SelfAttention, self).__init__()
        if dq != dk:
            raise ValueError("dq and dk must match to form query-key dot products")
        if dk % n_heads != 0 or dv % n_heads != 0:
            raise ValueError("dq/dk and dv must be divisible by n_heads")
        self.l_size=latent_size
        self.dq=dq
        self.dv=dv
        self.dk=dk

        self.heads=n_heads
        self.head_dim=dk//n_heads          # per-head query/key width
        self.value_head_dim=dv//n_heads    # per-head value width
        self.Wq=nn.Linear(self.l_size, self.dq)
        self.Wv=nn.Linear(self.l_size, self.dv)
        self.Wk=nn.Linear(self.l_size, self.dk)
        # Maps the concatenated heads back to latent_size so the result can be
        # added to the residual stream whatever dv is.
        self.Wo=nn.Linear(self.dv, self.l_size)

    def forward(self, embeddings):
        """Attend over ``embeddings`` of shape ``(batch, seq_len, latent_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, latent_size)``.
        """
        # Query, Key and Value, each reshaped to (b, heads, n, per-head width)
        b, n, _ = embeddings.shape
        Q=self.Wq(embeddings).view(b, n, self.heads, self.head_dim).transpose(1,2)
        K=self.Wk(embeddings).view(b, n, self.heads, self.head_dim).transpose(1,2)
        V=self.Wv(embeddings).view(b, n, self.heads, self.value_head_dim).transpose(1,2)

        # Attention score, size=(b, heads, n, n), scaled by the per-head key width
        A=torch.matmul(Q, K.transpose(-2, -1)) / self.head_dim**0.5

        # Attention probability score
        Ap=nn.functional.softmax(A, dim=-1)

        att_vector=torch.matmul(Ap, V)
        att_vector=att_vector.transpose(1,2).contiguous().view(b, n, self.dv)
        return self.Wo(att_vector)
    

In [7]:
class CrossAttention(nn.Module):
    """Multi-head cross-attention: decoder positions query the encoder output.

    Args:
        latent_size: width of the decoder/encoder embeddings and of the output.
        dq: total query projection width (split evenly across heads).
        dv: total value projection width (split evenly across heads).
        dk: total key projection width; must equal ``dq``.
        n_heads: number of attention heads.

    Raises:
        ValueError: if ``dq != dk`` or a projection width is not divisible by ``n_heads``.
    """

    def __init__(self, latent_size, dq=200, dv=200, dk=200, n_heads=4):
        super(CrossAttention, self).__init__()
        if dq != dk:
            raise ValueError("dq and dk must match to form query-key dot products")
        if dk % n_heads != 0 or dv % n_heads != 0:
            raise ValueError("dq/dk and dv must be divisible by n_heads")
        self.l_size=latent_size
        self.dq=dq
        self.dv=dv
        self.dk=dk
        self.heads=n_heads
        self.head_dim=dk//n_heads          # per-head query/key width
        self.value_head_dim=dv//n_heads    # per-head value width
        self.Wq=nn.Linear(self.l_size, self.dq)
        self.Wv=nn.Linear(self.l_size, self.dv)
        self.Wk=nn.Linear(self.l_size, self.dk)
        # Maps the concatenated heads back to latent_size so the result can be
        # added to the residual stream whatever dv is.
        self.Wo=nn.Linear(self.dv, self.l_size)
        self.softmax=nn.Softmax(dim=-1)

    def forward(self, Decoder_embeddnigs, Encoder_output):
        """Attend from decoder positions to encoder positions.

        Args:
            Decoder_embeddnigs: ``(batch, tgt_len, latent_size)`` queries.
            Encoder_output: ``(batch, src_len, latent_size)`` keys/values;
                ``src_len`` may differ from ``tgt_len``.

        Returns:
            Tensor of shape ``(batch, tgt_len, latent_size)``.
        """
        b, n_q, _ = Decoder_embeddnigs.shape
        # Keys and values follow the encoder's sequence length, not the decoder's.
        n_kv = Encoder_output.shape[1]

        # Query, Key and Value
        Q=self.Wq(Decoder_embeddnigs).view(b, n_q, self.heads, self.head_dim).transpose(1,2)
        K=self.Wk(Encoder_output).view(b, n_kv, self.heads, self.head_dim).transpose(1,2)
        V=self.Wv(Encoder_output).view(b, n_kv, self.heads, self.value_head_dim).transpose(1,2)

        # Attention score, size=(b, heads, tgt_len, src_len)
        A=torch.matmul(Q, K.transpose(-2, -1)) / self.head_dim**0.5

        # Attention probability score
        Ap=self.softmax(A)

        att_vector=torch.matmul(Ap, V)
        att_vector=att_vector.transpose(1,2).contiguous().view(b, n_q, self.dv)

        return self.Wo(att_vector)
        

In [8]:
class Encoder(nn.Module):
    """Single transformer encoder layer: embed, self-attend, then feed-forward.

    Each sublayer is wrapped in a residual connection followed by LayerNorm.

    Args:
        input_size: width of each incoming token vector.
        latent_size: model width; must be divisible by the attention module's
            default head count.
    """

    def __init__(self, input_size, latent_size ):
        super(Encoder, self).__init__()
        # LayerNorm requires the normalized width; each sublayer gets its own
        # norm so their scale/shift parameters are not shared.
        self.norm=nn.LayerNorm(latent_size)
        self.norm_ff=nn.LayerNorm(latent_size)
        self.text_embeddings=Embeddings(latent_size, input_size)
        self.feedforward=nn.Sequential(nn.Linear(latent_size, latent_size),
                                       nn.GELU(),
                                       nn.Linear(latent_size, latent_size))
        # Tie the projection widths to latent_size so the attention output can
        # be added to the residual stream for any model width.
        self.attention=SelfAttention(latent_size, dq=latent_size,
                                     dk=latent_size, dv=latent_size)

    def forward(self, data1):
        """Encode ``data1`` of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, latent_size)``.
        """
        embeddings=self.text_embeddings(data1)
        att_vector=self.attention(embeddings)
        a1=self.norm(att_vector+embeddings)
        ff=self.feedforward(a1)
        a2=self.norm_ff(a1+ff)
        return a2
        

In [9]:
class Decoder(nn.Module):
    """Single transformer decoder layer followed by an output distribution.

    The layer applies masked self-attention, cross-attention over the encoder
    output, and a feed-forward block, each with a residual connection and
    LayerNorm, then projects to ``input_size`` and applies softmax.

    Args:
        input_size: width of each incoming token vector and of the output
            distribution.
        latent_size: model width; must be divisible by the attention modules'
            default head count.
    """

    def __init__(self, input_size, latent_size):
        super(Decoder, self).__init__()
        self.text_embeddings=Embeddings(latent_size, input_size)
        # LayerNorm requires the normalized width; each sublayer gets its own
        # norm so their scale/shift parameters are not shared.
        self.norm=nn.LayerNorm(latent_size)
        self.norm_cross=nn.LayerNorm(latent_size)
        self.norm_ff=nn.LayerNorm(latent_size)
        self.feedforward=nn.Sequential(nn.Linear(latent_size, latent_size),
                                       nn.GELU(),
                                       nn.Linear(latent_size, latent_size))
        # Output head: latent vectors back to the token-vector width.
        self.linear=nn.Linear(latent_size, input_size)
        # dim is fixed at construction; nn.Softmax.forward takes no dim argument.
        self.softmax=nn.Softmax(dim=-1)
        # Tie the projection widths to latent_size so attention outputs can be
        # added to the residual stream for any model width.
        self.cross=CrossAttention(latent_size, dq=latent_size,
                                  dk=latent_size, dv=latent_size)
        self.masked=MaskedSelfAttention(latent_size, dq=latent_size,
                                        dk=latent_size, dv=latent_size)

    def forward(self, data2, encoder_output):
        """Decode ``data2`` conditioned on ``encoder_output``.

        Args:
            data2: ``(batch, tgt_len, input_size)`` target-side token vectors.
            encoder_output: ``(batch, src_len, latent_size)`` encoder states.

        Returns:
            ``(batch, tgt_len, input_size)`` probabilities (softmax over the last
            dim). These are not logits, so do not pass them to a loss such as
            ``nn.CrossEntropyLoss`` that applies log-softmax itself.
        """
        embeddings=self.text_embeddings(data2)
        att_vector=self.masked(embeddings)
        a1=self.norm(att_vector+embeddings)
        cross_attn=self.cross(a1, encoder_output)
        a2=self.norm_cross(a1+cross_attn)
        ff=self.feedforward(a2)
        a3=self.norm_ff(a2+ff)
        output=self.softmax(self.linear(a3))

        return output

In [11]:
class Transformers(nn.Module):
    """Encoder-decoder wrapper: encodes the source and decodes the target against it.

    Args:
        input_size: width of each token vector, used for both encoder and decoder.
        latent_size: model width shared by encoder and decoder.
    """

    def __init__(self,input_size, latent_size ):
        super(Transformers, self).__init__()
        self.encoder=Encoder(input_size, latent_size)
        self.decoder=Decoder(input_size, latent_size)

    def forward(self, data1, data2):
        """Run the encoder on ``data1`` and the decoder on ``data2``.

        Args:
            data1: source-side input passed to the encoder.
            data2: target-side input passed to the decoder.

        Returns:
            Whatever the decoder returns for ``data2`` given the encoder output.
        """
        # Call the modules rather than .forward so nn.Module hooks still run.
        encoder_output=self.encoder(data1)
        decoder_output=self.decoder(data2, encoder_output)

        return decoder_output