In [37]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


/kaggle/input/en-fr-translation-dataset/en-fr.csv


In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import spacy
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
import time

In [39]:
spacy_eng = spacy.load("en_core_web_sm")
# hyper parameters
device=('cuda' if torch.cuda.is_available() else 'cpu')

In [40]:
class vocabulary():
    def __init__(self,freq_threshold):
        self.itos={0:'PAD',1:'<SOS>',2:'<EOS>',3:'<UNK>'}
        self.stois={'PAD':0,'<SOS>':1,'<EOS>':2,'<UNK>':3}
        self.freq_threshold=freq_threshold
        
    def __len__(self):
        return len(self.stois)
    
    @staticmethod
    def tokenised_text(text):
        return [token.text.lower() for token in spacy_eng.tokenizer(str(text))]
    
    def build_vocabulary(self,sentence_list):
        frequencies={}
        idx=4
        
        for sentence in sentence_list:
            for word in self.tokenised_text(sentence):
                if word not in frequencies:
                    frequencies[word]=1
                    
                else:
                    frequencies[word]+=1
                    
                if frequencies[word]==self.freq_threshold:
                    self.stois[word]=idx
                    self.itos[idx]=word
                    idx+=1
                    
    def numericalise(self,text):
        token_text=self.tokenised_text(text)
        
        return [self.stois[word] if word in self.stois else self.stois['<UNK>'] for word in token_text]
                    


In [41]:

class custom_data_loader(Dataset):
    def __init__(self,file,freq_threshold):
        super().__init__()
        
        self.df=pd.read_csv(file,nrows=100000)
        self.freq_threshold=freq_threshold
#         get target and source sequences
        self.source_seq=self.df.iloc[:,0]
        self.target_seq=self.df.iloc[:,1]
        
        self.vocab_source=vocabulary(freq_threshold=self.freq_threshold)
        self.vocab_target=vocabulary(freq_threshold=self.freq_threshold)
        
        self.vocab_source.build_vocabulary(self.source_seq.tolist())
        self.vocab_target.build_vocabulary(self.target_seq.tolist())
        self.len_vocab_for_model=max(len(self.vocab_source.itos),len(self.vocab_target.itos))
        

        
    def __len__(self):
        return len(self.source_seq)
    
    def __getitem__(self,index):
        source=self.source_seq[index]
        target=self.target_seq[index]
        
        
        numericalized_source = [self.vocab_source.stois["<SOS>"]]
        numericalized_source += self.vocab_source.numericalise(source)
        numericalized_source.append(self.vocab_source.stois["<EOS>"])
        
        numericalized_caption = [self.vocab_target.stois["<SOS>"]]
        numericalized_caption += self.vocab_target.numericalise(target)
        numericalized_caption.append(self.vocab_target.stois["<EOS>"])
        
        return numericalized_source,numericalized_caption

        

In [42]:
        
class MyCollate:
    def __init__(self,pad_idx,fixed_length):
        self.pad_idx=pad_idx
        self.fixed_length=fixed_length
        
    def __call__(self,batch):
        inputs=[source[0]  for source in batch]
        targets=[source[1]  for source in batch]
        
        # Pad inputs and targets using TensorFlow pad_sequences
        inputs = pad_sequences(inputs, maxlen=self.fixed_length, padding='post', truncating='post', value=self.pad_idx)
        targets = pad_sequences(targets, maxlen=self.fixed_length, padding='post', truncating='post', value=self.pad_idx)
        
        # Convert to PyTorch tensors
        inputs = torch.tensor(inputs, dtype=torch.long)
        targets = torch.tensor(targets, dtype=torch.long)
        return inputs, targets
    
def get_loader(
    
    file,
    freq_threshold,
    batch_size=16,
#     num_workers=1,
    shuffle=True,
    pin_memory=True,
    
):
    
    dataset=custom_data_loader(file,freq_threshold)
    
    pad_idx = dataset.vocab_source.stois["PAD"]
    fixed_length=32
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=MyCollate(pad_idx=pad_idx,fixed_length=fixed_length),
    )
    return loader,dataset.len_vocab_for_model

In [43]:
# parameters
freq_threshold=25
file="/kaggle/input/en-fr-translation-dataset/en-fr.csv"

loader,len_vocab_for_model = get_loader(file=file,freq_threshold=freq_threshold)

    

In [44]:


class encoder(nn.Module):
    def __init__(self,length_of_vocab):
        super().__init__()
        self.length_of_vocab=length_of_vocab
        self.embedding_layer=nn.Embedding(self.length_of_vocab,3,padding_idx=0)
        self.lstm_layer1=nn.LSTM(3,4,1)
        
    def forward(self,x):
        x=self.embedding_layer(x)
        output,hidden_states =self.lstm_layer1(x)
        return output,hidden_states
    


class decoder(nn.Module):
    def __init__(self,length_of_vocab):
        super().__init__()
        self.length_of_vocab=length_of_vocab
        self.encoder_obj=encoder(length_of_vocab)  
        self.embedding_layer=nn.Embedding(self.length_of_vocab,3,padding_idx=0)
        self.lstm_layer=nn.LSTM(3,4,1)
        self.linear=nn.Linear(4,self.length_of_vocab)
        
    def forward(self,source_seq,target_seq):
        
        output,hidden_states=self.encoder_obj(source_seq)
        embeddings=self.embedding_layer(target_seq)
     
        output_dec,_=self.lstm_layer(embeddings,hidden_states)
        final=self.linear(output_dec)
        return final
    

        
    
    

In [45]:
model=decoder(len_vocab_for_model).to(device)
lr=0.05
epochs=5
optimiser=optim.Adam(model.parameters(),lr=lr)
criteria=nn.CrossEntropyLoss()

In [46]:
losses_list=[]
model.train()
for epoch in range(epochs):
    start = time.time()
    for source_seq,target_seq in loader:
               
        source_seq=source_seq.to(device)
        target_seq=target_seq.to(device)
        
        output=model.forward(source_seq,target_seq)
        
        # Reshape logits to (batch_size * sequence_length, num_classes)
        logits = output.view(-1, len_vocab_for_model)  # shape: (30, 5)

        # Reshape labels to (batch_size * sequence_length)
        labels = target_seq.view(-1)  # shape: (30,)

        loss=criteria(logits,labels)
        losses_list.append(loss.item())

        optimiser.zero_grad()    
        loss.backward()
        optimiser.step()
    
    
    mean_loss = sum(losses_list) / len(losses_list)
    end = time.time()
    print(f'Loss after {epoch} is {mean_loss} and it took {end-start} seconds')    

  self.pid = os.fork()
  self.pid = os.fork()


Loss after 0 is 1.9860015284347534 and it took 39.27575397491455 seconds
Loss after 1 is 1.6469195326042176 and it took 39.05873894691467 seconds
Loss after 2 is 1.4575135706710816 and it took 39.03929543495178 seconds
Loss after 3 is 1.3296277607572078 and it took 38.998775482177734 seconds
Loss after 4 is 1.2363820798521041 and it took 39.031928062438965 seconds
