In [None]:
# Name:     Muneel Haider , Muhammad Abdullah
# Roll No.:   21I-0640    ,     21I-0643
# Section:                D
# NLP-Project

import random
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# file paths for the provided datasets.
urduArabDev = r"D:\Work\Academic Work\Academic_Work\NLP\Project\urd_Arab.dev"
engDev = r"D:\Work\Academic Work\Academic_Work\NLP\Project\eng_Latn.dev"
urduArabDevTest = r"D:\Work\Academic Work\Academic_Work\NLP\Project\urd_Arab.devtest"
engDevTest = r"D:\Work\Academic Work\Academic_Work\NLP\Project\eng_Latn.devtest"

# Preprocessing data
with open(urduArabDev, encoding='utf-8') as uDev, \
     open(engDev, encoding='utf-8') as eDev, \
     open(urduArabDevTest, encoding='utf-8') as uDevTest, \
     open(engDevTest, encoding='utf-8') as eDevTest:

    # Strip every line: readlines() would keep the trailing "\n", which later
    # leaks into any file produced by "\n".join(...) over these sentences.
    uSentence = [line.strip() for line in uDev] + [line.strip() for line in uDevTest]  # Urdu sentences from data
    eSentence = [line.strip() for line in eDev] + [line.strip() for line in eDevTest]  # English sentences from data

# The two corpora are aligned line by line; zip() would silently drop the
# surplus lines of the longer one, so fail loudly on a mismatch instead.
if len(uSentence) != len(eSentence):
    raise ValueError(
        f"Parallel corpora differ in length: {len(uSentence)} Urdu vs {len(eSentence)} English sentences"
    )

# Shuffling both english and urdu sentences in pairs.
data = list(zip(uSentence, eSentence))
# A dedicated seeded generator keeps the split reproducible (the splits below
# are seeded too) without touching the global random state.
random.Random(42).shuffle(data)

# Splitting the data according to the given constraints (70% train, 15% validation, 15% test).
trainData, tempData = train_test_split(data, test_size=0.3, random_state=42)
valData, testData = train_test_split(tempData, test_size=0.5, random_state=42)

# separating data after splitting
uTrain, eTrain = zip(*trainData)  # separating data for training (both urdu and english)
uVal, eVal = zip(*valData)  # for validation
uTest, eTest = zip(*testData)  # for testing

# Vocabulary class for text data given in dataset
class Vocabulary:
    """Two-way mapping between whitespace-separated words and integer ids.

    Ids 0-3 are reserved for the <PAD>, <SOS>, <EOS> and <UNK> markers; every
    new word receives the next free id in the order it is first added.
    """

    def __init__(self):
        # reserved markers first, then the reverse table derived from them
        self.wordIndex = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.indexWord = {index: token for token, index in self.wordIndex.items()}
        self.wCount = len(self.wordIndex)  # next id to hand out

    def sentenceAdd(self, sentence):
        """Register every whitespace-separated word of `sentence`."""
        for token in sentence.split():
            self.wordAdd(token)

    def wordAdd(self, word):
        """Assign the next free id to `word` unless it is already known."""
        if word in self.wordIndex:
            return
        newIndex = self.wCount
        self.wordIndex[word] = newIndex
        self.indexWord[newIndex] = word
        self.wCount = newIndex + 1

    def encode(self, sentence):
        """Return the id of every word in `sentence`; unknown words map to <UNK>."""
        unknown = self.wordIndex["<UNK>"]
        ids = []
        for token in sentence.split():
            ids.append(self.wordIndex.get(token, unknown))
        return ids

    def decode(self, indices):
        """Join the words for `indices` with spaces, skipping ids 0, 1 and 2
        (<PAD>, <SOS>, <EOS>)."""
        words = []
        for idx in indices:
            if idx in (0, 1, 2):
                continue
            words.append(self.indexWord[idx])
        return " ".join(words)

# Initialize vocabularies (urdu first, then english)
uVocab, eVocab = Vocabulary(), Vocabulary()

# Fill each vocabulary from its own training sentences only
for corpus, vocab in ((uTrain, uVocab), (eTrain, eVocab)):
    for sentence in corpus:
        vocab.sentenceAdd(sentence)

# Class for translation of data. 
class dataTranslate(Dataset):
    """Parallel-corpus dataset yielding (source, target) index tensors.

    Every sentence is encoded once, at construction time, with the vocabulary
    supplied for its side.

    Args:
        sSentence: iterable of source sentences (strings).
        tSentence: iterable of target sentences (strings), aligned with sSentence.
        sVocab: vocabulary used to encode the source side.
        tVOcab: vocabulary used to encode the target side.
    """

    # encoding source and target sentences.
    def __init__(self, sSentence, tSentence, sVocab, tVOcab):

        self.sSentence = [sVocab.encode(s) for s in sSentence]
        self.tSentence = [tVOcab.encode(s) for s in tSentence]

        # Take the marker ids from the vocabularies that were passed in rather
        # than from module-level globals, so the dataset works with any pair
        # of vocabularies.
        self.sEos = sVocab.wordIndex["<EOS>"]
        self.tSos = tVOcab.wordIndex["<SOS>"]
        self.tEos = tVOcab.wordIndex["<EOS>"]

    # number of sentence pairs.
    def __len__(self):
        return len(self.sSentence)

    def __getitem__(self, idx):
        """Return (src, tgt) as 1-D long tensors.

        src is `words + <EOS>`; tgt is `<SOS> + words + <EOS>`.
        """
        src = torch.tensor(self.sSentence[idx] + [self.sEos], dtype=torch.long)
        # The target starts with <SOS>: a decoder that consumes position 0 as
        # its first input and is scored on positions 1.. would otherwise never
        # be asked to predict the first real word of the sentence.
        tgt = torch.tensor([self.tSos] + self.tSentence[idx] + [self.tEos], dtype=torch.long)
        return src, tgt

# Function for padding sequences. 
def collate(batch):
    """Turn a list of (src, tgt) tensor pairs into two padded batch tensors.

    Each side is right-padded to the longest sequence in the batch with the
    <PAD> id of its vocabulary; both results are batch-first.
    """
    sources, targets = zip(*batch)  # split the pairs into the two sides
    srcPad = uVocab.wordIndex["<PAD>"]
    tgtPad = eVocab.wordIndex["<PAD>"]
    return (
        pad_sequence(sources, batch_first=True, padding_value=srcPad),
        pad_sequence(targets, batch_first=True, padding_value=tgtPad),
    )

# Dataset preparation: one encoded dataset per split, all sharing the two vocabularies
trainData, valData, testData = (
    dataTranslate(uSplit, eSplit, uVocab, eVocab)
    for uSplit, eSplit in ((uTrain, eTrain), (uVal, eVal), (uTest, eTest))
)

# Loaders for training and evaluation; only the training loader reshuffles
trainLoad, valLoad, testLoad = (
    DataLoader(split, batch_size=64, shuffle=reshuffle, collate_fn=collate)
    for split, reshuffle in ((trainData, True), (valData, False), (testData, False))
)

# Encoder class (2-layer bidirectional LSTM)
class encoder(nn.Module):
    """Embedding + 2-layer bidirectional LSTM encoder.

    Args:
        iLayer: size of the source vocabulary (number of embeddings).
        embLayer: embedding dimension.
        hiddenLayer: LSTM hidden size per direction.
        dropout: dropout probability for the embeddings and between LSTM layers.
    """

    def __init__(self, iLayer, embLayer, hiddenLayer, dropout):
        super().__init__()
        # Layers are created in a fixed order so seeded initialisation is stable.
        self.embedding = nn.Embedding(num_embeddings=iLayer, embedding_dim=embLayer)
        self.rnn = nn.LSTM(
            input_size=embLayer,
            hidden_size=hiddenLayer,
            num_layers=2,
            bidirectional=True,
            dropout=dropout,
            batch_first=True,
        )
        # project the concatenated forward/backward states back to hiddenLayer
        bridgeWidth = 2 * hiddenLayer
        self.fc_hidden = nn.Linear(bridgeWidth, hiddenLayer)
        self.fc_cell = nn.Linear(bridgeWidth, hiddenLayer)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode `src` (token ids, batch first).

        Returns:
            results: per-position LSTM outputs (both directions concatenated).
            hide, cell: the last two entries of the LSTM's final hidden/cell
                state, concatenated, passed through a linear layer + tanh and
                given a leading dimension of 1.
        """
        tokens = self.dropout(self.embedding(src))
        results, (hide, cell) = self.rnn(tokens)

        # merge the last two state slices along the feature axis
        mergedHide = torch.cat((hide[-2], hide[-1]), dim=1)
        mergedCell = torch.cat((cell[-2], cell[-1]), dim=1)

        hide = torch.tanh(self.fc_hidden(mergedHide)).unsqueeze(0)
        cell = torch.tanh(self.fc_cell(mergedCell)).unsqueeze(0)
        return results, hide, cell

# Attention module used by the decoder
class Attention(nn.Module):
    """Scores every encoder position against the decoder's top-layer hidden state.

    Args:
        encHiddenLayer: encoder hidden size per direction (encoder outputs are
            expected to be twice this wide).
        decHiddenLayer: decoder hidden size.
    """

    def __init__(self, encHiddenLayer, decHiddenLayer):
        super().__init__()
        self.attn = nn.Linear(2 * encHiddenLayer + decHiddenLayer, decHiddenLayer)
        self.v = nn.Parameter(torch.rand(decHiddenLayer))

    def forward(self, encodeResult, hide):
        """Return softmax-normalised weights over dimension 1 of `encodeResult`."""
        steps = encodeResult.size(1)
        # copy the last hidden layer once per encoder position
        query = hide[-1].unsqueeze(1).repeat(1, steps, 1)
        joined = torch.cat((query, encodeResult), dim=2)
        energy = torch.tanh(self.attn(joined))
        # reduce each energy vector to one score by weighting it with v
        scores = (self.v * energy).sum(dim=2)
        return scores.softmax(dim=1)

# Decoder class using attention
class Decoder(nn.Module):
    """Single-step attention decoder built on a 2-layer LSTM.

    Args:
        dimResult: size of the target vocabulary (embedding rows and output logits).
        dimEmbed: target embedding dimension.
        enc_hid_dim: encoder hidden size per direction.
        dec_hid_dim: decoder hidden size.
        dropout: dropout probability for embeddings and between LSTM layers.
        attention: callable `(encOutput, hidden) -> weights` over encoder positions.
    """

    def __init__(self, dimResult, dimEmbed, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        contextWidth = 2 * enc_hid_dim  # width of one encoder output vector
        self.embedding = nn.Embedding(dimResult, dimEmbed)  # target-token embeddings
        self.rnn = nn.LSTM(
            dimEmbed + contextWidth,
            dec_hid_dim,
            num_layers=2,
            dropout=dropout,
            batch_first=True,
        )
        self.fc = nn.Linear(dec_hid_dim + contextWidth, dimResult)
        self.attention = attention
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encOutput):
        """Decode one step.

        Returns the vocabulary logits for this step together with the updated
        hidden and cell states.
        """
        # A single-layer state is duplicated so it fits the 2-layer LSTM.
        if hidden.size(0) == 1:
            hidden, cell = hidden.repeat(2, 1, 1), cell.repeat(2, 1, 1)

        tokenEmb = self.dropout(self.embedding(input.unsqueeze(1)))

        # attention-weighted sum of the encoder outputs
        weights = self.attention(encOutput, hidden)
        context = torch.bmm(weights.unsqueeze(1), encOutput)

        results, (hidden, cell) = self.rnn(torch.cat((tokenEmb, context), dim=2), (hidden, cell))

        # logits come from the LSTM output joined with the context vector
        features = torch.cat((results.squeeze(1), context.squeeze(1)), dim=1)
        predictions = self.fc(features)

        return predictions, hidden, cell



# Seq2Seq class: sequence-to-sequence model that ties the encoder and decoder together
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that unrolls the decoder one target position at a time.

    Args:
        enc: encoder module; `enc(src)` must return `(encOutput, hidden, cell)`.
        dec: decoder module; `dec(input, hidden, cell, encOutput)` must return
            `(logits, hidden, cell)` and expose `dec.fc.out_features`.
        device: device on which the output buffer is allocated.
    """

    def __init__(self, enc, dec, device):

        super().__init__()
        self.encoder = enc  # encoder
        self.decoder = dec  # decoder
        self.device = device  # device used for the output buffer

    def forward(self, src, trg, teacherForcingRatio=0.5):
        """Run the model over a batch.

        Args:
            src: source token ids, batch first.
            trg: target token ids, batch first; `trg[:, 0]` is the decoder's
                first input and only its shape is needed beyond that when
                teacher forcing is off.
            teacherForcingRatio: probability, per decoding step, of feeding the
                ground-truth token `trg[:, t]` as the next input instead of the
                model's own argmax. Only applied while the module is in
                training mode; in eval mode decoding is always free-running.

        Returns:
            Tensor of shape (batch, targetLen, vocab) with the logits of every
            step; position 0 is left as zeros because nothing is predicted there.
        """
        batchSize, targetLen = trg.shape[0], trg.shape[1]
        vocabSize = self.decoder.fc.out_features
        results = torch.zeros(batchSize, targetLen, vocabSize, device=self.device)

        encOutput, hidden, cell = self.encoder(src)
        stepInput = trg[:, 0]  # initialising decoding

        for t in range(1, targetLen):

            output, hidden, cell = self.decoder(stepInput, hidden, cell, encOutput)
            results[:, t, :] = output

            # Teacher forcing: without it the decoder only ever sees its own
            # (initially random) guesses during training, which makes the
            # ground truth reach the model through the loss alone.
            useTruth = self.training and random.random() < teacherForcingRatio
            stepInput = trg[:, t] if useTruth else output.argmax(1)  # highest scoring token otherwise

        return results


# Initializing all the parameters for the model.

# use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# vocabulary sizes drive the embedding tables and the output layer
iVocabSize = len(uVocab.wordIndex)
oVocabSize = len(eVocab.wordIndex)

# embedding width (same on both sides), recurrent width and dropout rate
encEmbed = decEmbed = 256
hiddenLayer = 512
dropout = 0.3

# build the parts in a fixed order: attention, encoder, decoder, then the wrapper
attention = Attention(encHiddenLayer=hiddenLayer, decHiddenLayer=hiddenLayer)
encoder = encoder(iLayer=iVocabSize, embLayer=encEmbed, hiddenLayer=hiddenLayer, dropout=dropout)
decoder = Decoder(
    dimResult=oVocabSize,
    dimEmbed=decEmbed,
    enc_hid_dim=hiddenLayer,
    dec_hid_dim=hiddenLayer,
    dropout=dropout,
    attention=attention,
)
model = Seq2Seq(enc=encoder, dec=decoder, device=device).to(device)  # encoder-decoder sequence model

optimizer = optim.Adam(model.parameters(), lr=1e-3)
# padded target positions do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=eVocab.wordIndex["<PAD>"])

# Model Training function. 
def epochTraining(model, iterate, optimizer, criterion, clip):
    """Train `model` for one pass over `iterate` and return the mean batch loss.

    Args:
        model: module called as `model(src, trg)` that returns logits of shape
            (batch, targetLen, vocab).
        iterate: sized iterable of `(src, trg)` batches.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking `(logits[N, vocab], targets[N])`.
        clip: maximum gradient norm passed to `clip_grad_norm_`.

    Returns:
        Sum of the per-batch losses divided by `len(iterate)`.
    """
    model.train()
    # Read the device from the model itself instead of a module-level global,
    # so the function works for whatever model it is handed.
    runDevice = next(model.parameters()).device
    epochLoss = 0

    for src, trg in iterate:

        src, trg = src.to(runDevice), trg.to(runDevice)
        optimizer.zero_grad()
        output = model(src, trg)  # forward pass.

        # Position 0 is the decoder's start slot and is never predicted, so it
        # is dropped from both sides. The vocabulary width is taken from the
        # logits rather than from a global.
        output = output[:, 1:].reshape(-1, output.size(-1))
        trg = trg[:, 1:].reshape(-1)
        loss = criterion(output, trg)  # calculating loss.
        loss.backward()  # backward pass.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epochLoss += loss.item()

    return epochLoss / len(iterate)  # average loss for epoch

# For evaluating model. 
def epochEval(model, iterate, criterion):
    """Evaluate `model` over `iterate` without gradients and return the mean batch loss.

    Args:
        model: module called as `model(src, trg)` that returns logits of shape
            (batch, targetLen, vocab).
        iterate: sized iterable of `(src, trg)` batches.
        criterion: loss taking `(logits[N, vocab], targets[N])`.

    Returns:
        Sum of the per-batch losses divided by `len(iterate)`.
    """
    model.eval()
    # Read the device from the model itself instead of a module-level global.
    runDevice = next(model.parameters()).device
    epochLoss = 0

    with torch.no_grad():

        for src, trg in iterate:

            src, trg = src.to(runDevice), trg.to(runDevice)
            output = model(src, trg)  # forward pass

            # Drop the start slot (position 0); the vocabulary width comes from
            # the logits rather than from a global.
            output = output[:, 1:].reshape(-1, output.size(-1))
            trg = trg[:, 1:].reshape(-1)
            loss = criterion(output, trg)
            epochLoss += loss.item()

    return epochLoss / len(iterate)  # average loss

# Train for a fixed number of epochs, reporting both losses after each one
epochCount = 20
clip = 1  # gradient-norm ceiling handed to the training step

for epoch in range(epochCount):

    # one optimisation pass over the training batches ...
    tLoss = epochTraining(
        model=model,
        iterate=trainLoad,
        optimizer=optimizer,
        criterion=criterion,
        clip=clip,
    )
    # ... followed by a gradient-free pass over the validation batches
    valLoss = epochEval(model=model, iterate=valLoad, criterion=criterion)

    print(f"Epoch: {epoch+1} | Training Loss: {tLoss:.3f} | Validation Loss: {valLoss:.3f}")
    print("\n")

# Evaluation prediction
def evalPredictions(model, dataset, maxLen=None):
    """Greedy-decode every source sentence of `dataset` into an English string.

    Args:
        model: sequence model called as `model(src, trg)`; in eval mode only
            `trg[:, 0]` (the start token) and the length of `trg` are expected
            to matter.
        dataset: dataset yielding `(src, tgt)` tensors; tgt is ignored.
        maxLen: number of target positions to decode, including the start
            slot. Defaults to the length of the source sentence.

    Returns:
        List with one decoded sentence per dataset item, in dataset order.
    """
    model.eval()
    runDevice = next(model.parameters()).device  # follow the model, not a global
    sosIdx = eVocab.wordIndex["<SOS>"]
    eosIdx = eVocab.wordIndex["<EOS>"]
    predictions = []

    with torch.no_grad():

        for src, _ in DataLoader(dataset, batch_size=1):

            src = src.to(runDevice)
            steps = src.size(1) if maxLen is None else maxLen

            # Start decoding from <SOS>, the vocabulary's dedicated start
            # marker, instead of an all-<PAD> placeholder.
            start = torch.full((1, steps), sosIdx, dtype=torch.long, device=runDevice)
            output = model(src, start)  # output sequence

            # Position 0 is the start slot and carries no prediction.
            tokens = output.argmax(-1)[0, 1:].tolist()  # high-score token per step

            # Everything generated after the first <EOS> is not part of the
            # sentence, so cut the sequence there before decoding.
            if eosIdx in tokens:
                tokens = tokens[:tokens.index(eosIdx)]

            predictions.append(eVocab.decode(tokens))

    return predictions


testPredictions = evalPredictions(model, testData)

with open("predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(testPredictions))

# A reference may still carry the line terminator it was read with; strip it
# so the file has exactly one sentence per line and stays aligned with
# predictions.txt (otherwise every other line of references.txt is blank).
with open("references.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(reference.strip() for reference in eTest))

# Run the following command in cmd for Bleu-Results.
# multi-bleu.perl takes the reference file as its argument and reads the
# hypotheses (our predictions) from stdin:
# perl multi-bleu.perl references.txt < predictions.txt

Epoch: 1 | Training Loss: 7.994 | Validation Loss: 7.655


Epoch: 2 | Training Loss: 7.273 | Validation Loss: 7.780


Epoch: 3 | Training Loss: 7.108 | Validation Loss: 7.944


Epoch: 4 | Training Loss: 7.010 | Validation Loss: 8.001


Epoch: 5 | Training Loss: 6.928 | Validation Loss: 8.090


Epoch: 6 | Training Loss: 6.836 | Validation Loss: 8.144


Epoch: 7 | Training Loss: 6.740 | Validation Loss: 8.233


Epoch: 8 | Training Loss: 6.636 | Validation Loss: 8.320


Epoch: 9 | Training Loss: 6.520 | Validation Loss: 8.381


Epoch: 10 | Training Loss: 6.406 | Validation Loss: 8.467


Epoch: 11 | Training Loss: 6.249 | Validation Loss: 8.531


Epoch: 12 | Training Loss: 6.094 | Validation Loss: 8.624


Epoch: 13 | Training Loss: 5.966 | Validation Loss: 8.787


Epoch: 14 | Training Loss: 5.838 | Validation Loss: 8.836


Epoch: 15 | Training Loss: 5.661 | Validation Loss: 8.924


Epoch: 16 | Training Loss: 5.459 | Validation Loss: 9.066


Epoch: 17 | Training Loss: 5.220 | Validation Los