# Final Year Project 
#### Topic: MLNN (AI Code Generator)
### Made By: Jason Angrico

### Loading Dataset

In [1]:
file = open("codeDatasetTest.txt", "r", encoding="utf-8")
fileEntries = file.readlines()

In [2]:
fileEntries[:7]

['# write a python function to add two user provided numbers and return the sum\n',
 'def add_two_numbers(num1, num2):\n',
 '    sum = num1 + num2\n',
 '    return sum\n',
 '\n',
 '\n',
 '# write a python program to add two numbers \n']

In [3]:
# Init an empty dictionary and temp dictionary; QnADict and tempDict respectively
QnADict = []
tempDict = None

# Loop through the file, if entry starts with a # indicating a new question:
for entry in fileEntries:
    if entry.startswith("#"):
        # If tempDict is not empty (contains the previous QnA)
        if tempDict:
            # Join the the list of answer lines into a single string and appends it to QnADict
            tempDict['answer'] = ''.join(tempDict['answer'])
            QnADict.append(tempDict)
        # Init a new tempDict to hold the newly detected question
        tempDict = {"question": entry[1:], "answer": []}
    else:
        #If it does not start with a #, meaning its an answer, append it to answer list
        tempDict["answer"].append(entry)

# Append the last entry into QnADict to ensure all QnAs are included
if tempDict:
    tempDict['answer'] = ''.join(tempDict['answer'])
    QnADict.append(tempDict)

In [4]:
i = 0
for tempDict in QnADict:
    print("\nQuestion ", i+1, ":")
    i+=1
    print(tempDict['question'][1:])
    print(tempDict['answer'])
    if i>4:
        break


Question  1 :
write a python function to add two user provided numbers and return the sum

def add_two_numbers(num1, num2):
    sum = num1 + num2
    return sum




Question  2 :
write a python program to add two numbers 

num1 = 2
num2 = 3
sum = num1 + num2
print(f'Sum: {sum}')




Question  3 :
Create a function to calculate the sum of a sequence of integers.

def sum_sequence(sequence):
  sum = 0
  for num in sequence:
    sum += num
  return sum




Question  4 :
Generate a Python code for crawling a website for a specific type of data such as phone numbers.

import requests
import re

def crawl_website_for_phone_numbers(website):
    response = requests.get(website)
    phone_numbers = re.findall('\d{3}-\d{3}-\d{4}', response.text)
    return phone_numbers

if __name__ == '__main__':
    print(crawl_website_for_phone_numbers('www.example.com'))
    
    


Question  5 :
Create a Python list comprehension to get the squared values of a list [1, 2, 3, 5, 8, 13].

[x*x for x in [1, 

### Tokenization

In [5]:
from tokenize import tokenize, untokenize
import io
import keyword
import random

In [6]:
def tokenizerWithDataAugmentation(codeString, maskFactor=0.25):
    
    # dict for storing mapping of the original variable name to their masked versions
    maskedVar = {}   
    
    # List some built in functions that should be ignored when masking and add them to the keyword list
    ignoreList = ['range', 'enumerate', 'print', 'ord', 'int', 'float', 'zip', 'char', 'list', 'dict', 'tuple', 'set', 
                  'len', 'sum', 'min', 'max']
    ignoreList.extend(keyword.kwlist)
    
    # A counter for generating unique name values
    var_counter = 1
    
    # List of generated tokens together with its information
    tokens = list(tokenize(io.BytesIO(codeString.encode('utf-8')).readline))
    
    # Initialize an empty list to store the final tokens
    tokenizedOutput = []

    # Loop through all the tokens
    for token in tokens:
        # Only grab the token type and token string
        token_type, token_string, _, _, _ = token

        # This part is to handle variable naming
        # Checks if the token type == name and the string is not in the ignored list
        if token_type == 1 and token_string not in ignoreList:
            
            # Case 1: If token is in a keyword, add them into ignore list and then append it directly to the output
            if token_string in ['def', '.', 'import', 'raise', 'except', 'class']:
                ignoreList.append(token_string)
                tokenizedOutput.append((token_type, token_string))
                
            # Case 2: If token has already been masked, append the masked version instead
            elif token_string in maskedVar:
                tokenizedOutput.append((token_type, maskedVar[token_string]))
                
            # Case 3: If the random number is greater than 1-mask factor, mask the token name as var x 
            # (where x is the number in the counter). Then add 1 to the counter then appends it to the output
            elif random.uniform(0, 1) > 1 - maskFactor:
                maskedVar[token_string] = f'var{var_counter}'
                var_counter += 1
                tokenizedOutput.append((token_type, maskedVar[token_string]))
            
            # Case 4: Other cases, add the token to the ignore list and append the type and string to the output
            else:    
                ignoreList.append(token_string)
                tokenizedOutput.append((token_type, token_string))
        
        # If the token type is not a name or found in the ignored list, append it directly to the output
        else:
            tokenizedOutput.append((token_type, token_string))
            
    return tokenizedOutput

In [7]:
print(tokenizerWithDataAugmentation(QnADict[0]['answer']))

[(63, 'utf-8'), (1, 'def'), (1, 'var1'), (54, '('), (1, 'num1'), (54, ','), (1, 'num2'), (54, ')'), (54, ':'), (4, '\n'), (5, '    '), (1, 'sum'), (54, '='), (1, 'num1'), (54, '+'), (1, 'num2'), (4, '\n'), (1, 'return'), (1, 'sum'), (4, '\n'), (62, '\n'), (62, '\n'), (6, ''), (0, '')]


In [8]:
print(untokenize(tokenizerWithDataAugmentation(QnADict[0]['answer'])).decode("utf-8"))

def add_two_numbers (num1 ,num2 ):
    sum =num1 +num2 
    return sum 





### Splitting Dataset into Training and Validation

In [9]:
import numpy as np
import pandas as pd

In [10]:
pythonQuestions_df = pd.DataFrame(QnADict)
pythonQuestions_df.head()

Unnamed: 0,question,answer
0,write a python function to add two user provi...,"def add_two_numbers(num1, num2):\n sum = nu..."
1,write a python program to add two numbers \n,num1 = 2\nnum2 = 3\nsum = num1 + num2\nprint(f...
2,Create a function to calculate the sum of a s...,def sum_sequence(sequence):\n sum = 0\n for ...
3,Generate a Python code for crawling a website...,import requests\nimport re\n\ndef crawl_websit...
4,Create a Python list comprehension to get the...,"[x*x for x in [1, 2, 3, 5, 8, 13]]\n\n\n"


In [11]:
np.random.seed(0)

# deciding factor for split the data into 90% training and 10% validation
trainOrValidationSplit = np.random.rand(len(pythonQuestions_df)) < 0.9
train_df = pythonQuestions_df[trainOrValidationSplit]
val_df = pythonQuestions_df[~trainOrValidationSplit]

train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [12]:
print(f"QnADict shape is: {pythonQuestions_df.shape}")
print(f"train_df shape is: {train_df.shape}")
print(f"val_df shape is: {val_df.shape}")

train_df.head(3)
val_df.head(3)

QnADict shape is: (2815, 2)
train_df shape is: (2500, 2)
val_df shape is: (315, 2)


Unnamed: 0,question,answer
0,Generate a REST API with Python and Flask tha...,"from flask import Flask, request\nfrom flask_s..."
1,Write a Python program to calculate the avera...,"\nlist_of_positive_integers = [1, 5, 6, 7, 8]\..."
2,Develop a Python function to predict the clas...,import pandas as pd\ncsv_url = 'http://test.te...


### Vocabulary Construction - torchtext

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext import data
from torchtext.data import Field, BucketIterator, Iterator

import spacy

In [14]:
torchtext.__version__

'0.6.0'

In [15]:
# Seed configuration for reproducability
SEED = 200803
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [16]:
# Loading spacy for its tokenizing function
spacyNLP = spacy.load("en_core_web_sm")

# Transform into lower case
def spacyTokenizer(text):
    return [token.text.lower() for token in spacyNLP(text)]

In [17]:
# Input and Output object defining how they are processed.
# set init token and eos token to empty meaning no special token will be added

# Input field (natural language)
Input = Field(tokenize = spacyTokenizer,
              init_token='', 
              eos_token='', 
              lower=True)

# Output field (code)
Output = Field(tokenize = tokenizerWithDataAugmentation,
               init_token='', 
               eos_token='', 
               lower=False)

In [18]:
# Mapping each processing steps (fields) to its corresponding data
fields = [('Input', Input),('Output', Output)]

In [19]:
# Further data augmentation to increase vocab size
# validation data will not be augmented, just the training data

# Initiate list for training and validation examples
trainExList = []   # Training example list
valExList = []     # Validation example list

# Parameter for augmentation (number of times to replicate training data)
expandXfold = 500

# For 500 iterations
for j in range(expandXfold):
    # For each rows in train_df
    for i in range(train_df.shape[0]):
        try:
            # Create examples for the training data and append it to trainExList
            trainEx = data.Example.fromlist([train_df.question[i], train_df.answer[i]], fields)
            trainExList.append(trainEx)
        except:
            pass


# For each rows in val_df
for i in range(val_df.shape[0]):
    try:
        # Create examples for the validation data and append it to valExExList
        valEx = data.Example.fromlist([val_df.question[i], val_df.answer[i]], fields)
        valExList.append(valEx)
    except:
        pass

In [20]:
# Print out the samples
print("Total training samples:", len(trainExList))
print("Total validation samples:", len(valExList))

Total training samples: 1248500
Total validation samples: 315


In [21]:
# Dataset creation
train_dataset = data.Dataset(trainExList, fields)
validation_dataset =  data.Dataset(valExList, fields)

In [22]:
# Vocab building, min_freq is set to 0 such that all tokens are included regardless of frequency
Input.build_vocab(train_dataset, min_freq = 0)
Output.build_vocab(train_dataset, min_freq = 0)

In [23]:
# Import pickle for serializing vocab objects
import pickle

# Serialize vocab and write them into the Vocab folder
def saveVocabulary(vocab, path):
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()
    
saveVocabulary(Input.vocab, "Vocab/codegenerator_input_vocab.pkl")
saveVocabulary(Output.vocab, "Vocab/codegenerator_output_vocab.pkl")

In [24]:
# Checks if cuda is available, else use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [25]:
train_dataset[0].Output

[(63, 'utf-8'),
 (1, 'def'),
 (1, 'var1'),
 (54, '('),
 (1, 'var2'),
 (54, ','),
 (1, 'var3'),
 (54, ')'),
 (54, ':'),
 (4, '\n'),
 (5, '    '),
 (1, 'sum'),
 (54, '='),
 (1, 'var2'),
 (54, '+'),
 (1, 'var3'),
 (4, '\n'),
 (1, 'return'),
 (1, 'sum'),
 (4, ''),
 (6, ''),
 (0, '')]

### Encoder

In [26]:
class Encoder(nn.Module):
    # Init class
    def __init__(self, 
                 inputDim,                   # Input vocab size
                 hidDim,                     # Hidden dimension (size of the vector representing each token)
                 n_layers,                   # Number of layers (containing attention mechanism + feedforward)
                 n_heads,                    # Number of attention heads
                 posWiseFeedForwardDim,      # Dimension of the position wise feedforward network in each layer
                 dropout,                    # Dropout rate (randomly setting activation to 0)
                 device,
                 maxLength = 1000):          # Max length of input sequence for positional encoding
        super().__init__()
        
        # Device
        self.device = device
        
        # Embed the token (which is currently in a form of integer indexes) into a dense vector of size hidDim
        self.tokenEmbedding = nn.Embedding(inputDim, hidDim)
        
        # Positional Embedding: Embed the order of the tokens in the sequence, 
        # helping model to understand the language structure
        self.positionalEmbedding = nn.Embedding(maxLength, hidDim)
        
        # List of encoder layers (containing multi headed attention and feed forward network)
        self.layers = nn.ModuleList([EncoderLayer(hidDim, 
                                                  n_heads, 
                                                  posWiseFeedForwardDim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        # Regularization with droupout preventing overfitting
        self.dropout = nn.Dropout(dropout)
        
        # Scaling factor, help normalize input embedding, avoiding the embeddings from being too large/too small
        # Scaling factor is square root of the Q or K dimension (hidDim)
        self.scale = torch.sqrt(torch.FloatTensor([hidDim])).to(device)
     
    
    
    # Forward pass
    # src is the input sequence
    # srcMask is to prevent handling specific tokens such as padding tokens
    def forward(self, src, srcMask):
        
        batchSize = src.shape[0]
        srcLength = src.shape[1]
        
        # This will be [batchSize , srcLength]
        # Create positional index for each token and expand it (add batch dimension), repeat for all sequence in the batch
        pos = torch.arange(0, srcLength).unsqueeze(0).repeat(batchSize, 1).to(self.device)
        
        # Embedding and positional encoding
        # Converts input token indices into dense vectors, normalize by scaling them, and adding position information
        # Applies dropout to the result for regularization
        src = self.dropout((self.tokenEmbedding(src) * self.scale) + self.positionalEmbedding(pos))
        
        
        # Encoder Layer
        # For each encoder layer,
        for layer in self.layers:
            # Applies self attention and feed forward
            src = layer(src, srcMask)
            
        return src
     

In [27]:
# Encoder Layer
class EncoderLayer(nn.Module):
    def __init__(self, 
                 hidDim,                   # Hidden dimension (size of the vector representing each token)
                 n_heads,                  # Number of attention heads
                 posWiseFeedForwardDim,    # Dimension of the position wise feedforward network in each layer
                 dropout,                  # Dropout rate (randomly setting activation to 0)
                 device):
        super().__init__()
        
        # Layer normalization after self attention to stabilize output
        self.layerNormalizationForselfAttention = nn.LayerNorm(hidDim)
        # Layer normalization after feed forward network to avoid output growing too large/small  
        # after applying activation function
        self.layerNormalizationForFeedForward = nn.LayerNorm(hidDim)
        
        
        # Multi head attention. Dropout rate is introduced to regularize the model
        self.multiHeadAttention = MultiHeadAttentionLayer(hidDim, n_heads, dropout, device)
        
        # Apply position wise feed forward network to each position of the sequence
        self.positionwiseFeedForward = PositionwiseFeedforwardLayer(hidDim, 
                                                                    posWiseFeedForwardDim, 
                                                                    dropout)
        
        # Applies dropout to the output of the attention and feedforward layers to prevent overfitting
        self.dropout = nn.Dropout(dropout)
    
    
    
    # Forward pass
    def forward(self, src, srcMask): 
                
        # Self attention, returns a multihead self attention output (attentionOutput), and an ignored weight (_)
        # srcMask is used to prevent attention to certain positions (e.g., padding tokens)
        # src (input sequence) is used as query, key, and value
        attentionOutput, _ = self.multiHeadAttention(src, src, src, srcMask)
        
        # Add input to the output of the self attention, preserving the original information
        # Applies dropout to regularize model
        # Normalize the combined output
        src = self.layerNormalizationForselfAttention(src + self.dropout(attentionOutput))
        
        
        # Positionwise feedforward with the output of the self attention layer before as input
        attentionOutput = self.positionwiseFeedForward(src)
        
        # Add input to the output of the Positionwise feedforward network, again, to help preserve the original information
        # Applies dropout to regularize model
        # Normalize the combined output
        src = self.layerNormalizationForFeedForward(src + self.dropout(attentionOutput))
        
        
        return src

In [28]:
# Positionwise FeedforwardLayer
class PositionwiseFeedforwardLayer(nn.Module):
    
    def __init__(self, hidDim, posWiseFeedForwardDim, dropout):
        super().__init__()
        
        # A fully connected layer mapping from hidDim to posWiseFeedForwardDim (transform input into higher dimension)
        self.fullyConnectedLayer_1 = nn.Linear(hidDim, posWiseFeedForwardDim)
        # A fully connected layer mapping from posWiseFeedForwardDim to hidDim (transform dimension back to original)
        self.fullyConnectedLayer_2 = nn.Linear(posWiseFeedForwardDim, hidDim)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
    
    # Forward Pass
    def forward(self, x):
        
        # Pass to first fully connected layer
        # Applies ReLU activation function (keeps only positive values)
        # Then, apply dropout to the result
        x = self.dropout(torch.relu(self.fullyConnectedLayer_1(x)))
        
        # Pass to the second fully connected layer to transform it back to the original
        x = self.fullyConnectedLayer_2(x)
        
        return x

In [29]:
# Multihead Attention Layer
class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hidDim, n_heads, dropout, device):
        super().__init__()
        
        # Ensure that the hidden dimension is divisible by the number of attention heads
        assert hidDim % n_heads == 0
        
        
        # Dimension storing
        # Total dimension of the hidden states
        self.hidDim = hidDim
        # Total number of attention heads
        self.n_heads = n_heads
        # Size of each attention head
        self.headDim = hidDim // n_heads
        
        
        # Fully Connected Layers
        # Transform input sequences (query, key and value) into the same hidden dimension
        self.fullyConnectedLayer_q = nn.Linear(hidDim, hidDim)
        self.fullyConnectedLayer_k = nn.Linear(hidDim, hidDim)
        self.fullyConnectedLayer_v = nn.Linear(hidDim, hidDim)
        # Combine output of all the attention heads back to the original hidden dimension size
        self.fullyConnectedLayer_o = nn.Linear(hidDim, hidDim)
          
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Scaling to avoid large values in softmax function
        self.scale = torch.sqrt(torch.FloatTensor([self.headDim])).to(device)
        
    def forward(self, query, key, value, mask = None):
        
        batchSize = query.shape[0]
        
        # Transform the input to a space with the same hidden dimension   
        Q = self.fullyConnectedLayer_q(query)
        K = self.fullyConnectedLayer_k(key)
        V = self.fullyConnectedLayer_v(value)
        
        
        # Reshape Q, K, and V to split the hidden dimension into multiple heads
        # Each head process a portion of the dimension headDim
        # Rearrange dimension to separate the heads as a separate dimension to ease attention computation of each head
        # Shape is transformed from [batchSize, srclength, n_heads, hidDim] 
        # into [batch size, n_heads, srclength, headDim] for dot product
        Q = Q.view(batchSize, -1, self.n_heads, self.headDim).permute(0, 2, 1, 3)
        K = K.view(batchSize, -1, self.n_heads, self.headDim).permute(0, 2, 1, 3)
        V = V.view(batchSize, -1, self.n_heads, self.headDim).permute(0, 2, 1, 3)
        
        
        # Compute dot product of Q and K for attention score
        # Scaled to avoid excessively large values
        focus = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        
        # Apply mask to prevent attention to certain tokens (ex: padding tokens)
        # Masked positions are set to very large negative number such that when softmax is applied, it becomes 0
        if mask is not None:
            focus = focus.masked_fill(mask == 0, -1e10)

        
        # Apply softmax to get the attention weight
        attention = torch.softmax(focus, dim = -1)

        
        # Apply dropout on attention weight, then calculate weighted sum of the values
        x = torch.matmul(self.dropout(attention), V)

        
        # Permute the dimension
        x = x.permute(0, 2, 1, 3).contiguous()
        # Combine the output from all attention into a single dimension (hidDim)
        x = x.view(batchSize, -1, self.hidDim)

        
        # Apply linear transformation to combine the output back to the hidDim size
        x = self.fullyConnectedLayer_o(x)

        
        return x, attention

### Decoder

In [30]:
class Decoder(nn.Module):
    def __init__(self, 
                 outputDim,                 # Output vocab size
                 hidDim,                    # Hidden dimension (size of the vector representing each token)
                 n_layers,                  # Number of layers
                 n_heads,                   # Number of attention heads
                 posWiseFeedForwardDim,     # Dimension of the position wise feedforward network in each layer
                 dropout,                   # Dropout rate (randomly setting activation to 0)
                 device,
                 maxLength = 10000):        # Max length of sequence for the decoder to handle
        super().__init__()
        
        self.device = device
        
        # Embed the token (which is currently in a form of integer indexes) into a dense vector of size hidDim
        self.tokenEmbedding = nn.Embedding(outputDim, hidDim)
        # Embed the order of the tokens in the sequence, helping model to understand the language structure
        self.positionalEmbedding = nn.Embedding(maxLength, hidDim)
        
        # List of n_layers amount of decoder layers
        self.layers = nn.ModuleList([DecoderLayer(hidDim, 
                                                  n_heads, 
                                                  posWiseFeedForwardDim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        # A fully connected layer transforming hidDim back into output vocab dimension
        # This is used to predict the next token in the sequence
        self.fullyConnectedLayer_out = nn.Linear(hidDim, outputDim)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        
        # Scaling factor, help normalize the embedding, avoiding from it being too large/too small
        self.scale = torch.sqrt(torch.FloatTensor([hidDim])).to(device)
        
        
    # Forward pass
    # trg is the input sequence for the decoder
    # encoderSrc is the encoder's output
    # trgMask and srcMask are both to prevent handling specific tokens such as padding tokens    
    def forward(self, trg, encoderSrc, trgMask, srcMask):
        
        #trg = [batch size, trg len]
        #encoderSrc = [batch size, src len, hid dim]
        #trgMask = [batch size, 1, trg len, trg len]
        #srcMask = [batch size, 1, 1, src len]
        
        batchSize = trg.shape[0]
        trgLength = trg.shape[1]
        
        # Positional encoding
        # Create positional index for each token and expand it to match the batch size
        pos = torch.arange(0, trgLength).unsqueeze(0).repeat(batchSize, 1).to(self.device)
        
        # Converts input token indices into dense vectors, normalize by scaling them, and adding position information
        # Applies dropout to the result
        trg = self.dropout((self.tokenEmbedding(trg) * self.scale) + self.positionalEmbedding(pos))
        
        
        # For each decoder layer,(Multi headed attention and Feed forward)
        for layer in self.layers:
            # Returns updated target sequence and attention weight 
            # representing how much each token in the target sequence attends to tokens in the source sequence
            trg, attention = layer(trg, encoderSrc, trgMask, srcMask)

        
        # Linear classification 
        output = self.fullyConnectedLayer_out(trg)

        # Returns output and attention weight
        return output, attention

In [31]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 hidDim,                    # Hidden dimension (size of the vector representing each token)
                 n_heads,                   # Number of attention heads
                 posWiseFeedForwardDim,     # Dimension of the position wise feedforward network in each layer
                 dropout,                   # Dropout rate (randomly setting activation to 0)
                 device):
        super().__init__()
        
        # Layer normalization after self attention to stabilize output
        self.layerNormalizationForselfAttention = nn.LayerNorm(hidDim)
        
        # Layer normalization after attention mechanism involving encoder's output.
        self.layerNormalizationForEncOutputselfAttention = nn.LayerNorm(hidDim)
        
        # Layer normalization after feed forward network to avoid output growing too large/small  
        # after applying activation function
        self.layerNormalizationForFeedForward = nn.LayerNorm(hidDim)
        
        
        # Multi head attention. Dropout rate is introduced to regularize the model
        self.multiHeadAttention = MultiHeadAttentionLayer(hidDim, n_heads, dropout, device)
        
        # Multi head attention focusing on encoder's output when generating sequence. 
        # Dropout rate is introduced to regularize the model
        self.encodermultiHeadAttention = MultiHeadAttentionLayer(hidDim, n_heads, dropout, device)
        
        # Apply position wise feed forward network to each position in the input sequence
        self.positionwiseFeedForward = PositionwiseFeedforwardLayer(hidDim, 
                                                                     posWiseFeedForwardDim, 
                                                                     dropout)
        
        # Applies dropout
        self.dropout = nn.Dropout(dropout)
        
        
    
    # Forward Pass
    def forward(self, trg,  encoderSrc, trgMask, srcMask):

        # Self-attention within the target sequence
        # trg is used as query, key, and value
        # trgMask prevents attention to certain positions (like future tokens)
        attentionOutput, _ = self.multiHeadAttention(trg, trg, trg, trgMask)
        
        
        # Add input to the output of the self-attention, help preserving the original information
        # Applies dropout to regularize the model
        # Normalize the combined output
        trg = self.layerNormalizationForselfAttention(trg + self.dropout(attentionOutput))

            
        # Attention to allow the decoder focusing on the encoder's output
        # encoderSrc is used for key and value while trg is used for query
        attentionOutput, attention = self.encodermultiHeadAttention(trg,  encoderSrc,  encoderSrc, srcMask)
        
        # Add input to the output of the encoder output focused attention, help preserving the original information
        # Applies dropout to regularize model
        # Normalize the combined output
        trg = self.layerNormalizationForEncOutputselfAttention(trg + self.dropout(attentionOutput))
        
        # Positionwise feedforward with the output encoder focused output attention layer before as input
        attentionOutput = self.positionwiseFeedForward(trg)
        
        # Add input to the output of the positionwise feedforward network, again, to help preserve the original information
        # Applies dropout to regularize the model
        # Normalize the combined output
        trg = self.layerNormalizationForFeedForward(trg + self.dropout(attentionOutput))
        
        
        return trg, attention

### Sequence to Sequence Model

In [32]:
class Sequence2SequenceModel(nn.Module):
    def __init__(self, 
                 encoder,             # Encoder for the model
                 decoder,             # Decoder for the model
                 srcPaddingIndex,     # Padding index for the source sequence
                 trgPaddingIndex,     # Padding index for the target sequence
                 device):
        super().__init__()
        
        self.srcPaddingIndex = srcPaddingIndex
        self.trgPaddingIndex = trgPaddingIndex
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    
    # Create mask to ignore padding tokens in source
    def createSrcMask(self, src):
        # Returns true if it is not a padding token, and false otherwise
        # Add dimension to align shape with the shape of the attention mechanism
        # unsqueeze 1 is for attention heads, and unsqueeze 2 is for target sequence
        srcMask = (src != self.srcPaddingIndex).unsqueeze(1).unsqueeze(2)

        return srcMask
    
    
    # Create mask to ignore padding tokens in target
    def createTrgMask(self, trg):
        # Returns true if it is not a padding token, and false otherwise
        # Add dimension to align shape with the shape of the attention mechanism
        # unsqueeze 1 is for attention heads, and unsqueeze 2 is for target sequence
        trgPaddingMask = (trg != self.trgPaddingIndex).unsqueeze(1).unsqueeze(2)
        
        trgLength = trg.shape[1]
        
        # Look ahead mask to make sure that the model focus on the current and previous position, not future positions
        # Creates a lower triangular matrix and convert the content into boolean values
        # Triangular matrix is used since it ensure that each token can only "focus" on itself and the tokens before it
        # Simulating real predictions
        lookaheadMask = torch.tril(torch.ones((trgLength, trgLength), device = self.device)).bool()
        
        # Combine both masks to produce target mask
        trgMask = trgPaddingMask & lookaheadMask
        
        return trgMask
    
    
    # Feed forward
    def forward(self, src, trg):
        
        # Creates the mask for both src and trg
        srcMask = self.createSrcMask(src)
        trgMask = self.createTrgMask(trg)
        
        # Pass input into the sequence
        encoderSrc = self.encoder(src, srcMask)
        # Pass target and output of the encoder into decoder        
        output, attention = self.decoder(trg, encoderSrc, trgMask, srcMask)
        
        return output, attention

### Training Setup

In [33]:
# Inputs
InputDimension = len(Input.vocab)
OutputDimension = len(Output.vocab)

# Hidden dimension
HiddenDimension = 256

# n_layers
EncLayers = 3
DecLayers = 3

# n_heads
EncHeads = 16
DecHeads = 16

# posWiseFeedForwardDim
EncPositionWiseFeedForwardDimension = 512
DecPositionWiseFeedForwardDimension = 512

# Dropout
EncDropout = 0.1
DecDropout = 0.1

# Initialize encoder with the set attributes
enc = Encoder(InputDimension,
              HiddenDimension,
              EncLayers,
              EncHeads,
              EncPositionWiseFeedForwardDimension,
              EncDropout,
              device)

# Initialize decoder with the set attributes
dec = Decoder(OutputDimension,
              HiddenDimension,
              DecLayers,
              DecHeads,
              DecPositionWiseFeedForwardDimension,
              DecDropout,
              device)

In [34]:
# Store numerical index of the padding token
srcPaddingIndexInput = Input.vocab.stoi[Input.pad_token]
trgPaddingIndexInput = Output.vocab.stoi[Output.pad_token]

# Initialize the encoder-decoder model
model = Sequence2SequenceModel(enc, dec, srcPaddingIndexInput, trgPaddingIndexInput, device).to(device)

In [35]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,789,288 trainable parameters


In [36]:
# Init weights in order to avoid weight values that are too large/small
def initWeights(module):
    weight = getattr(module, 'weight', None)
    # checks if module has weights and if the dimension > 1
    if weight is not None and weight.dim() > 1:
        # Glorot (Xavier) init was choosen since it is often used for general purpose init
        nn.init.xavier_uniform_(weight)

# Apply weight
model.apply(initWeights);

In [37]:
LEARNING_RATE = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)

### Loss Function

In [38]:
import math
import torch.nn.functional as F


# Cross Entropy Loss accomodating distributions, label smoothening, and accepting raw logits
class CrossEntropyLoss(nn.CrossEntropyLoss):

    def __init__(self, 
                 weight=None,               # Weight
                 ignore_index=-100,         # Ignore index
                 reduction='mean',          # Loss reduction, currently set to average
                 smooth_eps=None,           # Smoothing epsilon
                 smooth_dist=None,          # Smoothing distribution
                 areLogits=True):           # Boolean for raw logits/ log probabilities
        super(CrossEntropyLoss, self).__init__(weight=weight,
                                               ignore_index=ignore_index, 
                                               reduction=reduction)
        
        # Smooth epsilon
        self.smooth_eps = smooth_eps
        
        # Smooth distributions
        self.smooth_dist = smooth_dist
        
        # Logits
        self.areLogits = areLogits
    
    
    def forward(self, input, target, smooth_dist=None):
        # If distribution is not provided
        if smooth_dist is None:
            # Use self attribute
            smooth_dist = self.smooth_dist
            
        # Custom cross entropy function, returns the loss amount 
        return crossEntropy(input, 
                             target, 
                             weight=self.weight, 
                             ignore_index=self.ignore_index,
                             reduction=self.reduction, 
                             smooth_eps=self.smooth_eps,
                             smooth_dist=smooth_dist, 
                             areLogits=self.areLogits)

In [39]:
def crossEntropy(inputs,               # Inputs
                  target,               # Ground truth
                  weight=None,          # Weight
                  ignore_index=-100,    # Ignore specific tokens
                  reduction='mean',     # Reduce loss
                  smooth_eps=None,      # Smoothing Epsilon 
                  smooth_dist=None,     # Smoothing distribution
                  areLogits=True):      # Boolean for raw logits/ log probabilities
    
    # Sets smooth epsilon to 0 if not provided
    if smooth_eps:
        smooth_eps = smooth_eps
    else:
        smooth_eps = 0
    
    
    
    # LOGITS OR PROBABILITY
    # If target is a class index and no smoothening is applied
    if isLongTensor(target) and smooth_eps == 0:
        # If inputs are raw logits
        if areLogits:
            # Use cross entropy loss
            return F.cross_entropy(inputs, 
                                   target, 
                                   weight, 
                                   ignore_index=ignore_index, 
                                   reduction=reduction)
        # If inputs are probability
        else:
            # Use negative loss likelihood loss
            return F.nll_loss(inputs, 
                              target, 
                              weight, 
                              ignore_index=ignore_index, 
                              reduction=reduction)
        
    
    # LOG SOFTMAX
    # If inputs are raw logits
    if areLogits:
        # Applies softmax to convert it into probability
        # Then take the log of the probability
        logSoftMax = F.log_softmax(inputs, dim=-1)
    # If inputs are already log-probability
    else:
        logSoftMax = inputs
    
    
        
    # IGNORE MASK 
    # Init mask for ignored index for the loss  
    ignoreIndexMask = None
    # Number of classes from the inner most dimension of the input
    n_classes = inputs.size(-1)

    # If target is a class index and ignore index is >0
    if isLongTensor(target) and ignore_index >= 0:
        # Create boolean tensor where each element is true, false otherwise
        ignoreIndexMask = target.eq(ignore_index)
    
    
    
    # LABEL SMOOTHENING
    # If smooth epsilon > 0 and smooth distribution is provided
    if smooth_eps > 0 and smooth_dist is not None:
        # If target is a class index
        if isLongTensor(target):
            # one hot encoding (add number of classes) into binary vector, ensure type is the same as input
            target = oneHotEncoding(target, n_classes).type_as(inputs)
        # If smooth_dist dimension < target dimension    
        if smooth_dist.dim() < target.dim():
            #  Add dimension to smooth dist to make sure it can adjust to match the target dimension
            smooth_dist = smooth_dist.unsqueeze(0)
        # Linear interpolation, blends target with smooth dist using smooth epsilon
        target.lerp_(smooth_dist, smooth_eps)
    
    
    
    # WEIGHTED LOGARITHMIC SOFTMAX
    # If weight is provided (to apply different importance to different class)
    if weight is not None:
        # Add dimension to the weight to accomodate  logSoftMax shape on the first dimension
        # Multiplies logSoftMax with class weight
        logSoftMax = logSoftMax * weight.unsqueeze(0)
    
    
    
    # LOSS COMPUTATION
    # If target is a class index
    if isLongTensor(target):
        # Smoothening amount = smoothening epsilon / number of classes
        eps_sum = smooth_eps / n_classes
        # Negative log likelihood
        eps_nll = 1. - eps_sum - smooth_eps
        # Extract log probability of the target class from logSoftMax
        likelihood = logSoftMax.gather(dim=-1, index=target.unsqueeze(-1)).squeeze(-1)
        # Weighted negative log likelihood of true class (eps_nll * likelihood)
        # Then compute the contribution of label smoothening
        # Finally, set to negative to get loss
        loss = -(eps_nll * likelihood + eps_sum * logSoftMax.sum(-1))
    else:
        loss = -(target * logSoftMax).sum(-1)
    
    
    
    # If ignoreIndexMask is provided
    if ignoreIndexMask is not None:
        # Ignore specific value
        loss.masked_fill_(ignoreIndexMask, 0)
    
    
    
    # REDUCTION SET
    # If reduction is set to sum
    if reduction == 'sum':
        # Loss will be sum of all loss values
        loss = loss.sum()
    # If reduction is set to mean
    elif reduction == 'mean':
        # If no index ignored
        if ignoreIndexMask is None:
            # Simply calculate mean of all loss
            loss = loss.mean()
        # If present
        else:
            # loss will be the sum of all loss / Number of unignored elements
            loss = loss.sum() / float(loss.size(0) - ignoreIndexMask.sum())

    return loss

In [40]:
# One hot encoding (represent categorical data as binary vector)
def oneHotEncoding(categoryIndex, n_possibleCategories=None, ignore_index=None):

    # If number of possible categories not provided, 
    if n_possibleCategories is None:
        # Maximum category index set to the maximum value in category index + 1
        n_possibleCategories = categoryIndex.max() + 1
        
    # Index size list to determine output shape
    indexSize = list(categoryIndex.size())
    
    # Create new output tensor, set to uint8,
    # Unpack and resize indexSize to add n_possibleCategories
    # Init them with 0s
    output = categoryIndex.new().byte().resize_(*indexSize, n_possibleCategories).zero_()
    
    # Set specific position (innermost dimension) of the output to 1 based on category index
    output.scatter_(-1, categoryIndex.unsqueeze(-1), 1)
    
    # If ignore index is not none and negative
    if ignore_index is not None and ignore_index >= 0:
        # Boolean mask, returns true if category index == ignore_index, vice versa
        # Replace with 0 where the output is true
        output.masked_fill_(categoryIndex.eq(ignore_index).unsqueeze(-1), 0)
    return output

In [41]:
# Check if it is a long tensor to determine if it is a class index
def isLongTensor(target):
    # If it target has attribute data
    if hasattr(target, 'data'):
        target = target.data
    
    # Return true if its either CPU tensor or GPU tensor, else return false
    return isinstance(target, torch.LongTensor) or isinstance(target, torch.cuda.LongTensor)

In [42]:
def maskedNegativeLogLikelihoodLoss(inputPrediction, target, mask):
    n_validTotal = mask.sum()
    
    # Init CrossEntropyLoss class
    crossEntropy = CrossEntropyLoss(ignore_index = trgPaddingIndexInput, smooth_eps=0.15)
    # Cross entropy loss computation with inputPrediction and target
    loss = crossEntropy(inputPrediction, target)
    loss = loss.to(device)
    # Return loss and the number of valid elements (converted into int)
    return loss, n_validTotal.item()

In [43]:
criterion = maskedNegativeLogLikelihoodLoss

### Training

In [44]:
from tqdm import tqdm

# Target mask to handle certain tokens and prevent future tokens
def createTrgMask(trg):
    
    # Boolean padding mask where padding tokens are set to false, otherwise true
    # Added dimension to allow broadcasting into attention mechanism
    trgPaddingMask = (trg != trgPaddingIndexInput).unsqueeze(1).unsqueeze(2)
    
    # Target length
    trgLength = trg.shape[1]
    
    # Look ahead mask to make sure that the model focus on the current and previous position, not future positions
    # Creates a lower triangular matrix and convert the content into boolean values
    # Triangular matrix is used since it ensure that each token can only "focus" on itself and the tokens before it
    # Simulating real predictions
    lookaheadMask = torch.tril(torch.ones((trgLength, trgLength), device = device)).bool()
    
    # Combine mask into target mask
    trgMask = trgPaddingMask & lookaheadMask

    return trgMask

In [45]:
def train(model, iterator, optimizer, criterion, clip):
    
    # Set to training mode
    model.train()
    
    # Number of tokens processed
    processedTokens = 0
    # Loss values
    totalLoss = []
    
    # For each batch (use tqdm for progress bar)
    for i, batch in tqdm(enumerate(iterator), total=len(iterator)):
        
        loss = 0
        
        # Rearrange source and target to match model input shape: (sequence length, batch size)
        src = batch.Input.permute(1, 0)
        trg = batch.Output.permute(1, 0)
        
        # Target Mask
        trgMask = createTrgMask(trg)
        
        # Reset optimizer gradient
        optimizer.zero_grad()
        
        # Forward pass, target shifted to predicts next token
        output, _ = model(src, trg[:, :-1])  
        
        # Get output dimension
        outputDim = output.shape[-1]
        # Reshape into [batch size * trg len - 1, output dim]
        output = output.contiguous().view(-1, outputDim)
        # Reshape into [batch size * trg len - 1]
        trg = trg[:, 1:].contiguous().view(-1)  
        
        # Loss computation
        loss, n_validTotal = criterion(output, trg, trgMask)
        
        # Back propagation
        loss.backward()
        
        # Gradient clipping to avoid gradient getting too big
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        # Optimization step
        optimizer.step()
        
        totalLoss.append(loss.item() * n_validTotal)
        processedTokens += n_validTotal
        
    return sum(totalLoss) / processedTokens

In [46]:
def evaluate(model, iterator, criterion):
    
    # Set to evaluation mode
    model.eval()
    
    processedTokens = 0
    totalLoss = []
    
    with torch.no_grad():
        
        # For each batch (use tqdm for progress bar)
        for i, batch in tqdm(enumerate(iterator), total=len(iterator)):
            
            # Rearrange source and target to match model input shape: (sequence length, batch size)
            src = batch.Input.permute(1, 0)
            trg = batch.Output.permute(1, 0)
            
            # Target Mask
            trgMask = createTrgMask(trg)
            
            # Forward pass, target shifted to predicts next token
            output, _ = model(src, trg[:,:-1])
            
            # Get output dimension
            outputDim = output.shape[-1]
            # Reshape into [batch size * trg len - 1, output dim]
            output = output.contiguous().view(-1, outputDim)
            # Reshape into [batch size * trg len - 1]
            trg = trg[:,1:].contiguous().view(-1)
            
            # Loss computation
            loss, n_validTotal = criterion(output, trg, trgMask)
            
            totalLoss.append(loss.item() * n_validTotal)
            processedTokens += n_validTotal

    return sum(totalLoss) / processedTokens

In [47]:
def epochTime(startTime, endTime):
    elapsedTime = endTime - startTime
    elapsedMins = int(elapsedTime / 60)
    elapsedSecs = int(elapsedTime - (elapsedMins * 60))

    return elapsedMins, elapsedSecs

In [48]:
import time

# Number of epochs
n_epochs = 30
# Gradient clipping value
clip = 1
# Save best validation loss
bestValidationLoss = float('inf')

# For each epoch
for epoch in range(n_epochs):
    
    # Set start time
    startTime = time.time()
    
    # Training and Validation data
    trainExList = []
    valExList = []
    
    
    # For each rows in train_df
    for i in range(train_df.shape[0]):
        try:
            # Create examples for the training data and append it to trainExList
            trainEx = data.Example.fromlist([train_df.question[i], train_df.answer[i]], fields)
            trainExList.append(trainEx)
        except Exception as e:
            pass


    # For each rows in val_df
    for i in range(val_df.shape[0]):
        try:
            # Create examples for the validation data and append it to valExExList
            valEx = data.Example.fromlist([val_df.question[i], val_df.answer[i]], fields)
            valExList.append(valEx)
        except Exception as e:
            pass
      
    
    # Dataset creation
    trainingDataset = data.Dataset(trainExList, fields)
    validationDataset =  data.Dataset(valExList, fields)
    
    # Set batch size
    batchSize = 16
    
    # Bucket iterator
    trainingIterator, validationIterator = BucketIterator.splits((trainingDataset, validationDataset), 
                                                                 batch_size = batchSize, 
                                                                 sort_key = lambda x: len(x.Input),
                                                                 sort_within_batch=True, 
                                                                 device = device)
    
    # Set training loss and validation loss
    trainingLoss = train(model, trainingIterator, optimizer, criterion, clip)
    validationLoss = evaluate(model, validationIterator, criterion)
    
    endTime = time.time()
    
    mins, secs = epochTime(startTime, endTime)
    
    if validationLoss < bestValidationLoss:
        bestValidationLoss = validationLoss
        torch.save(model.state_dict(), 'Model/FYPSeq2SeqModel.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {mins}m {secs}s')
    print(f'\tTraining Loss: {trainingLoss:.3f} | Training perplexity: {math.exp(trainingLoss):7.2f}')
    print(f'\t Validation Loss: {validationLoss:.3f} |  Validation perplexity: {math.exp(validationLoss):7.2f}')

100%|██████████| 157/157 [00:04<00:00, 37.55it/s]
100%|██████████| 20/20 [00:00<00:00, 81.18it/s]


Epoch: 01 | Time: 0m 11s
	Training Loss: 4.943 | Training perplexity:  140.22
	 Validation Loss: 4.272 |  Validation perplexity:   71.65


100%|██████████| 157/157 [00:04<00:00, 38.50it/s]
100%|██████████| 20/20 [00:00<00:00, 88.03it/s]


Epoch: 02 | Time: 0m 11s
	Training Loss: 3.971 | Training perplexity:   53.01
	 Validation Loss: 4.026 |  Validation perplexity:   56.06


100%|██████████| 157/157 [00:03<00:00, 40.28it/s]
100%|██████████| 20/20 [00:00<00:00, 90.13it/s]


Epoch: 03 | Time: 0m 11s
	Training Loss: 3.727 | Training perplexity:   41.55
	 Validation Loss: 3.924 |  Validation perplexity:   50.62


100%|██████████| 157/157 [00:03<00:00, 39.42it/s]
100%|██████████| 20/20 [00:00<00:00, 90.44it/s]


Epoch: 04 | Time: 0m 11s
	Training Loss: 3.566 | Training perplexity:   35.36
	 Validation Loss: 3.829 |  Validation perplexity:   46.00


100%|██████████| 157/157 [00:03<00:00, 40.67it/s]
100%|██████████| 20/20 [00:00<00:00, 81.76it/s] 


Epoch: 05 | Time: 0m 11s
	Training Loss: 3.438 | Training perplexity:   31.13
	 Validation Loss: 3.784 |  Validation perplexity:   43.99


100%|██████████| 157/157 [00:03<00:00, 40.44it/s]
100%|██████████| 20/20 [00:00<00:00, 89.08it/s]


Epoch: 06 | Time: 0m 11s
	Training Loss: 3.320 | Training perplexity:   27.67
	 Validation Loss: 3.769 |  Validation perplexity:   43.33


100%|██████████| 157/157 [00:03<00:00, 39.51it/s]
100%|██████████| 20/20 [00:00<00:00, 90.78it/s]


Epoch: 07 | Time: 0m 11s
	Training Loss: 3.224 | Training perplexity:   25.13
	 Validation Loss: 3.705 |  Validation perplexity:   40.64


100%|██████████| 157/157 [00:04<00:00, 37.41it/s]
100%|██████████| 20/20 [00:00<00:00, 91.50it/s]


Epoch: 08 | Time: 0m 14s
	Training Loss: 3.145 | Training perplexity:   23.22
	 Validation Loss: 3.674 |  Validation perplexity:   39.39


100%|██████████| 157/157 [00:04<00:00, 38.11it/s]
100%|██████████| 20/20 [00:00<00:00, 91.55it/s]


Epoch: 09 | Time: 0m 11s
	Training Loss: 3.052 | Training perplexity:   21.16
	 Validation Loss: 3.677 |  Validation perplexity:   39.55


100%|██████████| 157/157 [00:03<00:00, 40.99it/s]
100%|██████████| 20/20 [00:00<00:00, 86.09it/s] 


Epoch: 10 | Time: 0m 11s
	Training Loss: 2.987 | Training perplexity:   19.82
	 Validation Loss: 3.668 |  Validation perplexity:   39.18


100%|██████████| 157/157 [00:03<00:00, 40.36it/s]
100%|██████████| 20/20 [00:00<00:00, 90.29it/s]


Epoch: 11 | Time: 0m 11s
	Training Loss: 2.921 | Training perplexity:   18.55
	 Validation Loss: 3.611 |  Validation perplexity:   37.00


100%|██████████| 157/157 [00:03<00:00, 40.39it/s]
100%|██████████| 20/20 [00:00<00:00, 90.17it/s]


Epoch: 12 | Time: 0m 11s
	Training Loss: 2.865 | Training perplexity:   17.55
	 Validation Loss: 3.581 |  Validation perplexity:   35.91


100%|██████████| 157/157 [00:03<00:00, 39.81it/s]
100%|██████████| 20/20 [00:00<00:00, 90.72it/s]


Epoch: 13 | Time: 0m 11s
	Training Loss: 2.819 | Training perplexity:   16.77
	 Validation Loss: 3.585 |  Validation perplexity:   36.05


100%|██████████| 157/157 [00:03<00:00, 41.03it/s]
100%|██████████| 20/20 [00:00<00:00, 89.29it/s]


Epoch: 14 | Time: 0m 11s
	Training Loss: 2.762 | Training perplexity:   15.84
	 Validation Loss: 3.551 |  Validation perplexity:   34.86


100%|██████████| 157/157 [00:04<00:00, 38.94it/s]
100%|██████████| 20/20 [00:00<00:00, 90.98it/s]


Epoch: 15 | Time: 0m 11s
	Training Loss: 2.724 | Training perplexity:   15.24
	 Validation Loss: 3.596 |  Validation perplexity:   36.45


100%|██████████| 157/157 [00:03<00:00, 40.12it/s]
100%|██████████| 20/20 [00:00<00:00, 91.12it/s]


Epoch: 16 | Time: 0m 11s
	Training Loss: 2.671 | Training perplexity:   14.45
	 Validation Loss: 3.571 |  Validation perplexity:   35.56


100%|██████████| 157/157 [00:03<00:00, 39.92it/s]
100%|██████████| 20/20 [00:00<00:00, 91.52it/s]


Epoch: 17 | Time: 0m 11s
	Training Loss: 2.632 | Training perplexity:   13.91
	 Validation Loss: 3.538 |  Validation perplexity:   34.38


100%|██████████| 157/157 [00:04<00:00, 38.85it/s]
100%|██████████| 20/20 [00:00<00:00, 89.89it/s]


Epoch: 18 | Time: 0m 11s
	Training Loss: 2.605 | Training perplexity:   13.53
	 Validation Loss: 3.546 |  Validation perplexity:   34.68


100%|██████████| 157/157 [00:03<00:00, 40.35it/s]
100%|██████████| 20/20 [00:00<00:00, 90.30it/s]


Epoch: 19 | Time: 0m 11s
	Training Loss: 2.575 | Training perplexity:   13.13
	 Validation Loss: 3.565 |  Validation perplexity:   35.33


100%|██████████| 157/157 [00:03<00:00, 40.99it/s]
100%|██████████| 20/20 [00:00<00:00, 91.12it/s]


Epoch: 20 | Time: 0m 11s
	Training Loss: 2.544 | Training perplexity:   12.73
	 Validation Loss: 3.567 |  Validation perplexity:   35.42


100%|██████████| 157/157 [00:04<00:00, 39.04it/s]
100%|██████████| 20/20 [00:00<00:00, 88.82it/s]


Epoch: 21 | Time: 0m 11s
	Training Loss: 2.516 | Training perplexity:   12.37
	 Validation Loss: 3.576 |  Validation perplexity:   35.74


100%|██████████| 157/157 [00:04<00:00, 38.48it/s]
100%|██████████| 20/20 [00:00<00:00, 90.88it/s]


Epoch: 22 | Time: 0m 11s
	Training Loss: 2.488 | Training perplexity:   12.04
	 Validation Loss: 3.513 |  Validation perplexity:   33.56


100%|██████████| 157/157 [00:04<00:00, 38.20it/s]
100%|██████████| 20/20 [00:00<00:00, 91.09it/s]


Epoch: 23 | Time: 0m 11s
	Training Loss: 2.476 | Training perplexity:   11.89
	 Validation Loss: 3.563 |  Validation perplexity:   35.25


100%|██████████| 157/157 [00:03<00:00, 39.94it/s]
100%|██████████| 20/20 [00:00<00:00, 86.34it/s]


Epoch: 24 | Time: 0m 11s
	Training Loss: 2.464 | Training perplexity:   11.75
	 Validation Loss: 3.503 |  Validation perplexity:   33.22


100%|██████████| 157/157 [00:04<00:00, 37.53it/s]
100%|██████████| 20/20 [00:00<00:00, 90.35it/s]


Epoch: 25 | Time: 0m 11s
	Training Loss: 2.448 | Training perplexity:   11.57
	 Validation Loss: 3.528 |  Validation perplexity:   34.04


100%|██████████| 157/157 [00:04<00:00, 38.07it/s]
100%|██████████| 20/20 [00:00<00:00, 89.44it/s]


Epoch: 26 | Time: 0m 11s
	Training Loss: 2.408 | Training perplexity:   11.12
	 Validation Loss: 3.572 |  Validation perplexity:   35.58


100%|██████████| 157/157 [00:03<00:00, 40.31it/s]
100%|██████████| 20/20 [00:00<00:00, 90.74it/s]


Epoch: 27 | Time: 0m 11s
	Training Loss: 2.412 | Training perplexity:   11.15
	 Validation Loss: 3.520 |  Validation perplexity:   33.78


100%|██████████| 157/157 [00:03<00:00, 39.78it/s]
100%|██████████| 20/20 [00:00<00:00, 85.97it/s] 


Epoch: 28 | Time: 0m 11s
	Training Loss: 2.394 | Training perplexity:   10.95
	 Validation Loss: 3.603 |  Validation perplexity:   36.70


100%|██████████| 157/157 [00:04<00:00, 39.15it/s]
100%|██████████| 20/20 [00:00<00:00, 89.67it/s]


Epoch: 29 | Time: 0m 11s
	Training Loss: 2.363 | Training perplexity:   10.62
	 Validation Loss: 3.565 |  Validation perplexity:   35.35


100%|██████████| 157/157 [00:04<00:00, 38.41it/s]
100%|██████████| 20/20 [00:00<00:00, 89.58it/s]

Epoch: 30 | Time: 0m 11s
	Training Loss: 2.359 | Training perplexity:   10.58
	 Validation Loss: 3.562 |  Validation perplexity:   35.23






### Translation - Evaluation

In [49]:
# Translation!!!! YESS!!!
def translateSentence(inputSentence, srcLanguageField, trgLanguageField, model, device, maxLength = 50000):
    
    # Set to evaluation mode
    model.eval()
    
    # Tokenization
    # If input sentence is a string
    if isinstance(inputSentence, str):
        # Tokenize with spacy
        nlp = spacy.load('en')
        tokens = [token.text.lower() for token in nlp(inputSentence)]
    # If input are tokens
    else:
        # Convert into lower case
        tokens = [token.lower() for token in inputSentence]
    
    
    # Tokens are set to '' + token + ''
    tokens = [srcLanguageField.init_token] + tokens + [srcLanguageField.eos_token]
    
    # Convert token into numerical index using src vocab string to index mapping
    srcIndexes = [srcLanguageField.vocab.stoi[token] for token in tokens]
    
    # Convert into tensor and add a dimension (batch size) on the first dimension 
    srcTensor = torch.LongTensor(srcIndexes).unsqueeze(0).to(device)
    
    # Create source mask
    srcMask = model.createSrcMask(srcTensor)
    
    # Encoding
    # Source tensor is passed through the encoder
    with torch.no_grad():
        encoderSrc = model.encoder(srcTensor, srcMask)
    
    # Init target sequence from trg vocab with regards to the init_token
    trgIndexes = [trgLanguageField.vocab.stoi[trgLanguageField.init_token]]
    
    
    # Iterate up to max length
    for i in range(maxLength):
        # Convert sequence into tensor
        trgTensor = torch.LongTensor(trgIndexes).unsqueeze(0).to(device)
        # Create padding mask and lookahead mask
        trgMask = model.createTrgMask(trgTensor)
        
        # Decoding
        with torch.no_grad():
            # Generates output token and attention using the current target sequence and encoded src
            output, attention = model.decoder(trgTensor, encoderSrc, trgMask, srcMask)
        
        # Predicted token, takes token with the highest probability
        predictedToken = output.argmax(2)[:,-1].item()
        # Add it to target sequence
        trgIndexes.append(predictedToken)
        
        # Break out of the iterative loop if eos token is generated
        if predictedToken == trgLanguageField.vocab.stoi[trgLanguageField.eos_token]:
            break
    
    # Indexes are converted back to string
    trgTokens = [trgLanguageField.vocab.itos[i] for i in trgIndexes]
    
    # Return the output
    return trgTokens[1:], attention

In [50]:
model.load_state_dict(torch.load('Model/FYPSeq2SeqModel.pt'))

  model.load_state_dict(torch.load('Model/FYPSeq2SeqModel.pt'))


<All keys matched successfully>

In [51]:
inputSequence = Input
targetSequence = Output

In [52]:
def Translate4MePlz_en2py(src):
    src = src.split(" ")
    translation, attention = translateSentence(src, inputSequence, targetSequence, model, device)

    print(f'Here is the translation for you kind person : \n')
    print(untokenize(translation[:-1]).decode('utf-8'))

In [53]:
translateMe = 'Add two numbers'
Translate4MePlz_en2py(translateMe)

Here is the translation for you kind person : 

num1 =1.5 
num2 =6.3 
sum =num1 +num2 
print (f'Sum: {sum}')


In [54]:
translateMe2 = 'Write a python script to generates random numbers between 0 and 9 that are divisible by 3'
Translate4MePlz_en2py(translateMe2)

Here is the translation for you kind person : 

def var1 (var2 ):
    if var2 ==0 :
        return True 
    else :
        return False 


In [55]:
translateMe3 = 'Write a Python program to calculate the average of a list of positive integers and output the result'
Translate4MePlz_en2py(translateMe3)

Here is the translation for you kind person : 


import var1 

var1 =[]

var2 =[]
for i in range (0 ,len (var1 )):
    var2 +=1 

for j in range (0 ,len (var1 )):
    var2 +=var1 [j ]

print (var2 )


In [56]:
translateMe4 = 'function in Python that prints out the Pascal triangle for a given number of rows'
Translate4MePlz_en2py(translateMe4)

Here is the translation for you kind person : 

def var1 (var2 ):
    for i in range (len (var2 )):
        var2 [i ]=var2 [i ]
        var2 [i ]=var2 [i ]
        while var2 >0 and var2 %2 ==0 :
            var2 +=1 
            var2 +=1 
            var2 [var2 -1 
    return var2 




In [57]:
translateMe5 = 'Calculate the volume of a cube'
Translate4MePlz_en2py(translateMe5)

Here is the translation for you kind person : 

var1 =10 
**3 
