In [1]:
import os
import re

In [2]:
DIR_path = "D:\MSc Data Science\Advanced Modules\[INF-DSAM1B] Advanced Machine Learning B\Deep learning for NLP\Project\Machine translation with attention"
english_data_path = "Data\\es-en"
spanish_data_path = "Data\\es-en"

In [3]:
# 'utf-8' removes b'' character string literal
# splitlines() remove newline character
with open(os.path.join(DIR_path, english_data_path, "europarl-v7.es-en.en"), "rb") as f:
    content_english = f.read().decode("utf-8").splitlines()


In [4]:
with open(os.path.join(DIR_path, spanish_data_path, "europarl-v7.es-en.es"), "rb") as f:
    content_spanish = f.read().decode("utf-8").splitlines()

In [5]:
def sent_preprocess(sentence):
    sentence=sentence.lower()             
    sentence = re.sub(r"[-,.!?()]+", r"", sentence)
    return sentence

_patterns = [r'\'',
             r'\"',
             r'\.',
             r'<br \/>',
             r',',
             r'\(',
             r'\)',
             r'\!',
             r'\?',
             r'\;',
             r'\:',
             r'\s+']

_replacements = [' \'  ',
                 '',
                 ' . ',
                 ' ',
                 ' , ',
                 ' ( ',
                 ' ) ',
                 ' ! ',
                 ' ? ',
                 ' ',
                 ' ',
                 ' ']

_patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))

def sentence_preprocess(sentence):
    """https://pytorch.org/text/_modules/torchtext/data/utils.html"""
    sentence = sentence.lower()
    for pattern_re, replaced_str in _patterns_dict:
        sentence = pattern_re.sub(replaced_str, sentence)
    
    return sentence

In [6]:
# preprocess the english sentence
sentence_english = []
for sent in content_english:
    sentence_english.append(sentence_preprocess(sent))
print("total english sentences: ", len(sentence_english))

total english sentences:  1965734


In [7]:
# preprocess the spanish sentence
sentence_spanish = []
for sent in content_spanish:
    sentence_spanish.append(sentence_preprocess(sent))
print("total spanish sentences: ", len(sentence_spanish))

total spanish sentences:  1965734


### Tokenize the data

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
from tqdm import tqdm
import pickle

In [9]:
# Loop over wach of the sentence and tokenize eacch sentenec separately.
# will take some time to tokenize each sentence.
english_tokenized_text = [ nltk.word_tokenize(sentence_english[i], language="english") for i in tqdm(range(len(sentence_english[:500]))) ]

# create word index
# assign each word a number.
word_to_index_eng = {}
words=[]
for sentence in english_tokenized_text:
    for word in sentence:
        words.append(word)
UNIQUE_WORDS = set(words)

for index, word in enumerate(UNIQUE_WORDS):
    word_to_index_eng[word] = index

# add tokens: <SOS> and <EOS>
word_to_index_eng["<SOS>"] = list(word_to_index_eng.values())[-1] + 1
word_to_index_eng["<EOS>"] = list(word_to_index_eng.values())[-1] + 1

# using word index, create tensor
# convert each of the sentence into numbers.
english_tokenized_tensor = []

for sentence in english_tokenized_text:
    #english_tokenized_tensor.append( [word_to_index[word] for word in sentence]  )
    tensor_list=[]
    tensor_list.append(word_to_index_eng["<SOS>"])
    tensor_list = tensor_list + [word_to_index_eng[word] for word in sentence]
    tensor_list.append(word_to_index_eng["<EOS>"])

    english_tokenized_tensor.append(torch.tensor(tensor_list, dtype=torch.long))

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 1828.11it/s]


In [10]:
# Loop over wach of the sentence and tokenize eacch sentenec separately.
# will take some time to tokenize each sentence.
spanish_tokenized_text = [ nltk.word_tokenize(sentence_spanish[i], language="spanish") for i in tqdm(range(len(sentence_spanish[:500]))) ]

# create word index
# assign each word a number.
word_to_index_spanish = {}
words=[]
for sentence in spanish_tokenized_text:
    for word in sentence:
        words.append(word)
UNIQUE_WORDS = set(words)

for index, word in enumerate(UNIQUE_WORDS):
    word_to_index_spanish[word] = index

# add tokens: <SOS> and <EOS>
word_to_index_spanish["<SOS>"] = list(word_to_index_spanish.values())[-1] + 1
word_to_index_spanish["<EOS>"] = list(word_to_index_spanish.values())[-1] + 1

# using word index, create tensor
# convert each of the sentence into numbers.
spanish_tokenized_tensor = []

for sentence in spanish_tokenized_text:
    
    tensor_list=[]
    tensor_list.append(word_to_index_spanish["<SOS>"])
    tensor_list = tensor_list + [word_to_index_spanish[word] for word in sentence]
    tensor_list.append(word_to_index_spanish["<EOS>"])
    spanish_tokenized_tensor.append(torch.tensor(tensor_list, dtype=torch.long))


100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 1414.93it/s]


# Model

In [11]:
# For cuda.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
BATCH_SIZE = 16
MAX_SENTENCE_LENGTH = 50
EMBEDDING_DIM = 30
HIDDEN_DIM = 1000
DECODER_HIDDEN_DIM = 1000
EPOCHS = 1

EMBEDDING_SIZE_ENGLISH = len(word_to_index_eng)
EMBEDDING_SIZE_SPANISH = len(word_to_index_spanish)

In [18]:
def padding(english_tokenized_tensor, spanish_tokenized_tensor):
    
    spanish_batch=[]
    for batch in spanish_tokenized_tensor:
        if(len(batch) > MAX_SENTENCE_LENGTH):
            batch = batch[0:MAX_SENTENCE_LENGTH]
        if( (MAX_SENTENCE_LENGTH - batch.shape[0]) != 0 ) :
            spanish_batch.append(torch.cat((batch, torch.LongTensor([0]).repeat(MAX_SENTENCE_LENGTH - len(batch))), dim=0) )
        else:
            spanish_batch.append(batch)
       
    english_batch=[]
    for batch in english_tokenized_tensor:
        # only consider top 50 words in each sentences.
        if(len(batch) > MAX_SENTENCE_LENGTH):
            batch = batch[0:MAX_SENTENCE_LENGTH]
        if( (MAX_SENTENCE_LENGTH - batch.shape[0]) != 0 ) :
            english_batch.append(torch.cat((batch, torch.LongTensor([0]).repeat(MAX_SENTENCE_LENGTH - len(batch))), dim=0) )
        else:
            english_batch.append(batch)

    # the first element of the batchholds the first elements of every sequence in the batch. 
    # the second element of the batch is going to hold the second element of every sequence in the batch, and so on.
    # shape : [sequence_length, batch_size]
    spanish_batch = torch.transpose(torch.stack(spanish_batch), 0, 1)
    english_batch = torch.transpose(torch.stack(english_batch), 0, 1)
    
    return english_batch, spanish_batch

In [19]:
class Encoder(nn.Module):
    def __init__(self, EMBEDDING_DIM, EMBEDDING_SIZE_ENGLISH, HIDDEN_DIM, DECODER_HIDDEN_DIM):
        super(Encoder, self).__init__()

        self.embedding_dim = EMBEDDING_DIM
        self.embedding_size_english = EMBEDDING_SIZE_ENGLISH
        self.hidden_dim = HIDDEN_DIM


        self.embedding = nn.Embedding( self.embedding_size_english , self.embedding_dim)

        self.rnn = nn.RNN(self.embedding_dim, self.hidden_dim, bidirectional =True)

        self.linear = nn.Linear(self.hidden_dim*2, DECODER_HIDDEN_DIM)

    def forward(self, inputs):
        # WRITE CODE HERE
        # input : [sequence_len, batch_size]
        embeds = self.embedding(inputs)
        # https://stackoverflow.com/questions/49466894/how-to-correctly-give-inputs-to-embedding-lstm-and-linear-layers-in-pytorch
        # output of the embedding is [seq_len, batch_size, embedding_size]

        #input: [seq_len, batch_size, embedding_size]
        rnn_enc, hidden = self.rnn(embeds)
        # rnn out shape : [sequence_len, batch_size, hidden_dim*2]
        # hidden shape   : [2, batch_size, hidden_dim]

        # concatenate both forward and backward hidden vectors
        hidden_f_b = torch.cat((hidden[0,:,:], hidden[1,:,:]), dim = 1)
        # output shape: [batch_size, hidden_dim*2]

        # input: [batch_size, hidden_dim*2]
        hidden_enc = self.linear(hidden_f_b)
        # output: [batch_size, DECODER_HIDDEN_DIM]

        hidden_enc = torch.tanh(hidden_enc)

        return rnn_enc, hidden_enc

In [20]:
class Attention(nn.Module):
    def __init__(self, DECODER_HIDDEN_DIM, HIDDEN_DIM):
        super(Attention, self).__init__()


    def forward(self, rnn_enc, hidden_enc):
        # WRITE CODE HERE

        # rnn_enc output shape : [sequence_len, batch_size, hidden_dim*2]
        # hidden_enc shape:      [batch_size, DECODER_HIDDEN_DIM]

        # repeat encoder hidden state sequence_len Time.
        #print("seq len", rnn_enc.shape[0])
        #print(hidden_enc.shape)
        hidden_enc = torch.unsqueeze(hidden_enc, 1)
        # [batch_size, 1, DECODER_HIDDEN_DIM]

        hidden_enc = hidden_enc.repeat(1, rnn_enc.shape[0], 1)
        # [batch_size, sequence_len, DECODER_HIDDEN_DIM]


        
        print(hidden_enc)

In [21]:
# encoder model
model = Encoder(EMBEDDING_DIM, EMBEDDING_SIZE_ENGLISH, HIDDEN_DIM, DECODER_HIDDEN_DIM)
model.to(device)

model_attn = Attention(DECODER_HIDDEN_DIM, HIDDEN_DIM)
model_attn.to(device)

optimizer = optim.Adam(model.parameters(), lr = 0.001)
loss_function = nn.NLLLoss()

In [22]:
for epochs in range(EPOCHS):
    total_loss = 0
    model.to(device)
    for i in range(0, len(english_tokenized_tensor), BATCH_SIZE):
        english_batch = english_tokenized_tensor[i:i+BATCH_SIZE] 
        spanish_batch = spanish_tokenized_tensor[i:i+BATCH_SIZE] 

        en, es = padding(english_batch, spanish_batch) # shape : [sequence_length, batch_size]

        en = en.to(device)
        es = es.to(device)

        encoder_output, enc_hidden = model(en)
        
        model_attn(encoder_output, enc_hidden)

tensor([[[-0.0422, -0.0480,  0.0376,  ..., -0.0148,  0.0584, -0.1154],
         [-0.0422, -0.0480,  0.0376,  ..., -0.0148,  0.0584, -0.1154],
         [-0.0422, -0.0480,  0.0376,  ..., -0.0148,  0.0584, -0.1154],
         ...,
         [-0.0422, -0.0480,  0.0376,  ..., -0.0148,  0.0584, -0.1154],
         [-0.0422, -0.0480,  0.0376,  ..., -0.0148,  0.0584, -0.1154],
         [-0.0422, -0.0480,  0.0376,  ..., -0.0148,  0.0584, -0.1154]],

        [[ 0.0487, -0.0785,  0.0292,  ..., -0.0381,  0.0597, -0.0567],
         [ 0.0487, -0.0785,  0.0292,  ..., -0.0381,  0.0597, -0.0567],
         [ 0.0487, -0.0785,  0.0292,  ..., -0.0381,  0.0597, -0.0567],
         ...,
         [ 0.0487, -0.0785,  0.0292,  ..., -0.0381,  0.0597, -0.0567],
         [ 0.0487, -0.0785,  0.0292,  ..., -0.0381,  0.0597, -0.0567],
         [ 0.0487, -0.0785,  0.0292,  ..., -0.0381,  0.0597, -0.0567]],

        [[-0.0326, -0.0300,  0.0417,  ...,  0.0114,  0.0195, -0.1119],
         [-0.0326, -0.0300,  0.0417,  ...,  0

       device='cuda:0', grad_fn=<RepeatBackward>)
tensor([[[-0.0509, -0.0316,  0.0215,  ..., -0.0643,  0.0201, -0.0918],
         [-0.0509, -0.0316,  0.0215,  ..., -0.0643,  0.0201, -0.0918],
         [-0.0509, -0.0316,  0.0215,  ..., -0.0643,  0.0201, -0.0918],
         ...,
         [-0.0509, -0.0316,  0.0215,  ..., -0.0643,  0.0201, -0.0918],
         [-0.0509, -0.0316,  0.0215,  ..., -0.0643,  0.0201, -0.0918],
         [-0.0509, -0.0316,  0.0215,  ..., -0.0643,  0.0201, -0.0918]],

        [[-0.0307, -0.0406,  0.0007,  ..., -0.0680,  0.0110, -0.0817],
         [-0.0307, -0.0406,  0.0007,  ..., -0.0680,  0.0110, -0.0817],
         [-0.0307, -0.0406,  0.0007,  ..., -0.0680,  0.0110, -0.0817],
         ...,
         [-0.0307, -0.0406,  0.0007,  ..., -0.0680,  0.0110, -0.0817],
         [-0.0307, -0.0406,  0.0007,  ..., -0.0680,  0.0110, -0.0817],
         [-0.0307, -0.0406,  0.0007,  ..., -0.0680,  0.0110, -0.0817]],

        [[-0.0304, -0.0286,  0.0090,  ..., -0.0737,  0.0427, -0.08

tensor([[[ 0.0080, -0.0508,  0.0041,  ..., -0.0675,  0.0732, -0.1240],
         [ 0.0080, -0.0508,  0.0041,  ..., -0.0675,  0.0732, -0.1240],
         [ 0.0080, -0.0508,  0.0041,  ..., -0.0675,  0.0732, -0.1240],
         ...,
         [ 0.0080, -0.0508,  0.0041,  ..., -0.0675,  0.0732, -0.1240],
         [ 0.0080, -0.0508,  0.0041,  ..., -0.0675,  0.0732, -0.1240],
         [ 0.0080, -0.0508,  0.0041,  ..., -0.0675,  0.0732, -0.1240]],

        [[-0.0252, -0.0099,  0.0264,  ..., -0.0550,  0.0203, -0.1343],
         [-0.0252, -0.0099,  0.0264,  ..., -0.0550,  0.0203, -0.1343],
         [-0.0252, -0.0099,  0.0264,  ..., -0.0550,  0.0203, -0.1343],
         ...,
         [-0.0252, -0.0099,  0.0264,  ..., -0.0550,  0.0203, -0.1343],
         [-0.0252, -0.0099,  0.0264,  ..., -0.0550,  0.0203, -0.1343],
         [-0.0252, -0.0099,  0.0264,  ..., -0.0550,  0.0203, -0.1343]],

        [[ 0.0141, -0.0402, -0.0011,  ..., -0.0531,  0.0817, -0.1280],
         [ 0.0141, -0.0402, -0.0011,  ..., -0

tensor([[[-0.0015, -0.0340,  0.0095,  ..., -0.0257,  0.0393, -0.0937],
         [-0.0015, -0.0340,  0.0095,  ..., -0.0257,  0.0393, -0.0937],
         [-0.0015, -0.0340,  0.0095,  ..., -0.0257,  0.0393, -0.0937],
         ...,
         [-0.0015, -0.0340,  0.0095,  ..., -0.0257,  0.0393, -0.0937],
         [-0.0015, -0.0340,  0.0095,  ..., -0.0257,  0.0393, -0.0937],
         [-0.0015, -0.0340,  0.0095,  ..., -0.0257,  0.0393, -0.0937]],

        [[-0.0487, -0.0231,  0.0020,  ..., -0.0527,  0.0511, -0.1198],
         [-0.0487, -0.0231,  0.0020,  ..., -0.0527,  0.0511, -0.1198],
         [-0.0487, -0.0231,  0.0020,  ..., -0.0527,  0.0511, -0.1198],
         ...,
         [-0.0487, -0.0231,  0.0020,  ..., -0.0527,  0.0511, -0.1198],
         [-0.0487, -0.0231,  0.0020,  ..., -0.0527,  0.0511, -0.1198],
         [-0.0487, -0.0231,  0.0020,  ..., -0.0527,  0.0511, -0.1198]],

        [[-0.0114, -0.0428,  0.0099,  ..., -0.0641,  0.0661, -0.0473],
         [-0.0114, -0.0428,  0.0099,  ..., -0

tensor([[[-5.2155e-02, -2.0373e-02,  1.7674e-02,  ..., -3.5449e-02,
           3.7767e-03, -4.8972e-02],
         [-5.2155e-02, -2.0373e-02,  1.7674e-02,  ..., -3.5449e-02,
           3.7767e-03, -4.8972e-02],
         [-5.2155e-02, -2.0373e-02,  1.7674e-02,  ..., -3.5449e-02,
           3.7767e-03, -4.8972e-02],
         ...,
         [-5.2155e-02, -2.0373e-02,  1.7674e-02,  ..., -3.5449e-02,
           3.7767e-03, -4.8972e-02],
         [-5.2155e-02, -2.0373e-02,  1.7674e-02,  ..., -3.5449e-02,
           3.7767e-03, -4.8972e-02],
         [-5.2155e-02, -2.0373e-02,  1.7674e-02,  ..., -3.5449e-02,
           3.7767e-03, -4.8972e-02]],

        [[ 2.4238e-02, -5.4668e-02,  3.9373e-02,  ..., -1.0556e-02,
           2.0141e-02, -8.0061e-02],
         [ 2.4238e-02, -5.4668e-02,  3.9373e-02,  ..., -1.0556e-02,
           2.0141e-02, -8.0061e-02],
         [ 2.4238e-02, -5.4668e-02,  3.9373e-02,  ..., -1.0556e-02,
           2.0141e-02, -8.0061e-02],
         ...,
         [ 2.4238e-02, -5

       device='cuda:0', grad_fn=<RepeatBackward>)
tensor([[[-0.0580, -0.0221,  0.0050,  ..., -0.0402,  0.0484, -0.1103],
         [-0.0580, -0.0221,  0.0050,  ..., -0.0402,  0.0484, -0.1103],
         [-0.0580, -0.0221,  0.0050,  ..., -0.0402,  0.0484, -0.1103],
         ...,
         [-0.0580, -0.0221,  0.0050,  ..., -0.0402,  0.0484, -0.1103],
         [-0.0580, -0.0221,  0.0050,  ..., -0.0402,  0.0484, -0.1103],
         [-0.0580, -0.0221,  0.0050,  ..., -0.0402,  0.0484, -0.1103]],

        [[-0.0529, -0.0224,  0.0030,  ..., -0.0175,  0.0481, -0.1211],
         [-0.0529, -0.0224,  0.0030,  ..., -0.0175,  0.0481, -0.1211],
         [-0.0529, -0.0224,  0.0030,  ..., -0.0175,  0.0481, -0.1211],
         ...,
         [-0.0529, -0.0224,  0.0030,  ..., -0.0175,  0.0481, -0.1211],
         [-0.0529, -0.0224,  0.0030,  ..., -0.0175,  0.0481, -0.1211],
         [-0.0529, -0.0224,  0.0030,  ..., -0.0175,  0.0481, -0.1211]],

        [[-0.0558, -0.0203,  0.0052,  ..., -0.0416,  0.0505, -0.11

       device='cuda:0', grad_fn=<RepeatBackward>)
tensor([[[-8.2053e-02, -3.7341e-02,  4.8990e-02,  ..., -4.5118e-02,
           4.9457e-02, -1.0356e-01],
         [-8.2053e-02, -3.7341e-02,  4.8990e-02,  ..., -4.5118e-02,
           4.9457e-02, -1.0356e-01],
         [-8.2053e-02, -3.7341e-02,  4.8990e-02,  ..., -4.5118e-02,
           4.9457e-02, -1.0356e-01],
         ...,
         [-8.2053e-02, -3.7341e-02,  4.8990e-02,  ..., -4.5118e-02,
           4.9457e-02, -1.0356e-01],
         [-8.2053e-02, -3.7341e-02,  4.8990e-02,  ..., -4.5118e-02,
           4.9457e-02, -1.0356e-01],
         [-8.2053e-02, -3.7341e-02,  4.8990e-02,  ..., -4.5118e-02,
           4.9457e-02, -1.0356e-01]],

        [[-5.3429e-03,  4.0926e-02,  6.2660e-02,  ..., -1.8198e-01,
           1.3304e-02, -3.5736e-02],
         [-5.3429e-03,  4.0926e-02,  6.2660e-02,  ..., -1.8198e-01,
           1.3304e-02, -3.5736e-02],
         [-5.3429e-03,  4.0926e-02,  6.2660e-02,  ..., -1.8198e-01,
           1.3304e-02, -3.

       device='cuda:0', grad_fn=<RepeatBackward>)
tensor([[[-0.0341, -0.0368,  0.0092,  ..., -0.0436,  0.0649, -0.1080],
         [-0.0341, -0.0368,  0.0092,  ..., -0.0436,  0.0649, -0.1080],
         [-0.0341, -0.0368,  0.0092,  ..., -0.0436,  0.0649, -0.1080],
         ...,
         [-0.0341, -0.0368,  0.0092,  ..., -0.0436,  0.0649, -0.1080],
         [-0.0341, -0.0368,  0.0092,  ..., -0.0436,  0.0649, -0.1080],
         [-0.0341, -0.0368,  0.0092,  ..., -0.0436,  0.0649, -0.1080]],

        [[-0.0372, -0.0553, -0.0013,  ..., -0.0426,  0.0479, -0.1225],
         [-0.0372, -0.0553, -0.0013,  ..., -0.0426,  0.0479, -0.1225],
         [-0.0372, -0.0553, -0.0013,  ..., -0.0426,  0.0479, -0.1225],
         ...,
         [-0.0372, -0.0553, -0.0013,  ..., -0.0426,  0.0479, -0.1225],
         [-0.0372, -0.0553, -0.0013,  ..., -0.0426,  0.0479, -0.1225],
         [-0.0372, -0.0553, -0.0013,  ..., -0.0426,  0.0479, -0.1225]],

        [[-0.0550, -0.0400,  0.0003,  ..., -0.0160,  0.0679, -0.12

tensor([[[-0.0656, -0.0638,  0.0393,  ..., -0.0869,  0.0665, -0.2030],
         [-0.0656, -0.0638,  0.0393,  ..., -0.0869,  0.0665, -0.2030],
         [-0.0656, -0.0638,  0.0393,  ..., -0.0869,  0.0665, -0.2030],
         ...,
         [-0.0656, -0.0638,  0.0393,  ..., -0.0869,  0.0665, -0.2030],
         [-0.0656, -0.0638,  0.0393,  ..., -0.0869,  0.0665, -0.2030],
         [-0.0656, -0.0638,  0.0393,  ..., -0.0869,  0.0665, -0.2030]],

        [[-0.0038, -0.0300,  0.0465,  ..., -0.0176,  0.0360, -0.1079],
         [-0.0038, -0.0300,  0.0465,  ..., -0.0176,  0.0360, -0.1079],
         [-0.0038, -0.0300,  0.0465,  ..., -0.0176,  0.0360, -0.1079],
         ...,
         [-0.0038, -0.0300,  0.0465,  ..., -0.0176,  0.0360, -0.1079],
         [-0.0038, -0.0300,  0.0465,  ..., -0.0176,  0.0360, -0.1079],
         [-0.0038, -0.0300,  0.0465,  ..., -0.0176,  0.0360, -0.1079]],

        [[-0.0377, -0.0617,  0.0157,  ..., -0.0490,  0.0316, -0.0797],
         [-0.0377, -0.0617,  0.0157,  ..., -0

       device='cuda:0', grad_fn=<RepeatBackward>)
