In [1]:
import os
import re

In [2]:
DIR_path = "D:\MSc Data Science\Advanced Modules\[INF-DSAM1B] Advanced Machine Learning B\Deep learning for NLP\Project\Machine translation with attention"
english_data_path = "Data\\es-en"
spanish_data_path = "Data\\es-en"

In [3]:
# 'utf-8' removes b'' character string literal
# splitlines() remove newline character
with open(os.path.join(DIR_path, english_data_path, "europarl-v7.es-en.en"), "rb") as f:
    content_english = f.read().decode("utf-8").splitlines()


In [4]:
with open(os.path.join(DIR_path, spanish_data_path, "europarl-v7.es-en.es"), "rb") as f:
    content_spanish = f.read().decode("utf-8").splitlines()

In [5]:
def sent_preprocess(sentence):
    sentence=sentence.lower()             
    sentence = re.sub(r"[-,.!?()]+", r"", sentence)
    return sentence

_patterns = [r'\'',
             r'\"',
             r'\.',
             r'<br \/>',
             r',',
             r'\(',
             r'\)',
             r'\!',
             r'\?',
             r'\;',
             r'\:',
             r'\s+']

_replacements = [' \'  ',
                 '',
                 ' . ',
                 ' ',
                 ' , ',
                 ' ( ',
                 ' ) ',
                 ' ! ',
                 ' ? ',
                 ' ',
                 ' ',
                 ' ']

_patterns_dict = list((re.compile(p), r) for p, r in zip(_patterns, _replacements))

def sentence_preprocess(sentence):
    """https://pytorch.org/text/_modules/torchtext/data/utils.html"""
    sentence = sentence.lower()
    for pattern_re, replaced_str in _patterns_dict:
        sentence = pattern_re.sub(replaced_str, sentence)
    
    return sentence

In [6]:
# preprocess the english sentence
sentence_english = []
for sent in content_english:
    sentence_english.append(sentence_preprocess(sent))
print("total english sentences: ", len(sentence_english))

total english sentences:  1965734


In [7]:
# preprocess the spanish sentence
sentence_spanish = []
for sent in content_spanish:
    sentence_spanish.append(sentence_preprocess(sent))
print("total spanish sentences: ", len(sentence_spanish))

total spanish sentences:  1965734


### Tokenize the data

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
from tqdm import tqdm
import pickle

In [9]:
# Loop over wach of the sentence and tokenize eacch sentenec separately.
# will take some time to tokenize each sentence.
english_tokenized_text = [ nltk.word_tokenize(sentence_english[i], language="english") for i in tqdm(range(len(sentence_english[:500]))) ]

# create word index
# assign each word a number.
word_to_index_eng = {}
words=[]
for sentence in english_tokenized_text:
    for word in sentence:
        words.append(word)
UNIQUE_WORDS = set(words)

for index, word in enumerate(UNIQUE_WORDS):
    word_to_index_eng[word] = index

# add tokens: <SOS> and <EOS>
word_to_index_eng["<SOS>"] = list(word_to_index_eng.values())[-1] + 1
word_to_index_eng["<EOS>"] = list(word_to_index_eng.values())[-1] + 1

# using word index, create tensor
# convert each of the sentence into numbers.
english_tokenized_tensor = []

for sentence in english_tokenized_text:
    #english_tokenized_tensor.append( [word_to_index[word] for word in sentence]  )
    tensor_list=[]
    tensor_list.append(word_to_index_eng["<SOS>"])
    tensor_list = tensor_list + [word_to_index_eng[word] for word in sentence]
    tensor_list.append(word_to_index_eng["<EOS>"])

    english_tokenized_tensor.append(torch.tensor(tensor_list, dtype=torch.long))

100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4774.61it/s]


In [10]:
# Loop over wach of the sentence and tokenize eacch sentenec separately.
# will take some time to tokenize each sentence.
spanish_tokenized_text = [ nltk.word_tokenize(sentence_spanish[i], language="spanish") for i in tqdm(range(len(sentence_spanish[:500]))) ]

# create word index
# assign each word a number.
word_to_index_spanish = {}
words=[]
for sentence in spanish_tokenized_text:
    for word in sentence:
        words.append(word)
UNIQUE_WORDS = set(words)

for index, word in enumerate(UNIQUE_WORDS):
    word_to_index_spanish[word] = index

# add tokens: <SOS> and <EOS>
word_to_index_spanish["<SOS>"] = list(word_to_index_spanish.values())[-1] + 1
word_to_index_spanish["<EOS>"] = list(word_to_index_spanish.values())[-1] + 1

# using word index, create tensor
# convert each of the sentence into numbers.
spanish_tokenized_tensor = []

for sentence in spanish_tokenized_text:
    
    tensor_list=[]
    tensor_list.append(word_to_index_spanish["<SOS>"])
    tensor_list = tensor_list + [word_to_index_spanish[word] for word in sentence]
    tensor_list.append(word_to_index_spanish["<EOS>"])
    spanish_tokenized_tensor.append(torch.tensor(tensor_list, dtype=torch.long))


100%|██████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 4597.97it/s]


# Model

In [11]:
# For cuda.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
BATCH_SIZE = 16
MAX_SENTENCE_LENGTH = 50
EMBEDDING_DIM = 30
ENCODER_HIDDEN_DIM = 1000
DECODER_HIDDEN_DIM = 1000
EPOCHS = 1

EMBEDDING_SIZE_ENGLISH = len(word_to_index_eng)
EMBEDDING_SIZE_SPANISH = len(word_to_index_spanish)

In [13]:
def padding(english_tokenized_tensor, spanish_tokenized_tensor):
    
    spanish_batch=[]
    for batch in spanish_tokenized_tensor:
        if(len(batch) > MAX_SENTENCE_LENGTH):
            batch = batch[0:MAX_SENTENCE_LENGTH]
        if( (MAX_SENTENCE_LENGTH - batch.shape[0]) != 0 ) :
            spanish_batch.append(torch.cat((batch, torch.LongTensor([0]).repeat(MAX_SENTENCE_LENGTH - len(batch))), dim=0) )
        else:
            spanish_batch.append(batch)
       
    english_batch=[]
    for batch in english_tokenized_tensor:
        # only consider top 50 words in each sentences.
        if(len(batch) > MAX_SENTENCE_LENGTH):
            batch = batch[0:MAX_SENTENCE_LENGTH]
        if( (MAX_SENTENCE_LENGTH - batch.shape[0]) != 0 ) :
            english_batch.append(torch.cat((batch, torch.LongTensor([0]).repeat(MAX_SENTENCE_LENGTH - len(batch))), dim=0) )
        else:
            english_batch.append(batch)

    # the first element of the batchholds the first elements of every sequence in the batch. 
    # the second element of the batch is going to hold the second element of every sequence in the batch, and so on.
    # shape : [sequence_length, batch_size]
    spanish_batch = torch.transpose(torch.stack(spanish_batch), 0, 1)
    english_batch = torch.transpose(torch.stack(english_batch), 0, 1)
    
    return english_batch, spanish_batch

In [328]:
class Encoder(nn.Module):
    def __init__(self, EMBEDDING_DIM, EMBEDDING_SIZE_ENGLISH, ENCODER_HIDDEN_DIM, DECODER_HIDDEN_DIM):
        super(Encoder, self).__init__()

        self.embedding_dim = EMBEDDING_DIM
        self.embedding_size_english = EMBEDDING_SIZE_ENGLISH
        self.hidden_dim = ENCODER_HIDDEN_DIM

        self.embedding = nn.Embedding( self.embedding_size_english , self.embedding_dim)

        self.rnn = nn.RNN(self.embedding_dim, self.hidden_dim, bidirectional =True)

        self.linear = nn.Linear(self.hidden_dim*2, DECODER_HIDDEN_DIM)

    def forward(self, inputs):
        # WRITE CODE HERE
        # input : [sequence_len, batch_size]
        embeds = self.embedding(inputs)
        # https://stackoverflow.com/questions/49466894/how-to-correctly-give-inputs-to-embedding-lstm-and-linear-layers-in-pytorch
        # output of the embedding is [seq_len, batch_size, embedding_size]

        #input: [seq_len, batch_size, embedding_size]
        rnn_enc, hidden = self.rnn(embeds)
        # rnn out shape : [sequence_len, batch_size, ENCODER_HIDDEN_DIM*2]
        # hidden shape   : [2, batch_size, ENCODER_HIDDEN_DIM]

        # concatenate both forward and backward hidden vectors
        hidden_f_b = torch.cat((hidden[0,:,:], hidden[1,:,:]), dim = 1)
        # output shape: [batch_size, ENCODER_HIDDEN_DIM*2]

        # input: [batch_size, ENCODER_HIDDEN_DIM*2]
        hidden_enc = self.linear(hidden_f_b)
        # output: [batch_size, DECODER_HIDDEN_DIM]

        hidden_enc = torch.tanh(hidden_enc)

        return rnn_enc, hidden_enc

In [329]:
class Attention(nn.Module):
    def __init__(self, DECODER_HIDDEN_DIM, ENCODER_HIDDEN_DIM):
        super(Attention, self).__init__()

        self.fc1 = nn.Linear((ENCODER_HIDDEN_DIM*2) + DECODER_HIDDEN_DIM, DECODER_HIDDEN_DIM)
        self.fc2 = nn.Linear(DECODER_HIDDEN_DIM, 1)
        
        
    def forward(self, encoder_output, decoder_hidden):
        # WRITE CODE HERE

        # encoder_output shape : [sequence_len, batch_size, ENCODER_HIDDEN_DIM*2]
        # decoder_hidden shape : [batch_size, DECODER_HIDDEN_DIM]

        # repeat decoder hidden state sequence_len Time's.       
        # input shape : [batch_size, DECODER_HIDDEN_DIM]
        decoder_hidden = torch.unsqueeze(decoder_hidden, 1)
        # output shape: [batch_size, 1, DECODER_HIDDEN_DIM]
        
        decoder_hidden = decoder_hidden.repeat(1, encoder_output.shape[0], 1)
        # [batch_size, sequence_len, DECODER_HIDDEN_DIM]
        
        encoder_output = encoder_output.permute(1, 0, 2)
        # encoder_output- shape : [batch_size, sequence_len, ENCODER_HIDDEN_DIM*2]
        
        # concatenate encoder's output and decoder's hidden state
        # and feed into a neural network layer
        concat = torch.cat((encoder_output, decoder_hidden), dim=2)
        # shape- [batch_size, sequence_len, (ENCODER_HIDDEN_DIM*2) + DECODER_HIDDEN_DIM], [batch_size, sequence_len, 3000]
        
        # input shape: [batch_size, sequence_len, (ENCODER_HIDDEN_DIM*2) + DECODER_HIDDEN_DIM]
        fc1 = self.fc1(concat)
        # output: [batch_size, sequence_len, DECODER_HIDDEN_DIM]
        
        fc1 = torch.tanh(fc1)
        # output: [batch_size, sequence_len, DECODER_HIDDEN_DIM]     
        
        fc2 = self.fc2(fc1)
        # output: [batch_size, sequence_len, 1]
        
        alpha = F.softmax(fc2, dim=1)
        # output: [batch_size, sequence_len, 1]
        
        # attention vector to take the weighted sum of the encoder hidden state.
        alpha=alpha.permute(0, 2, 1)
        # alpha shape:            [batch_size, 1, sequence_len]
        # encoder_output- shape : [batch_size, sequence_len, ENCODER_HIDDEN_DIM*2]
        # https://pytorch.org/docs/stable/generated/torch.bmm.html
        
        a=alpha@encoder_output     # multiplying all the words in each sequence, from 1..N, wher N=sequence len.
        # [batch_size, 1, sequence_len] * [batch_size, sequence_len, ENCODER_HIDDEN_DIM*2]
        # attention- shape : [batch_size, 1, ENCODER_HIDDEN_DIM*2]
        
        return a

In [333]:
class Decoder(nn.Module):
    def __init__(self, EMBEDDING_DIM, EMBEDDING_SIZE_SPANISH, ENCODER_HIDDEN_DIM, DECODER_HIDDEN_DIM):
        super(Decoder, self).__init__()
        
        
        self.embedding_dim = EMBEDDING_DIM
        self.embedding_size_spanish = EMBEDDING_SIZE_SPANISH
        self.hidden_dim = DECODER_HIDDEN_DIM
        self.encoder_hidden_dim = ENCODER_HIDDEN_DIM
        self.decoder_hidden_dim = DECODER_HIDDEN_DIM
        
        self.attention = Attention(self.decoder_hidden_dim, self.encoder_hidden_dim)
        
        self.embedding = nn.Embedding( self.embedding_size_spanish , self.embedding_dim)

        self.rnn = nn.RNN((self.encoder_hidden_dim*2)+self.embedding_dim, self.hidden_dim, bidirectional = False)

        self.linear = nn.Linear(self.embedding_dim+(self.encoder_hidden_dim*2)+self.decoder_hidden_dim, self.embedding_size_spanish)
        
    def forward(self, target_lang, encoder_output, decoder_hidden):
                
        # input shape: [BATCH_SIZE]
        embeds = self.embedding(target_lang)
        # embeds shape: [BATCH_SIZE, EMBEDDING_DIM]
        
        # attention layer - get the context vector from the attention layer.
        # encoder_output: [sequence_len, batch_size, ENCODER_HIDDEN_DIM*2]
        # decoder_hidden: [batch_size, DECODER_HIDDEN_DIM]
        a = self.attention(encoder_output, decoder_hidden)
        # attention shape: [BATCH_SIZE, 1, ENCODER_HIDDEN_DIM*2]
        
        ### concatenate convext vector, "embeds" and attention output, "a".
        
        # expand 1 dim for embeds
        #input shape: [BATCH_SIZE, EMBEDDING_DIM]
        embeds = embeds.unsqueeze(0)
        #output shape: [1, BATCH_SIZE, EMBEDDING_DIM]

        # change the shape of context vector.
        # input shape:  [BATCH_SIZE, 1, ENCODER_HIDDEN_DIM*2]
        a = a.permute(1, 0, 2)
        # output shape: [1, BATCH_SIZE, ENCODER_HIDDEN_DIM*2]
            
        x = torch.cat((embeds, a), dim=2)
        # output shape: [1, BATCH_SIZE, ENCODER_HIDDEN_DIM*2+EMBEDDING_DIM] i.e. [1, 16, 2030]
        
        # input shape: [1, BATCH_SIZE, ENCODER_HIDDEN_DIM*2+EMBEDDING_DIM]
        rnn_dec, decoder_hidden = self.rnn(x)
        #rnn_dec, decoder_hidden = self.rnn(x, decoder_hidden.unsqueeze(0))
        # rnn_dec shape: [1, BATCH_SIZE, DECODER_HIDDEN_DIM], *1 since it is not bidirectional.
        # hidden.shape: [1, BATCH_SIZE, DECODER_HIDDEN_DIM], hidden[0] = 1, since it is not bidirectional.
        
        # prediction
        # input to the nn: embeds, a, rnn_dec
        # embeds output shape:    [1, BATCH_SIZE, EMBEDDING_DIM]
        # attention output shape: [1, BATCH_SIZE, ENCODER_HIDDEN_DIM*2]
        # rnn_dec shape:          [1, BATCH_SIZE, DECODER_HIDDEN_DIM]
        
        c1=torch.cat((rnn_dec, a, embeds), dim=2)
        # c1 shape: [1, BATCH_SIZE, EMBEDDING_DIM+(ENCODER_HIDDEN_DIM*2)+DECODER_HIDDEN_DIM]
        
        pred = self.linear(c1).squeeze(0)
        print(pred.shape)
        # [1, BATCH_SIZE, EMBEDDING_SIZE_SPANISH]
        
        return pred, decoder_hidden.squeeze(0)

In [335]:
# encoder model
model = Encoder(EMBEDDING_DIM, EMBEDDING_SIZE_ENGLISH, ENCODER_HIDDEN_DIM, DECODER_HIDDEN_DIM)
model.to(device)

model_attn = Attention(DECODER_HIDDEN_DIM, ENCODER_HIDDEN_DIM)
model_attn.to(device)

model_decoder = Decoder(EMBEDDING_DIM, EMBEDDING_SIZE_SPANISH, ENCODER_HIDDEN_DIM, DECODER_HIDDEN_DIM)
model_decoder.to(device)

optimizer = optim.Adam(model.parameters(), lr = 0.001)
loss_function = nn.NLLLoss()

RuntimeError: CUDA out of memory. Tried to allocate 3.88 MiB (GPU 0; 4.00 GiB total capacity; 2.65 GiB already allocated; 2.99 MiB free; 290.81 MiB cached)

In [332]:
for epochs in range(EPOCHS):
    total_loss = 0
    model.to(device)
    for i in range(0, len(english_tokenized_tensor), BATCH_SIZE):
        english_batch = english_tokenized_tensor[i:i+BATCH_SIZE] 
        spanish_batch = spanish_tokenized_tensor[i:i+BATCH_SIZE] 

        en, es = padding(english_batch, spanish_batch) # shape : [sequence_length, batch_size]

        en = en.to(device)
        es = es.to(device)

        encoder_output, enc_hidden = model(en)
        
        decoder_hidden = enc_hidden    # need to change this somehow.
        
        #attention = model_attn(encoder_output, decoder_hidden)  # no need to menton here
        
        # feed the target sentence, one by one word
        for i in range(MAX_SENTENCE_LENGTH):
            
            # at each loop, 1st word of every 16 batches (sentences) will be fed into the decoder.
            
            output, decoder_hidden = model_decoder(es[i], encoder_output, decoder_hidden)

            #print(output.squeeze(0).argmax(1))
            
        #model_attn(encoder_output, decoder_hidden)

tensor([2839, 2839, 2839, 2839, 2568, 2839, 2839, 2839, 2839, 2839, 2839, 2839,
        2839, 2839, 2839, 2839], device='cuda:0')
tensor([1109, 2484,  297, 1666,  705, 2341, 2156, 2663,    7, 1109, 2516,  724,
        2478, 2663, 2505,  473], device='cuda:0')
tensor([ 728, 2236, 2019,  325,  315, 1865, 1877, 2156,    2, 2663, 2367,  902,
         315, 2156,  315, 1432], device='cuda:0')
tensor([ 315, 2622, 2198, 2198,  827, 2019, 2404,  902,    7,   60,  250,  707,
         415,  902, 2141,    7], device='cuda:0')
tensor([ 393,  315,  213,  315,  899, 1865,  902,  922,  315, 2754,    2, 2243,
        2243,  922, 2505,  325], device='cuda:0')
tensor([1245, 2828, 2040,  466,  250, 2141,  393,  922, 1283, 2141,   75,  902,
        2663,  922,    7, 1865], device='cuda:0')
tensor([ 312, 1245,   75, 1521, 2003,    7, 2341,  393, 1462,  250,  188,  466,
        1290,  393, 1290, 2379], device='cuda:0')
tensor([2839,  728, 2622,  315, 1836, 2754,  902,  466,  315, 2198, 1902, 2141,
         9

RuntimeError: CUDA out of memory. Tried to allocate 6.12 MiB (GPU 0; 4.00 GiB total capacity; 2.65 GiB already allocated; 2.99 MiB free; 290.57 MiB cached)

In [325]:
a = torch.randn(4, 3, 2)
a

tensor([[[ 0.7648, -0.3719],
         [ 0.7351,  0.9866],
         [ 1.1867, -1.3069]],

        [[-0.7972, -2.1307],
         [-0.5821,  0.9901],
         [-0.6949, -1.1628]],

        [[-0.7456,  1.2057],
         [ 0.8060, -1.8569],
         [-0.1082, -0.4142]],

        [[ 0.5383,  0.1486],
         [ 0.2636, -1.9168],
         [-1.3498,  0.6109]]])

In [326]:
torch.max(a, dim=0)

(tensor([[0.7648, 1.2057],
         [0.8060, 0.9901],
         [1.1867, 0.6109]]), tensor([[0, 2],
         [2, 1],
         [0, 3]]))