In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")


In [2]:
PAD_token = 0
SOS_token = 1
EOS_token = 2

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "<PAD>", 1: "SOS", 2:"<EOS>"}
        self.n_words = 3 

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [3]:

# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()


In [4]:

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs_train_train_train, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [5]:
input_lang, output_lang, pairs = readLangs('eng', 'ita', False)
print(pairs[:10])

Reading lines...
[['hi', 'ciao !'], ['run !', 'corri !'], ['run !', 'corra !'], ['run !', 'correte !'], ['who ?', 'chi ?'], ['wow !', 'wow !'], ['jump !', 'salta !'], ['jump !', 'salti !'], ['jump !', 'saltate !'], ['jump', 'salta']]


In [6]:
MAX_LENGTH_lang1 = max([len(pair[0].split(' ')) for pair in pairs])
MAX_LENGTH_lang2 = max([len(pair[1].split(' ')) for pair in pairs])
print(f"The maximum length of the sentences in the dataset is {MAX_LENGTH_lang1} for the first language and {MAX_LENGTH_lang2} for the second language")


The maximum length of the sentences in the dataset is 101 for the first language and 94 for the second language


In [7]:

#Max length would be set to the maximum length of any sentence in the dataset
MAX_LENGTH = max(MAX_LENGTH_lang1, MAX_LENGTH_lang2) +1

#below theres a trimming for shorter training time, but for full dataset this isnt used

#If you want to trim also max length, uncomment the following line
MAX_LENGTH = 8


def filterPairs(pairs):
    return [pair for pair in pairs if len(pair[0].split(' ')) < MAX_LENGTH and len(pair[1].split(' ')) < MAX_LENGTH ]


In [8]:

def prepareData(lang1, lang2, reverse=False):
    '''
    -Read text file and split into lines, split lines into pairs
    -Normalize text, filter by length 
    -Make word lists from sentences in pairs
    '''
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    #Comment out the following line for full dataset
    pairs = filterPairs(pairs)
    print(f"Trimmed to {len(pairs)} sentence pairs")
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('eng', 'ita', True)
print(random.choice(pairs))

Reading lines...
Read 345244 sentence pairs
Trimmed to 253276 sentence pairs
Counting words...
Counted words:
ita 22285
eng 11042
['ho mostrato il mio biglietto alla porta', 'i showed my ticket at the door']


In [9]:

class EncoderRNN(nn.Module):
    '''
    The EncoderRNN class is responsible for the encoder part of the seq2seq model.
    ''' 
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        #Embedding layer
        self.embedding = nn.Embedding(input_size, hidden_size)
        #GRU layer
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=False)
        #Dropout layer
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        #Embedding layer with dropout
        embedded = self.dropout(self.embedding(input))
        #Get the output and hidden state from the GRU
        output, hidden = self.gru(embedded)
        return output, hidden

In [10]:
#Example of usage for EncoderRNN
hidden_size = 256
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
input_tensor = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8]]).to(device)
output, hidden = encoder(input_tensor)
print(f"Output shape: {output.shape}") #Output shape is (1, 8, hidden_size) [BxSxH]
print(f"Output tensor: {output}")
print(f"Hidden shape: {hidden.shape}") #Hidden shape is (1, 1, hidden_size) [BxSxH]
print(f"Hidden tensor: {hidden}")


Output shape: torch.Size([1, 8, 256])
Output tensor: tensor([[[ 0.3049, -0.1757,  0.1206,  ...,  0.2703,  0.6308, -0.0399],
         [-0.4789,  0.2560, -0.1512,  ...,  0.0021,  0.2494, -0.3869],
         [-0.0905,  0.1449, -0.4662,  ...,  0.2685, -0.0764,  0.1012],
         ...,
         [-0.0696,  0.0573, -0.3923,  ...,  0.0088, -0.2134,  0.5970],
         [-0.2709,  0.0632, -0.2513,  ...,  0.0079, -0.2100, -0.1275],
         [-0.0375, -0.1229,  0.5289,  ...,  0.2515, -0.2660, -0.0690]]],
       device='cuda:1', grad_fn=<CudnnRnnBackward0>)
Hidden shape: torch.Size([1, 8, 256])
Hidden tensor: tensor([[[ 0.3049, -0.1757,  0.1206,  ...,  0.2703,  0.6308, -0.0399],
         [-0.4789,  0.2560, -0.1512,  ...,  0.0021,  0.2494, -0.3869],
         [-0.0905,  0.1449, -0.4662,  ...,  0.2685, -0.0764,  0.1012],
         ...,
         [-0.0696,  0.0573, -0.3923,  ...,  0.0088, -0.2134,  0.5970],
         [-0.2709,  0.0632, -0.2513,  ...,  0.0079, -0.2100, -0.1275],
         [-0.0375, -0.1229,  0

In [63]:
# Example of usage for EncoderRNN
output, hidden = encoder(input_tensor)
print(f"Encoder Output shape: {output.shape}")
print(f"Encoder Hidden shape: {hidden.shape}")


Encoder Output shape: torch.Size([1, 8, 128])
Encoder Hidden shape: torch.Size([1, 8, 128])


In [64]:

class Attention(nn.Module):
    '''
    Simple attention mechanism using dot product
    '''
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        print(f"Query shape before any operations: {query.shape}")  # Debug
        print(f"Keys shape before any operations: {keys.shape}")  # Debug

        # Ensure the query is [batch_size, 1, hidden_size]
        query = query.permute(1, 0, 2)  # Change to [batch_size, num_layers, hidden_size]
        query = query.mean(dim=1, keepdim=True)  # Optional: Reduce across layers if necessary, resulting in [batch_size, 1, hidden_size]

        # Transform query and keys
        transformed_query = self.Wa(query)  # Shape: [batch_size, 1, hidden_size]
        transformed_keys = self.Ua(keys)    # Shape: [batch_size, seq_len, hidden_size]
        print(f"Transformed Query shape: {transformed_query.shape}")
        print(f"Transformed Keys shape: {transformed_keys.shape}")
        # This is where your existing prints are helpful too

        # Since we're adding, shapes are:
        # transformed_query: [batch_size, 1, hidden_size]
        # transformed_keys: [batch_size, seq_len, hidden_size]
        # Broadcasting takes care of matching dimensions

        # Combined: Add transformed query and keys
        combined = torch.tanh(transformed_query + transformed_keys)  # Broadcasting happens here

        # Compute scores and weights
        scores = self.Va(combined).squeeze(-1)  # Shape becomes [batch_size, seq_len]
        weights = F.softmax(scores, dim=-1).unsqueeze(1)  # Shape: [batch_size, 1, seq_len]

        # Compute context vector
        context = torch.bmm(weights, keys)  # Shape: [batch_size, 1, hidden_size]

        return context, weights





In [65]:
#Check attention works
hidden_size = 256
attention = Attention(hidden_size).to(device)
query = torch.randn(1, 8, hidden_size).to(device) #shape: (1, 1, hidden_size) where hidden_size is 256 and 1 is the batch size and 1 is the sequence length
keys = torch.randn(1, 8, hidden_size).to(device) #shape: (1, 8, hidden_size) where hidden_size is 256 and 1 is the batch size and 8 is the sequence length
context, weights = attention(query, keys)
print(f"Context shape: {context.shape}") #Context shape is (1, 1, hidden_size) [BxSxH]
print(f"Context tensor: {context}")
print(f"Weights shape: {weights.shape}") #Weights shape is (1, 1, 8) [BxSxL]
print(f"Weights tensor: {weights}")




Query shape before any operations: torch.Size([1, 8, 256])
Keys shape before any operations: torch.Size([1, 8, 256])
Transformed Query shape: torch.Size([8, 1, 256])
Transformed Keys shape: torch.Size([1, 8, 256])


RuntimeError: Expected size for first two dimensions of batch2 tensor to be: [8, 8] but got: [1, 8].

In [52]:
'''
class AttnDecoderRNN(nn.Module):
    """
    The AttnDecoderRNN class is responsible for the decoder part of the seq2seq model.
    This decoder has an attention mechanism.
    """
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            # Teacher forcing: Feed the target as the next input with some probability
            if target_tensor is not None and random.random() < 0.5:
                    decoder_input = target_tensor[:, i].unsqueeze(1)# Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attention_weights = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attention_weights


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights
'''
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.tensor([[SOS_token] * batch_size], device=device).transpose(0, 1)
        decoder_hidden = encoder_hidden[-1].unsqueeze(0)  # Taking the last layer's hidden state

        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None and random.random() < 0.5:
                decoder_input = target_tensor[:, i].unsqueeze(1)  # Teacher forcing
            else:
                topv, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # Next input is decoder's own current output

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions

    def forward_step(self, input, hidden, encoder_outputs):
        # Ensure `input` is correctly shaped for embedding
        if input.dim() == 1:
            input = input.unsqueeze(1)  # Reshape to [batch_size, 1] if necessary
        
        embedded = self.dropout(self.embedding(input))  # [batch_size, 1, hidden_size]
        
        # Attention query preparation: Use the last hidden state for query
        # Assuming `hidden` shape is [num_layers, batch_size, hidden_size] for GRUs/LSTMs
        query = hidden[-1].unsqueeze(0)  # [1, batch_size, hidden_size], taking the last layer's hidden state
        query = query.permute(1, 0, 2)  # Reshape to [batch_size, 1, hidden_size] for attention
        
        # Context calculation
        context, attn_weights = self.attention(query, encoder_outputs)  # encoder_outputs is [batch_size, seq_len, hidden_size]
        
        # Concatenation correction: Ensure both are [batch_size, 1, hidden_size]
        # `context` should already be [batch_size, 1, hidden_size] if coming from your corrected attention forward
        input_gru = torch.cat((embedded, context), dim=2)  # Correct concatenation
        
        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output.squeeze(1))  # Assuming batch_first=True, squeeze seq_len dimension which is 1

        return output, hidden, attn_weights


    '''def forward_step(self, input, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(input))
        query = hidden
        context, attn_weights = self.attention(query, encoder_outputs)
        
        # Concatenate the embedded input and the context vector
        input_gru = torch.cat((embedded, context), dim=2)
        
        output, hidden = self.gru(input_gru, query)
        output = self.out(output.squeeze(dim=1))

        return output, hidden, attn_weights'''


In [53]:
# Mock input parameters
batch_size = 2  # Example batch size
seq_len = 10  # Example sequence length
vocab_size = output_lang.n_words  # Assuming output_lang.n_words gives the target vocab size

# Initialize the decoder
decoder = AttnDecoderRNN(hidden_size=256, output_size=vocab_size, dropout_p=0.1).to(device)

# Mock encoder outputs and hidden state
mock_encoder_outputs = torch.rand(batch_size, seq_len, 256, device=device)  # [batch_size, seq_len, hidden_size]
mock_encoder_hidden = torch.rand(1, batch_size, 256, device=device)  # [1, batch_size, hidden_size] for GRU/LSTM

# Optionally, mock target tensor for teacher forcing
# Here, assuming a random target sequence of length `seq_len`
mock_target_tensor = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len), device=device)

# Perform a forward pass through the decoder
# Note: The `mock_target_tensor` is optional; it's used here to demonstrate teacher forcing
output, hidden, attention = decoder(mock_encoder_outputs, mock_encoder_hidden, mock_target_tensor)

# Display output shapes and a sample output
print(f"Output shape: {output.shape}")  # Expected: [batch_size, seq_len, vocab_size] after log_softmax
print(f"Hidden shape: {hidden.shape}")  # Expected: [1, batch_size, hidden_size]
print(f"Attention shape: {attention.shape}")  # Expected: [batch_size, seq_len, seq_len] or similar, depending on implementation

# Note: This is a simplified test. Adjust `seq_len`, `batch_size`, and other parameters as needed to match your use case.


Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Query shape before any operations: torch.Size([2, 1, 256])
Keys shape before any operations: torch.Size([2, 10, 256])
Output shape: torch.Size([2, 178280])
Hidden shape: torc

In [54]:

def indexesFromSentence(lang, sentence):
    '''
    Get the indexes of the words in the sentence by using the Lang class and its word2index dictionary
    '''
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [55]:
from sklearn.model_selection import train_test_split
def get_dataloader(batch_size):
    #Prepare the data for the training
    input_lang, output_lang, pairs = prepareData('eng', 'ita', False)
    pairs_train, pairs_test = train_test_split(pairs, test_size=0.1, random_state=42)
    
    n_train = len(pairs_train)
    n_test = len(pairs_test)
    
    train_input_ids = np.zeros((n_train, MAX_LENGTH), dtype=np.int32)
    train_target_ids = np.zeros((n_train, MAX_LENGTH), dtype=np.int32)
    
    test_input_ids = np.zeros((n_test, MAX_LENGTH), dtype=np.int32)
    test_target_ids = np.zeros((n_test, MAX_LENGTH), dtype=np.int32)
    
    for idx, (inp, tgt) in enumerate(pairs_train):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        train_input_ids[idx, :len(inp_ids)] = inp_ids
        train_target_ids[idx, :len(tgt_ids)] = tgt_ids
    
    for idx, (inp, tgt) in enumerate(pairs_test):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        test_input_ids[idx, :len(inp_ids)] = inp_ids
        test_target_ids[idx, :len(tgt_ids)] = tgt_ids


    train_data = TensorDataset(torch.LongTensor(train_input_ids).to(device),
                               torch.LongTensor(train_target_ids).to(device))

    test_data = TensorDataset(torch.LongTensor(train_input_ids).to(device),
                                torch.LongTensor(train_target_ids).to(device))
    
    train_sampler = RandomSampler(train_data)
    test_sampler = RandomSampler(test_data)
    
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader, test_dataloader, pairs_train, pairs_test

In [18]:
#Example of usage for get_dataloader
batch_size = 32
input_lang, output_lang, train_dataloader, test_dataloader, pairs_train, pairs_test = get_dataloader(batch_size)
print(f"Train dataloader size: {len(train_dataloader)}")
print(f"Test dataloader size: {len(test_dataloader)}")


Reading lines...
Read 345244 sentence pairs
Trimmed to 253276 sentence pairs
Counting words...
Counted words:
eng 11042
ita 22285
Train dataloader size: 7124
Test dataloader size: 7124


In [56]:
def train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, train_dataloader, test_dataloader, clip=1.0):
    
    #Set the models in training mode
    encoder.train()
    decoder.train()
    
    #Initialize the total loss
    total_train_loss = 0
    total_val_loss = 0
    
    # Iterate over the training dataloader
    for input_tensor, target_tensor in train_dataloader:
        
        #Zero the gradients of the optimizers
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        #Get the input and target lengths
        input_length = input_tensor.size(1)
        target_length = target_tensor.size(1)
        
        #Initialize the encoder hidden state and the encoder outputs to zero-tensors
        encoder_outputs = torch.zeros(input_length, encoder.hidden_size, device=device)
        encoder_hidden = torch.zeros(1, 1, encoder.hidden_size, device=device)
        
        #Zero the loss
        loss = 0

        #Get the encoder outputs and hidden state
        encoder_outputs, encoder_hidden = encoder(input_tensor) 
        decoder_input = torch.tensor([[SOS_token]], device=device, dtype=torch.float) #SOS token as the first input to the decoder
        decoder_hidden = encoder_hidden #Use the last hidden state of the encoder as the first hidden state of the decoder

        #Iterate over the target length for the decoder to get the outputs
        for di in range(target_length):
            #Get the decoder output and hidden state and the attention weights
            decoder_output, decoder_hidden, attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            #update the loss according to the criterion 
            loss += criterion(decoder_output, target_tensor[:, di])
            #The next input to the decoder is the target tensor at the current position
            decoder_input = target_tensor[:, di].unsqueeze(1)
        
        #Backward pass 
        loss.backward()
        #Clip the gradients to avoid exploding gradients problem
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
        
        #Update the parameters
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        #Add the loss to the total loss, as the loss is a tensor, we need to get the scalar value
        total_train_loss += loss.item()
    
    encoder.eval()
    decoder.eval()  
    # Iterate over the test dataloader
    for input_tensor, target_tensor in test_dataloader:
        
        #Get the input and target lengths
        input_length = input_tensor.size(1)
        target_length = target_tensor.size(1)
        
        #Initialize the encoder hidden state and the encoder outputs to zero-tensors
        encoder_outputs = torch.zeros(input_length, encoder.hidden_size, device=device)
        encoder_hidden = torch.zeros(1, 1, encoder.hidden_size, device=device)
        
        #Zero the loss
        loss = 0

        #Get the encoder outputs and hidden state
        encoder_outputs, encoder_hidden = encoder(input_tensor) 
        decoder_input = torch.tensor([[SOS_token]], device=device) #SOS token as the first input to the decoder
        decoder_hidden = encoder_hidden #Use the last hidden state of the encoder as the first hidden state of the decoder

        #Iterate over the target length for the decoder to get the outputs
        for di in range(target_length):
            #Get the decoder output and hidden state and the attention weights
            decoder_output, decoder_hidden, attention = decoder(decoder_input, decoder_hidden, encoder_outputs)
            #update the loss according to the criterion 
            loss += criterion(decoder_output, target_tensor[:, di])
            #The next input to the decoder is the target tensor at the current position
            decoder_input = target_tensor[:, di].unsqueeze(1)
        
        #Add the loss to the total loss, as the loss is a tensor, we need to get the scalar value
        total_val_loss += loss.item()
        
    #Return the average loss
    return total_train_loss / len(train_dataloader), total_val_loss / len(test_dataloader)

In [57]:

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.savefig('./loss_plots/loss_plot.png')

In [58]:
#Training function
from tqdm import tqdm

def train(encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, train_dataloader, n_epochs, test_dataloader, clip=1.0):
    plot_losses = []
    for epoch in tqdm(range(n_epochs)):
        train_loss, val_loss = train_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, train_dataloader, test_dataloader, clip)
        plot_losses.append((train_loss, val_loss))
        print(f"Epoch {epoch+1}/{n_epochs} Train loss: {train_loss} Val loss: {val_loss}")
        #Save the model after each epoch
        torch.save(encoder.state_dict(), f'./models/encoder_epoch_{epoch+1}.pth')
        torch.save(decoder.state_dict(), f'./models/decoder_epoch_{epoch+1}.pth')
    showPlot(plot_losses)
    

In [59]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [60]:

def showAttention(input_sentence, output_words, attentions, plot_path):
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.cpu().numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.savefig(plot_path)
    plt.show()


def evaluateAndShowAttention(input_sentence, plot_path,encoder, decoder):
    output_words, attentions = evaluate(encoder, decoder, input_sentence, input_lang, output_lang)
    print('input =', input_sentence)
    print('output =', ' '.join(output_words))
    showAttention(input_sentence, output_words, attentions[0, :len(output_words), :], plot_path=plot_path)

In [24]:
#we need to get the test pairs in the same format as the train pairs
batch_size = 32
_, _, _, _, pairs_train, pairs_test = get_dataloader(batch_size)
print (pairs_test[0])
print (pairs_train[0])


Reading lines...
Read 345244 sentence pairs
Trimmed to 253276 sentence pairs
Counting words...
Counted words:
eng 11042
ita 22285
['i like your avatar', 'mi piace il vostro avatar']
['they hugged', 'si sono abbracciati']


In [61]:

def evaluateRandomly(encoder, decoder, n=20):
    for i in range(n):
        pair = random.choice(pairs_test) #pairs is the list of 
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')
        evaluateAndShowAttention(pair[0], f'./test_plots/plot_{i}.png',encoder, decoder)


In [62]:

hidden_size = 128
batch_size = 64
n_epochs = 1
input_lang, output_lang, train_dataloader, test_dataloader, pairs_train, pairs_test = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.NLLLoss(ignore_index=PAD_token)

train(encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, train_dataloader, n_epochs, test_dataloader, clip=1.0)
#Save the model
torch.save(encoder.state_dict(), 'encoder_short.pth')
torch.save(decoder.state_dict(), 'decoder_short.pth')


Reading lines...
Read 345244 sentence pairs
Trimmed to 253276 sentence pairs
Counting words...
Counted words:
eng 11042
ita 22285


  0%|          | 0/1 [00:00<?, ?it/s]


Query shape before any operations: torch.Size([8, 1, 128])
Keys shape before any operations: torch.Size([1, 1])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 128x128)

In [None]:
'''
#To continue training, load the model and continue training

hidden_size = 128
batch_size = 64

input_lang, output_lang, train_dataloader, test_dataloader, pairs_train, pairs_test = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

#encoder.load_state_dict(torch.load('encoder.pth'))
#decoder.load_state_dict(torch.load('decoder.pth'))

train(train_dataloader, encoder, decoder, n_epochs=1, print_every=5, plot_every=5)
'''

Reading lines...
Read 345244 sentence pairs
Trimmed to 345244 sentence pairs
Counting words...
Counted words:
ita 26170
eng 13069


KeyboardInterrupt: 

In [None]:
#Load the model
#encoder = encoder.load_state_dict(torch.load('encoder_short.pth'))
#encoder = decoder.load_state_dict(torch.load('decoder_short.pth'))


encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)
