## Let's test the model

In [1]:
!python -m spacy download en
!python -m spacy download de

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 14.7 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/conda/lib/python3.7/site-packages/en_core_web_sm -->
/opt/conda/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')
Collecting de_core_news_sm==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 22.4 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l- \ | / - \ done
[?25h  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.3.0-py3-n

In [2]:
import torch
import torch.nn as nn
import random

# ---------------------------- ENCODER ----------------------------
class Encoder(nn.Module):
    """
    Embeds a batch of token indices and feeds it through a batch-first LSTM.

    Only the final hidden and cell states of the LSTM are returned; the
    per-timestep outputs are discarded.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_prob):
        """
        :param input_size: number of rows of the embedding table
                           (how many distinct token indices can be looked up)
        :param embedding_size: width of every embedding vector
        :param hidden_size: hidden dimension of the LSTM
        :param num_layers: number of stacked LSTM layers
        :param drop_prob: dropout probability; applied to the embeddings and
                          also handed to the LSTM as its ``dropout`` argument
        """
        super().__init__()

        # Regularisation applied on top of the embedded tokens.
        self.dropout = nn.Dropout(p=drop_prob)

        # Lookup table: token index --> dense vector of size embedding_size.
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embedding_size)

        # Recurrent core; batch_first=True means it consumes N X T X D input.
        lstm_config = dict(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_prob,
            batch_first=True,
        )
        self.rnn = nn.LSTM(**lstm_config)

    def forward(self, x):
        """
        :param x: index-encoded sentences (indices mapped through the vocab),
                  laid out as N X T
        :return: ``(hidden_state, cell_state)``, each of shape
                 L X N X H (num_layers X batch_size X hidden_size)
        """
        # N X T --> N X T X D
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The step-wise outputs are not needed, only the final states.
        _, final_states = self.rnn(embedded)
        hidden_state, cell_state = final_states

        return hidden_state, cell_state




# ---------------------------- DECODER ----------------------------
class Decoder(nn.Module):
    """
    Single-step LSTM decoder.

    Each call to ``forward`` embeds one token per batch element, advances the
    LSTM by one step and projects the LSTM output through a linear layer.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, drop_prob, output_size):
        """
        :param input_size: number of rows of the embedding table
                           (how many distinct token indices can be looked up)
        :param embedding_size: width of every embedding vector
        :param hidden_size: hidden dimension of the LSTM
        :param num_layers: number of stacked LSTM layers
        :param drop_prob: dropout probability; applied to the embeddings and
                          also handed to the LSTM as its ``dropout`` argument
        :param output_size: number of output features of the final linear layer
        """
        super().__init__()

        # Regularisation applied on top of the embedded tokens.
        self.dropout = nn.Dropout(p=drop_prob)

        # Lookup table: token index --> dense vector of size embedding_size.
        self.embedding = nn.Embedding(num_embeddings=input_size,
                                      embedding_dim=embedding_size)

        # Unlike the usual batch-first layout, this LSTM keeps PyTorch's
        # default (time-first) layout: input is T X N X D.
        lstm_config = dict(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=drop_prob,
        )
        self.rnn = nn.LSTM(**lstm_config)

        # Projects every LSTM output vector to output_size scores.
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden_state, cell_state):
        """
        Advance the decoder by exactly one time step.

        :param x: one token index per batch element, shape N
        :param hidden_state: LSTM hidden state, L X N X H
        :param cell_state: LSTM cell state, L X N X H
        :return: ``(predictions, hidden_state, cell_state)`` where predictions
                 is N X output_size and the states are the updated L X N X H
        """
        # N --> 1 X N (add the length-1 time axis)
        step_input = x.unsqueeze(0)

        # 1 X N --> 1 X N X D
        step_input = self.embedding(step_input)
        step_input = self.dropout(step_input)

        # outputs : 1 X N X H ; states : L X N X H
        incoming_states = (hidden_state, cell_state)
        outputs, (hidden_state, cell_state) = self.rnn(step_input, incoming_states)

        # 1 X N X H --> 1 X N X output_size --> N X output_size
        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden_state, cell_state




# ---------------------------- SEQUENCE-TO-SEQUENCE ----------------------------
class Seq2Seq(nn.Module):
    """
    Wires an encoder and a step-wise decoder into one trainable module.

    The encoder's final (hidden, cell) states seed the decoder, which is then
    unrolled one target position at a time with optional teacher forcing.
    """

    def __init__(self, Encoder_LSTM, Decoder_LSTM):
        """
        :param Encoder_LSTM: the encoder part for the Seq2Seq model
        :param Decoder_LSTM: the decoder part for the Seq2Seq model
        """

        super(Seq2Seq, self).__init__()
        self.Encoder_LSTM = Encoder_LSTM
        self.Decoder_LSTM = Decoder_LSTM

    def forward(self, source, target, eng_vocab_size, tfr=0.5):
        """
        :param source: index-encoded source sentences, passed unchanged to the
                       encoder. The batch size is read from ``source.shape[0]``,
                       i.e. the layout is batch-first (N X T_source).
        :param target: index-encoded target sentences. The number of decoding
                       steps is read from ``target.shape[0]`` and ``target[i]``
                       is fed to the decoder as one step, i.e. the layout is
                       time-first (T_target X N).
        :param eng_vocab_size: size of the last axis of the returned tensor;
                       must match the width of the decoder's predictions
        :param tfr: teacher forcing ratio - probability, per step, of feeding
                    the ground-truth token instead of the decoder's own guess
        :return: tensor of shape T_target X N X eng_vocab_size. Row 0 is never
                 written and stays all-zero, because decoding starts from
                 ``target[0]``.
        """

        batch_size = source.shape[0]
        target_len = target.shape[0]

        # Allocate on the same device as the input instead of relying on a
        # module-level `device` global, which may not exist when this class is
        # used before (or without) the script section that defines it.
        outputs = torch.zeros(target_len, batch_size, eng_vocab_size,
                              device=source.device)

        # retaining the context vector from the encoder
        hidden_state, cell_state = self.Encoder_LSTM(source)

        # First decoder input: the tokens at target position 0.
        x = target[0]

        for i in range(1, target_len):

            # output : batch_size X |Eng_Vocab_Size|
            output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)

            outputs[i] = output

            # index of the highest-scoring vocabulary entry per batch element
            best_guess = output.argmax(1)

            # Teacher forcing:
            # either pass the next correct word from the dataset
            # or use the predicted word
            x = target[i] if random.random() < tfr else best_guess

        return outputs

In [3]:
# Basic utilities needed in the code

import torch
import spacy
from torchtext.data.metrics import bleu_score

def translate_sentence(model, sentence, german, english, device, max_length=50):
    """
    This function translates the input german sentence to the english sentence.
    German sentence --> German Vector --> Encoder --> context vector --> Decoder --> English Vector --> English Sentence

    Decoding is greedy: at every step the highest-scoring token is chosen and
    fed back as the next decoder input.

    The model is switched to eval mode for the duration of the call (so that
    dropout does not randomise the translation) and every sub-module's
    previous train/eval flag is restored before returning.

    :param model: the sequence-to-sequence model; must expose ``Encoder_LSTM``
                  and ``Decoder_LSTM`` and behave like an ``nn.Module``
    :param sentence: the input "german" sentence, either a raw string
                     (tokenized here with spaCy) or an iterable of tokens
    :param german: the german Field object
    :param english : the english Field object
    :param device: cuda / cpu
    :param max_length : maximum number of decoding steps
    :return: list of english tokens; the leading "<sos>" is dropped, a
             trailing "<eos>" is kept when the model produced one
    """

    if isinstance(sentence, str):
        # The spaCy pipeline is only needed for raw text, so it is loaded
        # lazily instead of on every call.
        spacy_german = spacy.load("de")
        tokens = [token.text.lower() for token in spacy_german(sentence)]
    else:
        # Builds a new list, so the caller's token sequence is not mutated below.
        tokens = [token.lower() for token in sentence]

    # insert the start and end sequence
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    text_to_indicies = [german.vocab.stoi[token] for token in tokens]

    # (N, ) --> (1 X N)
    sentence_tensor = torch.LongTensor(text_to_indicies).unsqueeze(0).to(device)

    # Loop-invariant vocabulary lookups.
    sos_index = english.vocab.stoi["<sos>"]
    eos_index = english.vocab.stoi["<eos>"]

    # start the decoding part using start sequence and the (hidden_state, cell_state)
    outputs = [sos_index]

    # Remember each module's mode so it can be restored exactly afterwards.
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()

    try:
        with torch.no_grad():
            # Retrieve the hidden_state and cell_state from the encoder
            hidden_state, cell_state = model.Encoder_LSTM(sentence_tensor)

            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)

                output, hidden_state, cell_state = model.Decoder_LSTM(
                    previous_word, hidden_state, cell_state
                )

                # output is 1 X |Eng_Vocab|; pick the best-scoring index
                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # Model stops predicting if it predicts <eos> token (index)
                if best_guess == eos_index:
                    break
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

    # We have the indicies of the translated sentence in english;
    # map them back to tokens, skipping the leading <sos>.
    return [english.vocab.itos[idx] for idx in outputs[1:]]


In [4]:
import torch
from torchtext.data import Field, TabularDataset, BucketIterator
import spacy
import warnings
warnings.simplefilter('ignore')


def tokenize_german(text):
    """
    Split German text into a list of token strings.

    Relies on the module-level ``spacy_german`` pipeline and uses only its
    tokenizer component.
    """
    pieces = []
    for tok in spacy_german.tokenizer(text):
        pieces.append(tok.text)
    return pieces

def tokenize_english(text):
    """
    Split English text into a list of token strings.

    Relies on the module-level ``spacy_english`` pipeline and uses only its
    tokenizer component.
    """
    pieces = []
    for tok in spacy_english.tokenizer(text):
        pieces.append(tok.text)
    return pieces

if __name__ == '__main__':

    # tokenizers for German and English
    # (read as globals by tokenize_german / tokenize_english)
    spacy_german = spacy.load("de")
    spacy_english = spacy.load("en")

    # Field object for German: lower-cases tokens and wraps every sentence
    # in <sos> ... <eos>
    german = Field(tokenize=tokenize_german,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>"
    )

    # Field Object for English
    english = Field(tokenize=tokenize_english,
                    lower=True,
                    init_token="<sos>",
                    eos_token="<eos>"
    )

    # dataset object: two-column CSV (German sentence, English sentence);
    # the header row is skipped
    dataset = TabularDataset(path="../input/german-to-english/dataset.csv",
                            format='csv',
                            skip_header=True,
                            fields=[('ger_sent', german), ('eng_sent', english)]
    )

    # BUILDING THE VOCAB
    # NOTE(review): presumably these settings must match the ones used when
    # the checkpoint was trained, otherwise token indices will not line up.
    german.build_vocab(dataset, max_size=10000, min_freq=3)
    english.build_vocab(dataset, max_size=10000, min_freq=3)

    GERMAN_VOCAB = german.vocab
    ENGLISH_VOCAB = english.vocab

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # load the model
    # map_location makes a checkpoint that was saved on a GPU loadable on a
    # CPU-only machine (matching the CPU fallback chosen for `device` above).
    # SECURITY: torch.load unpickles arbitrary Python objects - only load
    # checkpoint files from a trusted source.
    checkpoint = torch.load("../input/full-training-data-encoder-decoder-nmt/checkpoint-NMT-BEST.pth",
                            map_location=device)
    my_model = checkpoint['model']


    # load the state dict of the model
    sd = torch.load("../input/full-training-data-encoder-decoder-nmt/checkpoint-NMT-BEST-SD.pth",
                    map_location=device)
    my_model.load_state_dict(sd)

    my_model.to(device)

    # This section only runs inference, so disable dropout.
    my_model.eval()

    sentence = "ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster"

    # Let's translate some sentences
    print(translate_sentence(my_model, sentence, german, english, device))

['a', 'man', 'in', 'a', 'blue', 'shirt', 'is', 'standing', 'on', 'a', 'ladder', 'cleaning', 'a', 'window', '.', '<eos>']


In [5]:
# Translate a raw (cased, punctuated) German string; translate_sentence
# tokenizes and lower-cases it internally.
sentence = "Ein kleines Mädchen sitzt vor einem großen gemalten Regenbogen."
print(translate_sentence(my_model, sentence, german, english, device))

# Google's translation, for comparison: A little girl is sitting in front of a large painted rainbow.

['a', 'little', 'girl', 'is', 'sitting', 'in', 'front', 'of', 'a', 'large', 'painted', 'rainbow', '.', '<eos>']


In [6]:
# Translate a second raw German string; the printed output below contains an
# <unk> token and deviates from the reference translation.
sentence = "Eine große Menschenmenge steht außen vor dem Eingang einer Metrostation."
print(translate_sentence(my_model, sentence, german, english, device))

# Google's translation, for comparison: A large crowd stands outside the entrance of a metro station.

['a', 'large', 'crowd', 'of', 'people', 'stand', 'outside', 'of', 'the', '<unk>', 'while', 'wearing', 'front', 'of', 'a', 'metro', 'station', '.', '<eos>']
