In [None]:
#Data Preparation and pre-processing


!pip install torchtext==0.6.0 --quiet
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import pandas as pd
import spacy, random

## Download the spaCy language models (tokenizers) for English and German.
!python -m spacy download en_core_web_sm --quiet
!python -m spacy download de_core_news_sm --quiet

# Load the German pipeline first, then the English one; the tokenize_*
# helpers in this file only use the `.tokenizer` of each pipeline.
spacy_german, spacy_english = (
    spacy.load(model_name)
    for model_name in ("de_core_news_sm", "en_core_web_sm")
)

def tokenize_german(text):
    """Return the list of token strings spaCy's German tokenizer yields for `text`."""
    doc = spacy_german.tokenizer(text)
    return [tok.text for tok in doc]

def tokenize_english(text):
    """Return the list of token strings spaCy's English tokenizer yields for `text`."""
    doc = spacy_english.tokenizer(text)
    return [tok.text for tok in doc]

# Both languages share the same preprocessing: lower-case the text and wrap
# every sentence in <sos> ... <eos> markers.
field_kwargs = dict(lower=True, init_token="<sos>", eos_token="<eos>")
german = Field(tokenize=tokenize_german, **field_kwargs)
english = Field(tokenize=tokenize_english, **field_kwargs)

# German (.de) is the source side, English (.en) the target side.
train_data, valid_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english),
)

# Vocabularies are built from the training split only (German first).
for field in (german, english):
    field.build_vocab(train_data, max_size=10000, min_freq=3)

print(f"Unique tokens in source (de) vocabulary: {len(german.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(english.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

# Batches are sorted internally by source-sentence length.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda example: len(example.src),
    device=device,
)

max_len_ger = []
max_len_eng = []
count = 0

# Dataset sneak peek: record the token count of every sentence pair and
# print the first ten pairs (already split into tokens).
for count, data in enumerate(train_data, start=1):
    german_tokens, english_tokens = data.src, data.trg
    max_len_ger.append(len(german_tokens))
    max_len_eng.append(len(english_tokens))
    if count > 10:
        continue
    print("German - ", *german_tokens, " Length - ", len(german_tokens))
    print("English - ", *english_tokens, " Length - ", len(english_tokens))
    print()

longest_eng, longest_ger = max(max_len_eng), max(max_len_ger)
print("Maximum Length of English Sentence {} and German Sentence {} in the dataset".format(longest_eng, longest_ger))
shortest_eng, shortest_ger = min(max_len_eng), min(max_len_ger)
print("Minimum Length of English Sentence {} and German Sentence {} in the dataset".format(shortest_eng, shortest_ger))

#Encoder code implementation

class EncoderLSTM(nn.Module):
    """Embed a source token sequence and encode it with a stacked LSTM.

    Only the final hidden and cell states are returned; the per-step LSTM
    outputs are discarded.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, applied to the embeddings and between
            stacked LSTM layers.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super().__init__()

        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Regularization applied to the embedded input
        self.dropout = nn.Dropout(p)
        # Not read anywhere in the visible code; kept so existing users of
        # the attribute keep working.
        self.tag = True

        # Weight shape: (input_size, embedding_size)
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)

        # nn.LSTM only applies dropout *between* stacked layers; passing a
        # non-zero value with a single layer has no effect and triggers a
        # warning, so it is disabled in that case.
        lstm_dropout = p if num_layers > 1 else 0.0
        self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers,
                            dropout=lstm_dropout)

    def forward(self, x):
        """Encode a batch of token indices.

        Args:
            x: Integer tensor of shape (sequence_length, batch_size).

        Returns:
            Tuple (hidden_state, cell_state), each of shape
            (num_layers, batch_size, hidden_size).
        """
        # (sequence_length, batch_size, embedding_size)
        embedding = self.dropout(self.embedding(x))

        # The per-step outputs (sequence_length, batch_size, hidden_size)
        # are not needed; only the final states are used as context.
        _, (hidden_state, cell_state) = self.LSTM(embedding)

        return hidden_state, cell_state

# Encoder hyper-parameters
input_size_encoder = len(german.vocab)  # source (German) vocabulary size
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = 0.5

encoder_lstm = EncoderLSTM(
    input_size_encoder,
    encoder_embedding_size,
    hidden_size,
    num_layers,
    encoder_dropout,
)
encoder_lstm = encoder_lstm.to(device)
print(encoder_lstm)

#Decoder code implementation

class DecoderLSTM(nn.Module):
    """Single-step LSTM decoder producing scores over the target vocabulary.

    Each call consumes one token per batch element together with the
    previous hidden/cell states and returns the scores for the next token
    plus the updated states.

    Args:
        input_size: Number of rows in the embedding table (target vocabulary size).
        embedding_size: Dimension of each token embedding.
        hidden_size: Dimension of the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
        p: Dropout probability, applied to the embeddings and between
            stacked LSTM layers.
        output_size: Number of scores produced per step (target vocabulary size).
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
        super().__init__()

        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size

        # Regularization applied to the embedded input
        self.dropout = nn.Dropout(p)
        # Not read anywhere in the visible code; kept so existing users of
        # the attribute keep working.
        self.tag = True

        # Weight shape: (input_size, embedding_size)
        self.embedding = nn.Embedding(self.input_size, self.embedding_size)

        # nn.LSTM only applies dropout *between* stacked layers; passing a
        # non-zero value with a single layer has no effect and triggers a
        # warning, so it is disabled in that case.
        lstm_dropout = p if num_layers > 1 else 0.0
        self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers,
                            dropout=lstm_dropout)

        # Projects hidden_size features to output_size scores
        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, hidden_state, cell_state):
        """Run one decoding step.

        Args:
            x: Integer tensor of shape (batch_size,) with the previous token ids.
            hidden_state: Tensor of shape (num_layers, batch_size, hidden_size).
            cell_state: Tensor of shape (num_layers, batch_size, hidden_size).

        Returns:
            Tuple (predictions, hidden_state, cell_state) where predictions
            has shape (batch_size, output_size) and the states keep their
            input shapes.
        """
        # Add a sequence dimension of length 1: (1, batch_size)
        x = x.unsqueeze(0)

        # (1, batch_size, embedding_size)
        embedding = self.dropout(self.embedding(x))

        # outputs: (1, batch_size, hidden_size)
        outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))

        # (1, batch_size, output_size) -> (batch_size, output_size)
        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden_state, cell_state

# Decoder hyper-parameters; input and output both range over the target
# (English) vocabulary.
input_size_decoder = len(english.vocab)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = 0.5
output_size = len(english.vocab)

decoder_lstm = DecoderLSTM(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    num_layers,
    decoder_dropout,
    output_size,
)
decoder_lstm = decoder_lstm.to(device)
print(decoder_lstm)

#Seq2Seq (encoder-decoder code implementation)

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target step at a time.

    Args:
        Encoder_LSTM: Module called as ``encoder(source)`` that returns
            ``(hidden_state, cell_state)``.
        Decoder_LSTM: Module called as ``decoder(x, hidden_state, cell_state)``
            that returns ``(predictions, hidden_state, cell_state)`` and
            exposes an ``output_size`` attribute (number of scores per step).
    """

    def __init__(self, Encoder_LSTM, Decoder_LSTM):
        super().__init__()
        # Use the modules that were passed in rather than module-level globals.
        self.Encoder_LSTM = Encoder_LSTM
        self.Decoder_LSTM = Decoder_LSTM

    def forward(self, source, target, tfr=0.5):
        """Compute per-step target-vocabulary scores for a batch.

        Args:
            source: Integer tensor of shape (source_len, batch_size).
            target: Integer tensor of shape (target_len, batch_size); row 0
                is used as the first decoder input.
            tfr: Teacher-forcing ratio, the probability of feeding the
                ground-truth token (instead of the model's own prediction)
                as the next decoder input.

        Returns:
            Tensor of shape (target_len, batch_size, output_size). Row 0 is
            left as zeros because no prediction is made for the first token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = self.Decoder_LSTM.output_size

        # Allocate on the same device as the inputs instead of relying on a
        # global `device`.
        outputs = torch.zeros(target_len, batch_size, target_vocab_size,
                              device=source.device)

        # The encoder's final states serve as the decoder's initial states.
        hidden_state, cell_state = self.Encoder_LSTM(source)

        # First decoder input: row 0 of the target (the start token)
        x = target[0]

        for i in range(1, target_len):
            # Thread the decoder's own updated states through the loop so
            # each step sees what was decoded before it.
            output, hidden_state, cell_state = self.Decoder_LSTM(x, hidden_state, cell_state)
            outputs[i] = output

            # output is (batch_size, output_size); argmax over dim 1 picks
            # the highest-scoring vocabulary index per batch element.
            best_guess = output.argmax(1)

            # Teacher forcing: feed the ground truth with probability tfr,
            # otherwise the model's own prediction.
            x = target[i] if random.random() < tfr else best_guess

        return outputs

# Wrap the encoder and decoder into one model and move it to the target
# device (Module.to moves parameters in place and returns the module).
model = Seq2Seq(encoder_lstm, decoder_lstm)
model.to(device)

print(model)

#Seq2Seq model training
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import torch.optim as optim
#from torch.utils import translate_sentence, bleu, load_checkpoint, save_checkpoint

# ---- Training configuration ----
load_model = False
epoch_loss = 0.0
num_epochs = 1
best_loss = float("inf")
best_epoch = -1
sentence1 = "ein mann in einem blauen hemd steht auf einer leiter und putzt ein fenster"
ts1 = []
step = 0
learning_rate = 0.001
patience = 10  # epochs without improvement before stopping early

# Padding positions carry no information, so exclude them from the loss.
pad_idx = english.vocab.stoi[english.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#if load_model:
  #load_checkpoint(torch.load('my_checkpoint.pth.ptar', model, optimizer))


for epoch in range(num_epochs):
    print("Epoch - {} / {}".format(epoch+1, num_epochs))
    model.eval()
    #translated_sentence1 = translate_sentence(model, sentence1, german, english, device, max_length=50)
    #print(f"Translated example sentence 1: \n {translated_sentence1}")
    #ts1.append(translated_sentence1)

    model.train(True)
    # Start every epoch from zero so epochs can be compared with each other.
    epoch_loss = 0.0
    for batch in train_iterator:
        source = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward pass: (target_len, batch_size, vocab_size)
        output = model(source, target)

        # Drop row 0 (no prediction is made for the start token) and flatten
        # so the criterion sees (N, vocab_size) scores against (N,) labels.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # Clear gradients left over from the previous batch
        optimizer.zero_grad()

        loss = criterion(output, target)

        # Back-propagate, then clip the gradient norm to at most 1
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Update the weights
        optimizer.step()
        step += 1
        epoch_loss += loss.item()

    mean_epoch_loss = epoch_loss / len(train_iterator)

    if epoch_loss < best_loss:
        best_loss = epoch_loss
        best_epoch = epoch
        #checkpoint={'state_dict':model.state_dict(), 'optimizer':optimizer.state_dict()}
        #save_checkpoint(checkpoint)
    elif (epoch - best_epoch) >= patience:
        # Only reachable when this epoch did NOT improve on the best one.
        print("no improvement in 10 epochs, break")
        break
    # Report the mean loss over the whole epoch, not just the last batch.
    print("Epoch_Loss - {}".format(mean_epoch_loss))
    print()

# Mean training loss per batch of the last epoch that ran
print(epoch_loss / len(train_iterator))

#score = bleu(test_data[1:100], model, german, english, device)
#print(f"Bleu score {score*100:.2f}")




[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m61.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Unique tokens in source (de) vocabulary: 5374
Unique tokens in target (en) vocabulary: 4556
German -  zwei junge weiße männer sind im freien in der nähe vieler büsche .  Length -  13
English -  two young , white males are outside near many bushes .  Length -  11

German -  mehrere männer mit schutzhelmen bedienen ein antriebsradsystem .  Length -  8
English -  several men in hard hats are operating a giant pulley system .  Length -  12

German -  ein kleines mädchen klettert in ein spielhaus aus holz .  Length -