#Table of Contents

1. Import Libraries
2. Load Dataset
3. Create Field Objects
4. Data Preparation
  - Build Vocabulary
  - Create Dataloaders
  
5. Define Model Architecture
  - Encoder Architecture
  - Decoder Architecture
  - Sequence-to-Sequence Architecture
6. Train Sequence-to-Sequence Model
7. Model Inference
  - Build Inference Function
  - Translate Russian Sentences in the Test Dataset

In [1]:
# NOTE(review): the recorded output below shows this pin failing (pip only offers
# torch >= 1.11.0 in this runtime), so torch 1.4.0 is not actually installed by this cell.
!pip install torch==1.4.0

[31mERROR: Could not find a version that satisfies the requirement torch==1.4.0 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.4.0[0m[31m
[0m

In [2]:
!pip install torchtext==0.4.0

Collecting torchtext==0.4.0
  Downloading torchtext-0.4.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m466.1 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchtext==0.4.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchtext==0.4.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchtext==0.4.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->torchtext==0.4.0)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->torchtext==0.4.0)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410

#1. Import Libraries

In [3]:
import re
import time
import math
import random

import numpy as np
import pandas as pd
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

from tqdm import notebook
pd.set_option('display.max_colwidth', 200)

In [4]:
# check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


#2. Load Dataset

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# extract the zip file from your Google Drive
!unzip /content/drive/MyDrive/Courses/Natural_Language_Processing_NLP_Using_Deep_Learning/Project_Translating_Text_from_Russian_to_English/nmt_data.zip

Archive:  /content/drive/MyDrive/Courses/Natural_Language_Processing_NLP_Using_Deep_Learning/Project_Translating_Text_from_Russian_to_English/nmt_data.zip
  inflating: nmt_data_test.csv       
  inflating: nmt_data.csv            


In [7]:
# read dataset from the Google drive
df = pd.read_csv("nmt_data.csv")
test_df = pd.read_csv("nmt_data_test.csv")

# shape of datasets
df.shape, test_df.shape

((187053, 2), (46668, 2))

In [8]:
df.sample(10)

Unnamed: 0,rus,eng
127816,том порезал палец осколком стекла,tom cut his finger on a piece of glass
113617,насколько большой ящик вам нужен,how big a box do you need
126219,обращайтесь,you're welcome
68845,все пришли кроме тебя,everyone had come except you
172296,дайте мне половину,give me half
100281,я пил сок,i drank juice
18083,я должен начать готовиться,i have to start getting ready
181998,студенты должны больше учиться,the students ought to study more
17689,дни становятся длиннее,the days are growing longer
88522,я бы хотел заплатить наличными,i'd like to pay in cash


In [9]:
test_df.sample(10)

Unnamed: 0,rus,eng
15679,вы выглядите уставшим,you look tired
45020,не знаю чего я ждал,i don't know what i expected
45303,я сказал тому остаться дома,i told tom to stay at home
16004,этот художник умер молодым,this artist died young
22920,том склонен опаздывать,tom is apt to be late
39108,они меня подозревают,do they suspect me
31723,ладно пока,well see you later
42421,я научусь,i will learn
5657,что именно вы думаете,what exactly are you thinking
5955,вы тома случайно не знаете,do you know tom by any chance


#3. Create Field Objects

In [10]:
# import Russian spacy model to tokenize Russian text
from spacy.lang.ru import Russian

In [11]:
# dependency for spaCy Russian tokenizer
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dawg-python>=0.7.1 (from pymorphy2)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4 (from pymorphy2)
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docopt>=0.6 (from pymorphy2)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docopt
  Building wheel for docopt (setup.py) ... [?25l[?25hdone
  Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl

In [12]:
# spacy object for Russian (tokenize_ru uses only its .tokenizer)
nlp_ru = Russian()

# spacy object for English; parser, tagger and NER are disabled at load time,
# and tokenize_en uses only its .tokenizer
nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])

In [13]:
## functions to perform tokenization

# split a Russian string into a list of token strings
def tokenize_ru(text):
  """Return the text of every token the Russian spaCy tokenizer produces for `text`."""
  spacy_tokens = nlp_ru.tokenizer(text)
  return [token.text for token in spacy_tokens]

# split an English string into a list of token strings
def tokenize_en(text):
  """Return the text of every token the English spaCy tokenizer produces for `text`."""
  spacy_tokens = nlp_en.tokenizer(text)
  return [token.text for token in spacy_tokens]

In [15]:
## Create Field objects

# Field object for Russian (source language)
SRC = data.Field(tokenize = tokenize_ru,
                 include_lengths = True, # batches expose (tensor, lengths); later cells read element [0]
                 lower = True)

# Field object for English (target language)
TRG = data.Field(tokenize = tokenize_en,
                 init_token = '<sos>', # "start" token
                 eos_token = '<eos>', # "end" token
                 include_lengths = True,
                 lower = True)

# (column name, Field) pairs for the two CSV columns: 'rus' then 'eng'
fields = [('rus', SRC), ('eng', TRG)]

* refer to the video "Text preprocessing in PyTorch" in the course "Fundamentals of Deep Learning" to learn more about TorchText's Field objects

#4. Data Preparation

###4.1 Build Vocabulary


In [None]:
# importing data from csv
# nmt_data.csv begins with a "rus,eng" header row (pandas reads it as the column names);
# skip_header=True keeps that row from being loaded as a training example
nmt_data = data.TabularDataset(path="nmt_data.csv", format='csv', fields=fields, skip_header=True)

In [None]:
# build vocabulary for Russian sequences
SRC.build_vocab(nmt_data, max_size=4000)

# build vocabulary for English sequences
TRG.build_vocab(nmt_data, max_size=4000)

In [None]:
# check size of vocabulary
len(SRC.vocab), len(TRG.vocab)

(4002, 4004)

In [None]:
# special tokens in input sequences (Russian)
SRC.vocab.itos[0], SRC.vocab.itos[1]

('<unk>', '<pad>')

In [None]:
# special tokens in output sequences (English)
TRG.vocab.itos[0], TRG.vocab.itos[1], TRG.vocab.itos[2], TRG.vocab.itos[3]

('<unk>', '<pad>', '<sos>', '<eos>')

###4.2 Create Dataloaders

In [None]:
# Split the parallel corpus into training (80%) and validation (20%) sets.
# Seed Python's RNG first and hand its state to split() so the shuffle -- and therefore
# the train/validation membership -- is the same on every run.
SPLIT_SEED = 1234
random.seed(SPLIT_SEED)
train_data, val_data = nmt_data.split(split_ratio=0.8, random_state=random.getstate())

In [None]:
# Create one iterator per split (training and validation)
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = 64,
    sort_within_batch = True,
    sort_key = lambda x:len(x.rus), # sort key = number of Russian tokens in the example
    device = device) # use the device selected earlier (GPU when available)

#5. Define Model Architecture

###5.1 Encoder Architecture

In [None]:
## Encoder = embedding layer followed by a multi-layer GRU
##
## embedding layer:
##    input dimensions  = input_dim (size of Russian vocabulary)
##    output dimensions = emb_dim
##
## GRU layer:
##    input dimensions = emb_dim
##    hidden units     = hid_dim
##    layers           = n_layers
##    output dim       = hid_dim

class Encoder(nn.Module):
  """Embeds a batch of source token indices and runs them through a GRU.

  forward() returns only the GRU's final hidden state
  ([n layers, batch size, hid dim]); the per-time-step outputs are discarded.
  """

  def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.hid_dim, self.n_layers = hid_dim, n_layers

    # token index -> dense vector of size emb_dim
    self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)

    # recurrent stack that consumes the embedded sequence
    self.gru = nn.GRU(input_size=emb_dim,
                      hidden_size=hid_dim,
                      num_layers=n_layers,
                      dropout=dropout)

  def forward(self, src):
    # src:           [src len, batch size]
    # token_vectors: [src len, batch size, emb dim]
    token_vectors = self.embedding(src)

    # the GRU returns (outputs, hidden); only hidden is needed downstream
    # final_hidden:  [n layers, batch size, hid dim]
    _, final_hidden = self.gru(token_vectors)

    return final_hidden

###5.2 Decoder Architecture

In [None]:
## Decoder = embedding layer -> multi-layer GRU -> fully connected layer
##
## embedding layer:
##    input dimensions  = output_dim (size of English vocabulary)
##    output dimensions = emb_dim
##
## GRU layer:
##    input dimensions = emb_dim
##    hidden units     = hid_dim
##    layers           = n_layers
##    output dim       = hid_dim
##
## Fully Connected layer:
##    input dimensions  = hid_dim
##    output dimensions = output_dim (size of English vocabulary)

class Decoder(nn.Module):
  """Single-step decoder: one target token per example in, vocabulary scores out.

  forward() takes the current token indices ([batch size]) and the previous
  hidden state, and returns (prediction, hidden) where prediction is
  [batch size, output dim] and hidden is [n layers, batch size, hid dim].
  """

  def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
    super().__init__()

    self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers

    # token index -> dense vector of size emb_dim
    self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)

    # recurrent stack advanced one time step per call
    self.gru = nn.GRU(input_size=emb_dim,
                      hidden_size=hid_dim,
                      num_layers=n_layers,
                      dropout=dropout)

    # projects the GRU output onto the target vocabulary
    self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)

  def forward(self, input, hidden):
    # add a length-1 time axis: [batch size] -> [1, batch size]
    step_tokens = input.unsqueeze(0)

    # step_embedded: [1, batch size, emb dim]
    step_embedded = self.embedding(step_tokens)

    # step_output: [1, batch size, hid dim]  (sequence length is 1 here)
    # hidden:      [n layers, batch size, hid dim]
    step_output, hidden = self.gru(step_embedded, hidden)

    # drop the time axis again before the projection
    # prediction:  [batch size, output dim]
    prediction = self.fc_out(step_output.squeeze(0))

    return prediction, hidden

###5.3 Sequence-to-Sequence Architecture

In [None]:
class Seq2Seq(nn.Module):
  """Ties an encoder and a decoder together into one trainable model.

  The encoder's return value is used as the decoder's initial hidden state,
  and the decoder is then unrolled one target position at a time.
  """

  def __init__(self, encoder, decoder, device):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

  def forward(self, src, trg, teacher_forcing_ratio=1.0):
    """Return decoder scores for every target position.

    Args:
      src: source token indices, [src len, batch size]
      trg: target token indices, [trg len, batch size]; row 0 is fed to the
        decoder as its first input
      teacher_forcing_ratio: probability, per time step, of feeding the
        ground-truth token as the next decoder input instead of the decoder's
        own argmax prediction. The default 1.0 always feeds the ground truth.

    Returns:
      Tensor [trg len, batch size, output dim]. Row 0 is never written and
      stays all zeros; callers slice it off with [1:].
    """
    trg_len, batch_size = trg.shape[0], trg.shape[1]
    trg_vocab_size = self.decoder.output_dim

    # tensor to store decoder outputs, allocated directly on the target device
    outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

    # last hidden state of the encoder is used as the initial hidden state of the decoder
    hidden = self.encoder(src)

    # first input to the decoder is row 0 of trg (the <sos> tokens)
    dec_input = trg[0, :]

    for t in range(1, trg_len):

      # feed the current tokens and the previous hidden state,
      # receive the scores for position t and the new hidden state
      output, hidden = self.decoder(dec_input, hidden)

      # store the scores for position t
      outputs[t] = output

      # the RNG is only consulted when the ratio is below 1.0, so the default
      # path does not touch Python's random state
      use_ground_truth = teacher_forcing_ratio >= 1.0 or random.random() < teacher_forcing_ratio
      dec_input = trg[t, :] if use_ground_truth else output.argmax(1)

    return outputs

#6. Train Seq2Seq Model

In [None]:
# set hyperparameters
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.3
DEC_DROPOUT = 0.3

# instantiate Encoder and Decoder
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# instantiate Sequence-to-Sequence Model
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# print model architecture
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(4002, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(4004, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
    (fc_out): Linear(in_features=256, out_features=4004, bias=True)
  )
)

In [None]:
# count the trainable parameters of a model
def count_parameters(model):
  """Return the total number of elements across all parameters that require gradients."""
  total = 0
  for param in model.parameters():
    # frozen parameters (requires_grad == False) are left out of the count
    if param.requires_grad:
      total += param.numel()
  return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 3,169,020 trainable parameters


In [None]:
# Adam optimizer (no hyperparameters are passed, so the library defaults apply)
optimizer = optim.Adam(model.parameters())

# pad token index
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# cross entropy loss with softmax; target positions equal to the pad index are ignored
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return the mean loss per batch.

  Args:
    model: called as model(src, trg); must return [trg len, batch size, output dim]
    iterator: sized iterable of batches exposing .rus and .eng as
      (tensor, lengths) pairs
    optimizer: optimizer over the model's parameters
    criterion: loss called as criterion(scores, targets)

  Returns:
    Sum of the per-batch losses divided by len(iterator).
  """
  model.train()

  epoch_loss = 0

  # wrap the iterator itself rather than enumerate(iterator): enumerate has no
  # length, which left the progress bar without a total
  for batch in notebook.tqdm(iterator):

    # clear the gradients left over from the previous batch
    optimizer.zero_grad()

    # get integer sequences (tensors); element [0] is the token tensor,
    # element [1] holds the sequence lengths
    src = batch.rus[0]
    trg = batch.eng[0]

    # pass the Russian and English tensor batches to the sequence-to-sequence model
    output = model(src, trg)

    #trg = [trg len, batch size]
    #output = [trg len, batch size, output dim]

    output_dim = output.shape[-1]

    # drop position 0 (never predicted) and flatten time and batch together
    output = output[1:].view(-1, output_dim)
    #output = [(trg len - 1) * batch size, output dim]

    trg = trg[1:].view(-1)
    #trg = [(trg len - 1) * batch size]

    # compute loss
    loss = criterion(output, trg)

    # backpropagate loss
    loss.backward()

    # update weights
    optimizer.step()

    epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
  """Score the model on every batch of `iterator` without updating weights.

  Puts the model in eval mode, disables gradient tracking, and returns the
  sum of the per-batch losses divided by len(iterator).
  """
  model.eval()

  total_loss = 0

  with torch.no_grad():
    for batch in iterator:
      # element [0] of each field is the token tensor
      source_seq = batch.rus[0]
      target_seq = batch.eng[0]

      # scores: [trg len, batch size, output dim]
      scores = model(source_seq, target_seq)
      vocab_dim = scores.shape[-1]

      # skip position 0 and flatten:
      #   flat_scores:  [(trg len - 1) * batch size, output dim]
      #   flat_targets: [(trg len - 1) * batch size]
      flat_scores = scores[1:].view(-1, vocab_dim)
      flat_targets = target_seq[1:].view(-1)

      total_loss += criterion(flat_scores, flat_targets).item()

  return total_loss / len(iterator)

In [None]:
# split an elapsed wall-clock interval into whole minutes and whole seconds
def epoch_time(start_time, end_time):
  """Return (minutes, seconds) elapsed between two timestamps given in seconds.

  Both values are truncated to integers; seconds is the remainder after the
  whole minutes have been removed.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - (whole_minutes * 60))
  return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 10

# lowest validation loss seen so far; +inf guarantees the first epoch is checkpointed
best_valid_loss = float('inf')

# start model training
for epoch in range(N_EPOCHS):

  start_time = time.time()

  # one pass over the training batches, then score the validation batches
  train_loss = train(model, train_iterator, optimizer, criterion)
  valid_loss = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # compare validation loss; only the best-scoring weights are kept on disk
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'best_model.pt')

  # PPL is reported as exp(loss)
  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 01 | Time: 1m 28s
	Train Loss: 2.928 | Train PPL:  18.683
	 Val. Loss: 2.070 |  Val. PPL:   7.925


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 02 | Time: 1m 28s
	Train Loss: 1.807 | Train PPL:   6.091
	 Val. Loss: 1.611 |  Val. PPL:   5.005


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 03 | Time: 1m 26s
	Train Loss: 1.448 | Train PPL:   4.254
	 Val. Loss: 1.429 |  Val. PPL:   4.173


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 04 | Time: 1m 28s
	Train Loss: 1.263 | Train PPL:   3.535
	 Val. Loss: 1.339 |  Val. PPL:   3.816


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 05 | Time: 1m 27s
	Train Loss: 1.145 | Train PPL:   3.143
	 Val. Loss: 1.286 |  Val. PPL:   3.619


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 06 | Time: 1m 27s
	Train Loss: 1.063 | Train PPL:   2.896
	 Val. Loss: 1.254 |  Val. PPL:   3.505


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 07 | Time: 1m 26s
	Train Loss: 1.000 | Train PPL:   2.720
	 Val. Loss: 1.237 |  Val. PPL:   3.444


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 08 | Time: 1m 27s
	Train Loss: 0.952 | Train PPL:   2.590
	 Val. Loss: 1.225 |  Val. PPL:   3.403


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 09 | Time: 1m 25s
	Train Loss: 0.914 | Train PPL:   2.494
	 Val. Loss: 1.219 |  Val. PPL:   3.385


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


Epoch: 10 | Time: 1m 23s
	Train Loss: 0.882 | Train PPL:   2.415
	 Val. Loss: 1.215 |  Val. PPL:   3.369


#7. Model Inference

In [None]:
# load saved model weights
# map_location places the checkpoint's tensors on the current device, so weights
# saved on a GPU runtime can still be loaded when only a CPU is available
path = 'best_model.pt'
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

###7.1 Build Inference Function

In [None]:
# function to perform translation
def translate_sentence(sentence, model, max_len=50):
  """Greedily translate one Russian sentence into English.

  Relies on the notebook-level objects nlp_ru, SRC, TRG and device.

  Args:
    sentence: Russian text as a single string
    model: model exposing .encoder(src) and .decoder(token, hidden)
    max_len: upper bound on the number of generated tokens; decoding stops
      there even if '<eos>' has not been predicted, so the loop always ends

  Returns:
    The predicted English tokens joined by single spaces, without the
    '<sos>' / '<eos>' markers. An input with no tokens yields "".
  """
  # set model to evaluation mode
  model.eval()

  # tokenize and lower-case each token the way the SRC field does (lower=True),
  # then convert the tokens to vocabulary indices
  token_int = [SRC.vocab.stoi[tok.text.lower()] for tok in nlp_ru.tokenizer(sentence)]

  # nothing to encode
  if not token_int:
    return ""

  sos_idx = TRG.vocab.stoi[TRG.init_token]
  eos_idx = TRG.vocab.stoi[TRG.eos_token]

  # initialize the list with the start token's index
  trg_indexes = [sos_idx]

  # inference only: no gradients are needed for the encoder or the decoder
  with torch.no_grad():
    # column vector of indices: [src len, 1]
    src_tensor = torch.tensor(token_int, dtype=torch.long, device=device).unsqueeze(1)

    # pass the tensor to the encoder and get the context vector (hidden)
    hidden = model.encoder(src_tensor)

    for _ in range(max_len):
      # feed the most recent token together with the current hidden state
      trg_tensor = torch.tensor([trg_indexes[-1]], dtype=torch.long, device=device)
      output, hidden = model.decoder(trg_tensor, hidden)

      # get index of the largest value
      pred_token = output.argmax(1).item()

      # stop at the end token without storing it
      if pred_token == eos_idx:
        break
      trg_indexes.append(pred_token)

  # convert integers to tokens, skipping the leading start token
  return " ".join(TRG.vocab.itos[i] for i in trg_indexes[1:])

In [None]:
# actual translation "is it working"
sent = "это работает"
translate_sentence(sent, model)

'it works'

###7.2 Translate Russian Sentences in the Test Dataset

In [None]:
translations = [translate_sentence(sent, model) for sent in notebook.tqdm(test_df["rus"])]

HBox(children=(FloatProgress(value=0.0, max=46668.0), HTML(value='')))




In [None]:
# add translations to the test dataframe
test_df["translations"] = translations

In [None]:
test_df.sample(20)

Unnamed: 0,rus,eng,translations
42227,это совершенно секретно,this is top secret,this is totally <unk>
40378,он принял решение повторить попытку,he made up his mind to try again,he <unk> the importance of <unk>
37795,это была долгая неделя,this has been a long week,it was a long week
42201,том бы тобой очень гордился,tom would be very proud of you,tom would 've been very proud of you
13958,я с трудом нашёл тома,i had a hard time finding tom,i 've met tom 's cat
38281,это очень необычно,this is really unusual,this is very thin
41192,я рада что ты согласна,i'm glad you agree,i 'm glad you agree
26739,смех заразителен,laughter is infectious,the <unk> is <unk>
32373,мы провели день на пляже,we spent the day at the beach,we took a meeting at all
26020,почему ты не здесь,why aren't you here,why are n't you here


In [None]:
# save translations to a CSV file in the current working directory
# (this cell only writes the file; downloading it to a local system is a separate step)
test_df.to_csv("nmt_test_translations.csv", index=False)