#Table of Contents

1. Import Libraries
2. Load Dataset
3. Create Field Objects
4. Data Preparation
  - Build Vocabulary
  - Create Dataloaders
  
5. Define Model Architecture
  - Encoder Architecture
  - Attention Mechanism
  - Decoder Architecture
  - Sequence-to-Sequence Architecture
6. Train Sequence-to-Sequence Model
7. Model Inference
  - Build Inference Function
  - Translate Russian Sentences in the Test Dataset

In [1]:
# NOTE(review): the recorded output of this cell shows the pin failing
# ("No matching distribution found for torch==1.4.0"; only 1.11.0 and newer are
# offered), so on this runtime the cell installs nothing and the notebook runs
# on whatever torch version is already present.
!pip install torch==1.4.0

[31mERROR: Could not find a version that satisfies the requirement torch==1.4.0 (from versions: 1.11.0, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 2.0.0, 2.0.1, 2.1.0, 2.1.1, 2.1.2, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.3.1)[0m[31m
[0m[31mERROR: No matching distribution found for torch==1.4.0[0m[31m
[0m

In [17]:
!pip install torchtext==0.4.0

Collecting torchtext==0.4.0
  Using cached torchtext-0.4.0-py3-none-any.whl (53 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchtext==0.4.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchtext==0.4.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchtext==0.4.0)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->torchtext==0.4.0)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->torchtext==0.4.0)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->torchtext==0.4.0)
  Using cached nvidia_cufft_cu12-11.0.2.54-

#1. Import Libraries

In [1]:
import re
import time
import math
import random

import numpy as np
import pandas as pd
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

from tqdm import notebook
pd.set_option('display.max_colwidth', 200)

In [13]:

# fix the random seeds so that the train/validation split, the weight
# initialisation and dropout are repeatable across "Restart & Run All"
# (GPU kernels may still introduce small non-deterministic differences)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


#2. Load Dataset

In [5]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# extract the zip file from your Google Drive
!unzip /content/drive/MyDrive/Courses/Natural_Language_Processing_NLP_Using_Deep_Learning/Project_Translating_Text_from_Russian_to_English/nmt_data.zip

Archive:  /content/drive/MyDrive/Courses/Natural_Language_Processing_NLP_Using_Deep_Learning/Project_Translating_Text_from_Russian_to_English/nmt_data.zip
  inflating: nmt_data_test.csv       
  inflating: nmt_data.csv            


#3. Create Field Objects

In [2]:

# import Russian spacy model to tokenize Russian text
from spacy.lang.ru import Russian

In [3]:
# dependency for spaCy Russian tokenizer
!pip install pymorphy2



In [4]:

# spacy object for Russian
nlp_ru = Russian()

# spacy object for English
nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])

In [5]:

## functions to perform tokenization

# split a Russian string into a list of token strings
def tokenize_ru(text):
  """Return the text of every token that `nlp_ru.tokenizer` produces for `text`."""
  doc = nlp_ru.tokenizer(text)
  return [token.text for token in doc]

# split an English string into a list of token strings
def tokenize_en(text):
  """Return the text of every token that `nlp_en.tokenizer` produces for `text`."""
  doc = nlp_en.tokenizer(text)
  return [token.text for token in doc]

In [7]:

## Create Field objects
## Both fields lower-case their text and set include_lengths=True; the training
## code later indexes each batch attribute with [0] to get the token tensor.

# Field object for Russian
SRC = data.Field(tokenize = tokenize_ru,
                 include_lengths = True,
                 lower = True)

# Field object for English
TRG = data.Field(tokenize = tokenize_en,
                 init_token = '<sos>', # "start" token
                 eos_token = '<eos>', # "end" token
                 include_lengths = True,
                 lower = True)

# (attribute name, Field) pairs handed to TabularDataset when the CSV is read
fields = [('rus', SRC), ('eng', TRG)]

* Refer to the video "Text preprocessing in PyTorch" in the course "Fundamentals of Deep Learning" to learn more about TorchText's Field objects.

#4. Data Preparation

###4.1 Build Vocabulary

In [8]:

# importing data from csv
nmt_data = data.TabularDataset(path="nmt_data.csv", format='csv', fields=fields)

In [9]:

# build vocabulary for Russian sequences
# (max_size caps the regular tokens; special tokens come on top, which is why
# the sizes printed by the next cell are 4002 and 4004 rather than 4000)
SRC.build_vocab(nmt_data, max_size=4000)

# build vocabulary for English sequences
TRG.build_vocab(nmt_data, max_size=4000)

In [10]:

# check size of vocabulary
len(SRC.vocab), len(TRG.vocab)

(4002, 4004)

###4.2 Create Dataloaders

In [11]:

# Split the translation pairs into training (80%) and validation (20%) sets
train_data, val_data = nmt_data.split(split_ratio=0.8)

In [14]:

# Create one batch iterator per split (training and validation)
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = 64,
    sort_within_batch = True,
    sort_key = lambda x:len(x.rus),  # order examples by number of Russian tokens
    device = device)

#5. Define Model Architecture

###5.1 Encoder Architecture

In [15]:


## embedding layer:
##    input dimensions = size of Russian vocabulary
##    ouput dimensions = embedding_size

## GRU layer:
##    input dimensions = embedding_size
##    hidden units = hidden_size
##    layers = num_layers
##    output dim = hidden_size

class Encoder(nn.Module):
  """Embed source token ids and run them through a multi-layer GRU.

  Args:
    hidden_size: number of hidden units in each GRU layer.
    embedding_size: dimensionality of the token embeddings.
    num_layers: number of stacked GRU layers.
    dropout: dropout probability handed to the GRU.

  The embedding table is sized from the module-level `SRC.vocab`, so the
  Russian vocabulary has to be built before an Encoder is constructed.
  """

  def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3):
    super().__init__()

    # hyper-parameters, kept for inspection
    self.hidden_size, self.embedding_size = hidden_size, embedding_size
    self.num_layers, self.dropout = num_layers, dropout

    # source-side embedding table (the Decoder builds its own, separate table)
    source_vocab_size = len(SRC.vocab)
    self.embedding = nn.Embedding(source_vocab_size, embedding_size)

    # recurrent layer; batch_first is left at its default, so input is sequence-first
    self.gru = nn.GRU(input_size=embedding_size,
                      hidden_size=hidden_size,
                      num_layers=num_layers,
                      dropout=dropout)

  def forward(self, input_sequence):
    """Return `(outputs, hidden)` of the GRU for a tensor of source token ids.

    `outputs` has shape (seq_len, batch, hidden_size).
    """
    return self.gru(self.embedding(input_sequence))

###5.2 Attention Mechanism

In [16]:

class Attention(nn.Module):
  """Dot-product attention over encoder outputs with masked positions suppressed."""

  def __init__(self, hidden_size):
    super().__init__()
    self.hidden_size = hidden_size

  def dot_score(self, hidden_state, encoder_states):
    """Sum the element-wise product of the two tensors over dimension 2."""
    return (hidden_state * encoder_states).sum(dim=2)

  def forward(self, hidden, encoder_outputs, mask):
    """Return softmax-normalised attention weights with a singleton dim inserted at 1.

    Positions where `mask` equals 0 receive a score of -1e5 before the softmax,
    so their weight is driven towards zero.
    """
    # raw scores, with the first two dimensions swapped
    scores = self.dot_score(hidden, encoder_outputs).t()

    # keep the network from attending to masked (<pad>) positions
    scores = scores.masked_fill(mask == 0, -1e5)

    weights = F.softmax(scores, dim=1)
    return weights.unsqueeze(1)

###5.3 Decoder Architecture

In [17]:



## embedding layer:
##    input dimensions = output_size (size of English vocabulary),
##    ouput dimensions = embedding_size

## GRU layer:
##    input dimensions = embedding_size
##    hidden units = hidden_size
##    layers = n_layers
##    output dim = hidden_size

## concat layer:
##    input dimensions = hidden_size * 2
##    output dimensions = hidden_size

## fully Connected layer:
##    input dimensions = hidden_size,
##    ouput dimensions = output_size (size of English vocabulary)

class Decoder(nn.Module):
  """Single-step GRU decoder that attends over the encoder outputs.

  Args:
    embedding_size: dimensionality of the target token embeddings.
    hidden_size: number of hidden units in each GRU layer.
    output_size: number of output classes (size of the English vocabulary).
    n_layers: number of stacked GRU layers.
    dropout: dropout probability handed to the GRU.
  """

  def __init__(self, embedding_size, hidden_size, output_size, n_layers=2, dropout=0.3):
    super().__init__()

    # hyper-parameters, kept for inspection
    self.hidden_size, self.output_size = hidden_size, output_size
    self.n_layers, self.dropout = n_layers, dropout

    # target-side embedding table
    self.embedding = nn.Embedding(output_size, embedding_size)

    # recurrent layer
    self.gru = nn.GRU(embedding_size, hidden_size, n_layers, dropout=dropout)

    # projects [GRU output ; context] back down to hidden_size
    self.concat = nn.Linear(2 * hidden_size, hidden_size)
    # maps the combined vector to one score per vocabulary entry
    self.out = nn.Linear(hidden_size, output_size)
    self.attn = Attention(hidden_size)

  def forward(self, current_token, hidden_state, encoder_outputs, mask):
    """Decode one step.

    Returns:
      (output, hidden_state): un-normalised scores over the output vocabulary
      for this step, and the updated GRU hidden state.
    """
    token_embedding = self.embedding(current_token)

    # advance the GRU by one step
    step_output, hidden_state = self.gru(token_embedding, hidden_state)

    # attention weights over the encoder positions
    weights = self.attn(step_output, encoder_outputs, mask)

    # weighted average of the encoder outputs (batch-first for bmm)
    context = weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

    # combine GRU output and context, then squash
    combined = torch.cat((step_output.squeeze(0), context), 1)
    attentional = torch.tanh(self.concat(combined))

    return self.out(attentional), hidden_state

###5.4 Sequence-to-Sequence Architecture

In [18]:


class seq2seq(nn.Module):
  """Encoder/decoder wrapper.

  With a target sequence, forward() runs teacher-forced decoding over the whole
  target length. With `output_sequence=None` it decodes greedily for at most
  100 steps and stops early when <eos> is predicted.

  Args:
    embedding_size: embedding dimensionality for encoder and decoder.
    hidden_size: GRU hidden size for encoder and decoder.
    vocab_size: size of the target vocabulary (decoder output size).
    device: device on which the work tensors are created.
    pad_idx, eos_idx, sos_idx: indices of the special tokens.
  """
  def __init__(self, embedding_size, hidden_size, vocab_size, device, pad_idx, eos_idx, sos_idx):
    super(seq2seq, self).__init__()

    # Embedding registered on the wrapper itself. forward() never reads it: the
    # Encoder and Decoder each build their own table. It is still part of the
    # state_dict, so removing it would change the checkpoint's keys.
    self.embedding = nn.Embedding(vocab_size, embedding_size)

    # Encoder network
    self.encoder = Encoder(hidden_size,
                            embedding_size,
                            num_layers=2,
                            dropout=0.3)

    # Decoder network
    self.decoder = Decoder(embedding_size,
                            hidden_size,
                            vocab_size,
                            n_layers=2,
                            dropout=0.3)


    # Indices of special tokens and hardware device
    self.pad_idx = pad_idx
    self.eos_idx = eos_idx
    self.sos_idx = sos_idx
    self.device = device

  def create_mask(self, input_sequence):
    # True where the token is not padding; dims swapped so batch comes first.
    # NOTE(review): pad_idx is compared against *source* token ids here, so it
    # must equal the source vocabulary's <pad> index - confirm against caller.
    return (input_sequence != self.pad_idx).permute(1, 0)


  def forward(self, input_sequence, output_sequence):
    """Return per-step scores of shape (steps, batch, vocab_size).

    `input_sequence` (and `output_sequence`, when given) are indexed as
    (token tensor, lengths) pairs. Row 0 of the result is never written and
    stays all zeros.
    """

    # Unpack input_sequence tuple
    input_tokens = input_sequence[0]

    # Unpack output_tokens, or create an empty tensor for text generation
    if output_sequence is None:
      inference = True
      # 100 rows of <sos>: caps generation at 100 steps
      output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
    else:
      inference = False
      output_tokens = output_sequence[0]

    vocab_size = self.decoder.output_size
    batch_size = len(input_sequence[1])
    max_seq_len = len(output_tokens)

    # tensor to store decoder outputs
    outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)

    # pass input sequence to the encoder
    encoder_outputs, hidden = self.encoder(input_tokens)

    # first input to the decoder is the <sos> tokens
    output = output_tokens[0,:]

    # create mask
    mask = self.create_mask(input_tokens)


    # Step through the length of the output sequence one token at a time
    for t in range(1, max_seq_len):
      output = output.unsqueeze(0)

      output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
      outputs[t] = output

      if inference:
        # greedy choice: index of the highest score becomes the next input
        output = output.max(1)[1]
      else:
        # teacher forcing: always feed the ground-truth token
        output = output_tokens[t]

      # If we're in inference mode, keep generating until we produce an
      # <eos> token
      # (.item() needs a single-element tensor, so inference handles one
      # sentence at a time; the returned slice excludes the <eos> step)
      if inference and output.item() == self.eos_idx:
        return outputs[:t]

    return outputs

#6. Train Seq2Seq Model

In [19]:



# extract special tokens
pad_idx = TRG.vocab.stoi['<pad>']
eos_idx = TRG.vocab.stoi['<eos>']
sos_idx = TRG.vocab.stoi['<sos>']

# Size of embedding_dim should match the dim of pre-trained word embeddings!
embedding_dim = 100
hidden_dim = 256
vocab_size = len(TRG.vocab)

In [20]:


model = seq2seq(embedding_dim,
                hidden_dim,
                vocab_size,
                device, pad_idx, eos_idx, sos_idx).to(device)

In [21]:


# print model architecture
model

seq2seq(
  (embedding): Embedding(4004, 100)
  (encoder): Encoder(
    (embedding): Embedding(4002, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(4004, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
    (concat): Linear(in_features=512, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=4004, bias=True)
    (attn): Attention()
  )
)

In [22]:



# Adam optimizer
optimizer = optim.Adam(model.parameters())

# cross entropy loss with softmax
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [23]:


def train(model, iterator, criterion, optimizer):
  """Run one optimisation pass over `iterator` and return the mean batch loss.

  Each batch exposes `rus` and `eng` attributes; the token tensor of the
  target is taken from `batch.eng[0]`.
  """
  # training mode (enables dropout)
  model.train()

  running_loss = 0

  for batch in notebook.tqdm(iterator, total=len(iterator)):
    source, target = batch.rus, batch.eng

    # reset gradients left over from the previous batch
    optimizer.zero_grad()

    # forward pass with teacher forcing
    scores = model(source, target)

    # drop the first time step, then flatten time and batch for the loss
    flat_scores = scores[1:].view(-1, scores.shape[-1])
    flat_targets = target[0][1:].view(-1)

    loss = criterion(flat_scores, flat_targets)

    # back-propagate and update the parameters
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

  return running_loss / len(iterator)

In [24]:


def evaluate(model, iterator, criterion):
  """Return the mean per-batch loss of `model` over `iterator`, without training.

  Args:
    model: the seq2seq model to score.
    iterator: batch iterator whose batches expose `rus` and `eng` attributes.
    criterion: loss taking (scores, target token ids).

  Returns:
    Sum of the batch losses divided by the number of batches.
  """
  # Put the model in evaluation mode (disables dropout)
  model.eval()

  epoch_loss = 0

  # No parameter update happens here, so skip building the autograd graph:
  # the loss values are unchanged, but memory use and run time go down.
  with torch.no_grad():
    for idx, batch in notebook.tqdm(enumerate(iterator), total=len(iterator)):
      input_sequence = batch.rus
      output_sequence = batch.eng

      target_tokens = output_sequence[0]

      # Run the batch through our model
      output = model(input_sequence, output_sequence)

      # Drop the first time step, then flatten for the loss function
      output = output[1:].view(-1, output.shape[-1])
      target_tokens = target_tokens[1:].view(-1)

      loss = criterion(output, target_tokens)

      epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [25]:


# convert an elapsed interval into whole minutes and whole seconds
def epoch_time(start_time, end_time):
  """Return `(minutes, seconds)` elapsed between two timestamps given in seconds.

  Both values are truncated to integers.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover = duration - whole_minutes * 60
  return whole_minutes, int(leftover)

In [26]:





N_EPOCHS = 10

best_valid_loss = float('inf')

# train for N_EPOCHS epochs; checkpoint whenever the validation loss improves
for epoch in range(N_EPOCHS):

  start_time = time.time()
  train_loss = train(model, train_iterator, criterion, optimizer)
  valid_loss = evaluate(model, valid_iterator, criterion)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # keep only the weights with the lowest validation loss seen so far
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'best_model.pt')

  # per-epoch report: timing, then loss and perplexity for both splits
  print(
      f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s',
      f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}',
      f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}',
      sep='\n')

  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 27s
	Train Loss: 3.096 | Train PPL:  22.109
	 Val. Loss: 2.150 |  Val. PPL:   8.586


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 24s
	Train Loss: 1.884 | Train PPL:   6.578
	 Val. Loss: 1.651 |  Val. PPL:   5.214


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 24s
	Train Loss: 1.514 | Train PPL:   4.543
	 Val. Loss: 1.462 |  Val. PPL:   4.314


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 25s
	Train Loss: 1.329 | Train PPL:   3.777
	 Val. Loss: 1.373 |  Val. PPL:   3.945


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 23s
	Train Loss: 1.218 | Train PPL:   3.382
	 Val. Loss: 1.312 |  Val. PPL:   3.715


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 06 | Time: 1m 25s
	Train Loss: 1.138 | Train PPL:   3.121
	 Val. Loss: 1.285 |  Val. PPL:   3.614


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 07 | Time: 1m 25s
	Train Loss: 1.081 | Train PPL:   2.947
	 Val. Loss: 1.256 |  Val. PPL:   3.511


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 08 | Time: 1m 26s
	Train Loss: 1.033 | Train PPL:   2.809
	 Val. Loss: 1.242 |  Val. PPL:   3.464


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 09 | Time: 1m 25s
	Train Loss: 0.997 | Train PPL:   2.710
	 Val. Loss: 1.232 |  Val. PPL:   3.429


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 10 | Time: 1m 24s
	Train Loss: 0.967 | Train PPL:   2.629
	 Val. Loss: 1.225 |  Val. PPL:   3.405


#7. Model Inference

In [27]:


# load saved model weights
path = 'best_model.pt'
# map_location remaps the checkpoint's tensors onto the device selected for this
# session, so weights saved on a GPU runtime also load on a CPU-only one
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

###7.1 Build Inference Function

In [28]:


def translate_sentence(model, sentence):
    """Translate one Russian sentence to English with greedy decoding.

    Args:
        model: trained seq2seq model; its `device` attribute decides where the
            input tensors are placed.
        sentence: the Russian text to translate.

    Returns:
        The predicted English tokens joined by single spaces, or an empty
        string when `sentence` is empty or contains only whitespace.
    """
    # nothing to translate: return early instead of handing the encoder a
    # zero-length sequence
    if not sentence or not sentence.strip():
        return ""

    model.eval()

    # tokenization
    tokenized = nlp_ru(sentence)
    # convert tokens to lowercase
    tokenized = [t.lower_ for t in tokenized]
    # convert tokens to integers
    int_tokenized = [SRC.vocab.stoi[t] for t in tokenized]

    # convert list to tensor: a single-sentence batch, shape (seq_len, 1)
    sentence_length = torch.LongTensor([len(int_tokenized)]).to(model.device)
    tensor = torch.LongTensor(int_tokenized).unsqueeze(1).to(model.device)

    # get predictions; inference needs no gradients
    with torch.no_grad():
        translation_tensor_logits = model((tensor, sentence_length), None)

    # get token index with highest score
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    # convert indices (integers) to tokens
    translation = [TRG.vocab.itos[t] for t in translation_tensor]

    # Row 0 of the model output is the unfilled <sos> slot, so skip it
    translation = translation[1:]
    return " ".join(translation)

In [29]:


sentence = "это новый"
response = translate_sentence(model, sentence)
print(response)

this is a new one


###7.2 Translate Russian Sentences in the Test Dataset

In [36]:
# read test file
# NOTE(review): the unzip cell lists only nmt_data_test.csv and nmt_data.csv, so
# 'nmt_test_translations.csv' must already exist in the working directory
# (presumably written by an earlier notebook, since the sample shown later
# carries a `translations` column) - confirm before a fresh run.
test_df = pd.read_csv('nmt_test_translations.csv')

In [33]:
# attention based translations
attn_translations = [translate_sentence(model, sent) for sent in notebook.tqdm(test_df["rus"])]

  0%|          | 0/35122 [00:00<?, ?it/s]

In [34]:
test_df["attn_translations"] = attn_translations

In [35]:
# check translations
test_df.sample(20)

Unnamed: 0,rus,eng,translations,attn_translations
13286,что вас печалит,what are you sorry about,what are you hiding,what brought you
5922,наконец они приняли решение,at last they came to a decision,finally finally made a decision,they finally made a decision
11175,дети собирают цветы в саду,the kids are picking flowers in the garden,the flowers are in the garden of the garden,the children are <unk> in the garden
23881,мы его попробуем,we'll try it,we 'll try it,we 'll try it
32072,том хотел чтобы я спел,tom wanted me to sing,tom wanted me to sing,tom wanted me to sing
23463,очередь движется очень медленно,the line is moving very slowly,it 's very rare to get up in the summer,the line is very slow to eat slowly
2419,мы живём в доме,we live in a house,we live in the house,we live in a house
33555,ты хочешь сказать я трус,are you calling me a coward,do you want me to come,do you want me to say that
14075,у кого оно есть,who has it,who is it,who does it have
14222,сходи за кофе,go get coffee,go get some coffee,go get some coffee


# Model Deployment using Streamlit

In [37]:
# installing streamlit
!pip install -q streamlit

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [60]:
# creating the script
%%writefile russ_eng_attention.py

import streamlit as st
import re
import time
import math
import random
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from tqdm import notebook

# Set up the device for GPU usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load Spacy models
from spacy.lang.ru import Russian
nlp_ru = Russian()
nlp_en = spacy.load("en_core_web_sm", disable=["parser", "tagger", "ner"])

# Tokenization functions
def tokenize_ru(text):
    """Return the text of every token that `nlp_ru.tokenizer` produces for `text`."""
    doc = nlp_ru.tokenizer(text)
    return [token.text for token in doc]

def tokenize_en(text):
    """Return the text of every token that `nlp_en.tokenizer` produces for `text`."""
    doc = nlp_en.tokenizer(text)
    return [token.text for token in doc]

# Define Fields (same settings as at training time, so the rebuilt vocabularies
# line up with the indices the checkpoint was trained on)
SRC = data.Field(tokenize=tokenize_ru, include_lengths=True, lower=True)
TRG = data.Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', include_lengths=True, lower=True)

fields = [('rus', SRC), ('eng', TRG)]

# Load data: the app only needs it to rebuild the vocabularies
nmt_data = data.TabularDataset(path="nmt_data.csv", format='csv', fields=fields)

# Build vocab
SRC.build_vocab(nmt_data, max_size=4000)
TRG.build_vocab(nmt_data, max_size=4000)

# The train/validation split and the BucketIterators are intentionally not
# created here: nothing in this script consumes them, and Streamlit re-executes
# the whole script on every user interaction.

# Define the Encoder, Attention, Decoder, and Seq2Seq classes (same as provided)

class Encoder(nn.Module):
    """Embed source token ids and run them through a multi-layer GRU.

    The embedding table is sized from the module-level `SRC.vocab`, so the
    Russian vocabulary has to be built before an Encoder is constructed.
    """

    def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3):
        super().__init__()
        # hyper-parameters, kept for inspection
        self.hidden_size, self.embedding_size = hidden_size, embedding_size
        self.num_layers, self.dropout = num_layers, dropout
        # source-side embedding table
        source_vocab_size = len(SRC.vocab)
        self.embedding = nn.Embedding(source_vocab_size, embedding_size)
        # recurrent layer; batch_first is left at its default (sequence-first input)
        self.gru = nn.GRU(input_size=embedding_size,
                          hidden_size=hidden_size,
                          num_layers=num_layers,
                          dropout=dropout)

    def forward(self, input_sequence):
        """Return `(outputs, hidden)` of the GRU for a tensor of source token ids."""
        return self.gru(self.embedding(input_sequence))

class Attention(nn.Module):
    """Dot-product attention over encoder outputs with masked positions suppressed."""

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

    def dot_score(self, hidden_state, encoder_states):
        """Sum the element-wise product of the two tensors over dimension 2."""
        return (hidden_state * encoder_states).sum(dim=2)

    def forward(self, hidden, encoder_outputs, mask):
        """Return softmax-normalised attention weights with a singleton dim inserted at 1."""
        # raw scores, with the first two dimensions swapped
        scores = self.dot_score(hidden, encoder_outputs).t()
        # positions where mask is 0 get a large negative score before the softmax
        scores = scores.masked_fill(mask == 0, -1e5)
        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(1)

class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, embedding_size, hidden_size, output_size, n_layers=2, dropout=0.3):
        super().__init__()
        # hyper-parameters, kept for inspection
        self.hidden_size, self.output_size = hidden_size, output_size
        self.n_layers, self.dropout = n_layers, dropout
        # target-side embedding table and recurrent layer
        self.embedding = nn.Embedding(output_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers, dropout=dropout)
        # projects [GRU output ; context] back down to hidden_size
        self.concat = nn.Linear(2 * hidden_size, hidden_size)
        # maps the combined vector to one score per vocabulary entry
        self.out = nn.Linear(hidden_size, output_size)
        self.attn = Attention(hidden_size)

    def forward(self, current_token, hidden_state, encoder_outputs, mask):
        """Decode one step; return (scores over the vocabulary, new hidden state)."""
        token_embedding = self.embedding(current_token)
        step_output, hidden_state = self.gru(token_embedding, hidden_state)
        # attention weights over the encoder positions
        weights = self.attn(step_output, encoder_outputs, mask)
        # weighted average of the encoder outputs (batch-first for bmm)
        context = weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)
        combined = torch.cat((step_output.squeeze(0), context), 1)
        attentional = torch.tanh(self.concat(combined))
        return self.out(attentional), hidden_state

class seq2seq(nn.Module):
    """Encoder/decoder wrapper.

    With a target sequence, forward() runs teacher-forced decoding over the
    whole target length. With `output_sequence=None` it decodes greedily for at
    most 100 steps and stops early when <eos> is predicted.
    """
    def __init__(self, embedding_size, hidden_size, vocab_size, device, pad_idx, eos_idx, sos_idx):
        super(seq2seq, self).__init__()
        # Registered on the wrapper but never read by forward(); it is still
        # part of the state_dict, so the checkpoint contains a key for it.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.encoder = Encoder(hidden_size, embedding_size, num_layers=2, dropout=0.3)
        self.decoder = Decoder(embedding_size, hidden_size, vocab_size, n_layers=2, dropout=0.3)
        # indices of the special tokens and the device for work tensors
        self.pad_idx = pad_idx
        self.eos_idx = eos_idx
        self.sos_idx = sos_idx
        self.device = device

    def create_mask(self, input_sequence):
        # True where the token is not padding; dims swapped so batch comes first.
        # NOTE(review): pad_idx is compared against *source* token ids here, so
        # it must equal the source vocabulary's <pad> index - confirm.
        return (input_sequence != self.pad_idx).permute(1, 0)

    def forward(self, input_sequence, output_sequence):
        """Return per-step scores of shape (steps, batch, vocab_size); row 0 stays zero."""
        # both arguments are indexed as (token tensor, lengths) pairs
        input_tokens = input_sequence[0]
        if output_sequence is None:
            inference = True
            # 100 rows of <sos>: caps generation at 100 steps
            output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
        else:
            inference = False
            output_tokens = output_sequence[0]
        vocab_size = self.decoder.output_size
        batch_size = len(input_sequence[1])
        max_seq_len = len(output_tokens)
        # buffer for the decoder scores of every step
        outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(input_tokens)
        # first decoder input is the row of <sos> tokens
        output = output_tokens[0, :]
        mask = self.create_mask(input_tokens)
        for t in range(1, max_seq_len):
            output = output.unsqueeze(0)
            output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
            outputs[t] = output
            if inference:
                # greedy choice: index of the highest score becomes the next input
                output = output.max(1)[1]
            else:
                # teacher forcing: always feed the ground-truth token
                output = output_tokens[t]
            # .item() needs a single-element tensor, so inference handles one
            # sentence at a time; the returned slice excludes the <eos> step
            if inference and output.item() == self.eos_idx:
                return outputs[:t]
        return outputs

# Load model
# special-token indices and hyper-parameters used to rebuild the network
pad_idx = TRG.vocab.stoi['<pad>']
eos_idx = TRG.vocab.stoi['<eos>']
sos_idx = TRG.vocab.stoi['<sos>']
embedding_dim = 100
hidden_dim = 256
vocab_size = len(TRG.vocab)
model = seq2seq(embedding_dim, hidden_dim, vocab_size, device, pad_idx, eos_idx, sos_idx).to(device)
# map_location remaps the checkpoint's tensors onto the device chosen above, so
# weights saved on a GPU runtime also load when the app runs on a CPU-only host
model.load_state_dict(torch.load('best_model.pt', map_location=device))

def translate_sentence(model, sentence):
    """Translate one Russian sentence to English with greedy decoding.

    Returns the predicted English tokens joined by single spaces, or an empty
    string when `sentence` is empty or contains only whitespace.
    """
    # nothing to translate: return early instead of handing the encoder a
    # zero-length sequence (e.g. the button was pressed with an empty text box)
    if not sentence or not sentence.strip():
        return ""

    model.eval()
    # tokenize, lower-case and map tokens to vocabulary indices
    tokenized = nlp_ru(sentence)
    tokenized = [t.lower_ for t in tokenized]
    int_tokenized = [SRC.vocab.stoi[t] for t in tokenized]
    # single-sentence batch, shape (seq_len, 1)
    sentence_length = torch.LongTensor([len(int_tokenized)]).to(model.device)
    tensor = torch.LongTensor(int_tokenized).unsqueeze(1).to(model.device)
    # inference needs no gradients
    with torch.no_grad():
        translation_tensor_logits = model((tensor, sentence_length), None)
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    translation = [TRG.vocab.itos[t] for t in translation_tensor]
    # row 0 of the model output is the unfilled <sos> slot, so skip it
    translation = translation[1:]
    return " ".join(translation)



# Streamlit app with enhanced design
import html  # escapes model output before it is embedded in raw HTML below

st.set_page_config(page_title="Russian to English Translator using Attention Mechanism")

st.markdown("""
    <style>

    .title {
        font-size: 2.5rem;
        color: #4CAF50;
    }


    .translate-button {
        background-color: #4CAF50;
        color: white;
        font-size: 1rem;
        padding: 10px 20px;
        border: none;
        border-radius: 5px;
        cursor: pointer;
    }
    </style>
""", unsafe_allow_html=True)

st.markdown('<div class="main">', unsafe_allow_html=True)
st.markdown('<h1 class="title">Russian to English Translator using Attention Mechanism</h1>', unsafe_allow_html=True)
st.markdown('<p class="subtitle">Enter a Russian sentence to translate it to English:</p>', unsafe_allow_html=True)


input_sentence = st.text_input("Russian Sentence")


if st.button("Translate", key="translate_button", help="Click to translate the sentence"):
    if not input_sentence.strip():
        # an empty box would otherwise reach the model as a zero-length input
        st.warning("Please enter a Russian sentence to translate.")
    else:
        translation = translate_sentence(model, input_sentence)
        # The text is injected with unsafe_allow_html=True, so escape it first:
        # tokens such as <unk> would otherwise be parsed as HTML tags and
        # disappear from the rendered result.
        safe_translation = html.escape(translation)
        # (The earlier st.empty() placeholder was written three times in a row;
        # it holds a single element, so only the final '</div>' survived. The
        # two lines below are what the user actually saw.)
        st.markdown("<p style='font-size: 20px;'>Translated Sentence:</p>", unsafe_allow_html=True)
        st.markdown(f"<p style='color: green; font-size: 18px; font-weight: bold;'>{safe_translation}</p>", unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)


Overwriting russ_eng_attention.py


In [41]:
# running the app
!streamlit run russ_eng_attention.py &>/dev/null&

In [42]:
# installing pyngrok
!pip install -q pyngrok

In [43]:


# making the locally-hosted web application publicly accessible
import os
from getpass import getpass

from pyngrok import ngrok

# Never store the auth token in the notebook: read it from the environment, or
# prompt for it without echoing. Any token that has been saved in a shared
# notebook should be treated as leaked and revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get('NGROK_AUTH_TOKEN') or getpass('ngrok auth token: ')
ngrok.set_auth_token(ngrok_auth_token)

# expose the Streamlit server's port through a public tunnel
public_url = ngrok.connect('8501')
public_url



<NgrokTunnel: "https://0e65-34-126-115-159.ngrok-free.app" -> "http://localhost:8501">

In [None]:





# import re
# import time
# import math
# import random

# import numpy as np
# import pandas as pd
# import spacy

# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# import torch.optim as optim
# from torchtext import data

# from tqdm import notebook
# pd.set_option('display.max_colwidth', 200)
# # check GPU availability
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print(device)
# # import Russian spacy model to tokenize Russian text
# from spacy.lang.ru import Russian
# # spacy object for Russian
# nlp_ru = Russian()

# # spacy object for English
# nlp_en = spacy.load("en_core_web_sm", disable = ["parser", "tagger", "ner"])
# ## functions to perform tokenization

# # tokenizes Russian text from a string into a list of tokens
# def tokenize_ru(text):
#   return [tok.text for tok in nlp_ru.tokenizer(text)]

# # tokenizes English text from a string into a list of tokens
# def tokenize_en(text):
#   return [tok.text for tok in nlp_en.tokenizer(text)]
# ## Create Field objects

# # Field object for Russian
# SRC = data.Field(tokenize = tokenize_ru,
#                  include_lengths = True,
#                  lower = True)

# # Field object for English
# TRG = data.Field(tokenize = tokenize_en,
#                  init_token = '<sos>', # "start" token
#                  eos_token = '<eos>', # "end" token
#                  include_lengths = True,
#                  lower = True)

# fields = [('rus', SRC), ('eng', TRG)]
# # importing data from csv
# nmt_data = data.TabularDataset(path="nmt_data.csv", format='csv', fields=fields)
# # build vocabulary for Russian sequences
# SRC.build_vocab(nmt_data, max_size=4000)

# # build vocabulary for English sequences
# TRG.build_vocab(nmt_data, max_size=4000)
# # check size of vocabulary
# len(SRC.vocab), len(TRG.vocab)
# # Split the translation data into training and validation sets
# train_data, val_data = nmt_data.split(split_ratio=0.8)
# # Create a set of iterators for each split
# train_iterator, valid_iterator = data.BucketIterator.splits(
#     (train_data, val_data),
#     batch_size = 64,
#     sort_within_batch = True,
#     sort_key = lambda x:len(x.rus),
#     device = device)
# ## embedding layer:
# ##    input dimensions = size of Russian vocabulary
# ##    output dimensions = embedding_size

# ## GRU layer:
# ##    input dimensions = embedding_size
# ##    hidden units = hidden_size
# ##    layers = num_layers
# ##    output dim = hidden_size

# class Encoder(nn.Module):

#   def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3):

#     super(Encoder, self).__init__()

#     # Basic network params
#     self.hidden_size = hidden_size
#     self.embedding_size = embedding_size
#     self.num_layers = num_layers
#     self.dropout = dropout

#     # Embedding layer for the source (Russian) vocabulary
#     self.embedding = nn.Embedding(len(SRC.vocab), embedding_size)
#     # GRU layer
#     self.gru = nn.GRU(embedding_size, hidden_size,
#                       num_layers=num_layers,
#                       dropout=dropout)

#   def forward(self, input_sequence):

#     # Convert input_sequence to word embeddings
#     embedded = self.embedding(input_sequence)

#     outputs, hidden = self.gru(embedded)

#     # The output of a GRU has shape -> (seq_len, batch, hidden_size)
#     return outputs, hidden
# class Attention(nn.Module):
#   def __init__(self, hidden_size):
#     super(Attention, self).__init__()
#     self.hidden_size = hidden_size


#   def dot_score(self, hidden_state, encoder_states):
#     return torch.sum(hidden_state * encoder_states, dim=2)


#   def forward(self, hidden, encoder_outputs, mask):

#     attn_scores = self.dot_score(hidden, encoder_outputs)

#     # Transpose max_length and batch_size dimensions
#     attn_scores = attn_scores.t()

#     # Apply mask so network does not attend <pad> tokens
#     attn_scores = attn_scores.masked_fill(mask == 0, -1e5)

#     # Return softmax over attention scores
#     return F.softmax(attn_scores, dim=1).unsqueeze(1)


# ## embedding layer:
# ##    input dimensions = output_size (size of English vocabulary),
# ##    output dimensions = embedding_size

# ## GRU layer:
# ##    input dimensions = embedding_size
# ##    hidden units = hidden_size
# ##    layers = n_layers
# ##    output dim = hidden_size

# ## concat layer:
# ##    input dimensions = hidden_size * 2
# ##    output dimensions = hidden_size

# ## fully Connected layer:
# ##    input dimensions = hidden_size,
# ##    output dimensions = output_size (size of English vocabulary)

# class Decoder(nn.Module):
#   def __init__(self, embedding_size, hidden_size, output_size, n_layers=2, dropout=0.3):

#     super(Decoder, self).__init__()

#     # Basic network params
#     self.hidden_size = hidden_size
#     self.output_size = output_size
#     self.n_layers = n_layers
#     self.dropout = dropout
#     self.embedding = nn.Embedding(output_size, embedding_size)

#     self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
#                       dropout=dropout)

#     self.concat = nn.Linear(hidden_size * 2, hidden_size)
#     self.out = nn.Linear(hidden_size, output_size)
#     self.attn = Attention(hidden_size)

#   def forward(self, current_token, hidden_state, encoder_outputs, mask):

#     # convert current_token to word_embedding
#     embedded = self.embedding(current_token)

#     # Pass through GRU
#     gru_output, hidden_state = self.gru(embedded, hidden_state)

#     # Calculate attention weights
#     attention_weights = self.attn(gru_output, encoder_outputs, mask)

#     # Calculate context vector (weighted average)
#     context = attention_weights.bmm(encoder_outputs.transpose(0, 1))

#     # Concatenate  context vector and GRU output
#     gru_output = gru_output.squeeze(0)
#     context = context.squeeze(1)
#     concat_input = torch.cat((gru_output, context), 1)
#     concat_output = torch.tanh(self.concat(concat_input))

#     # Pass concat_output to final output layer
#     output = self.out(concat_output)

#     # Return output and final hidden state
#     return output, hidden_state

# class seq2seq(nn.Module):
#   def __init__(self, embedding_size, hidden_size, vocab_size, device, pad_idx, eos_idx, sos_idx):
#     super(seq2seq, self).__init__()

#     # Embedding layer (NOTE: the encoder and decoder below build their own embeddings and are never given this one)
#     self.embedding = nn.Embedding(vocab_size, embedding_size)

#     # Encoder network
#     self.encoder = Encoder(hidden_size,
#                             embedding_size,
#                             num_layers=2,
#                             dropout=0.3)

#     # Decoder network
#     self.decoder = Decoder(embedding_size,
#                             hidden_size,
#                             vocab_size,
#                             n_layers=2,
#                             dropout=0.3)


#     # Indices of special tokens and hardware device
#     self.pad_idx = pad_idx
#     self.eos_idx = eos_idx
#     self.sos_idx = sos_idx
#     self.device = device

#   def create_mask(self, input_sequence):
#     return (input_sequence != self.pad_idx).permute(1, 0)


#   def forward(self, input_sequence, output_sequence):

#     # Unpack input_sequence tuple
#     input_tokens = input_sequence[0]

#     # Unpack output_tokens, or create an empty tensor for text generation
#     if output_sequence is None:
#       inference = True
#       output_tokens = torch.zeros((100, input_tokens.shape[1])).long().fill_(self.sos_idx).to(self.device)
#     else:
#       inference = False
#       output_tokens = output_sequence[0]

#     vocab_size = self.decoder.output_size
#     batch_size = len(input_sequence[1])
#     max_seq_len = len(output_tokens)

#     # tensor to store decoder outputs
#     outputs = torch.zeros(max_seq_len, batch_size, vocab_size).to(self.device)

#     # pass input sequence to the encoder
#     encoder_outputs, hidden = self.encoder(input_tokens)

#     # first input to the decoder is the <sos> tokens
#     output = output_tokens[0,:]

#     # create mask
#     mask = self.create_mask(input_tokens)


#     # Step through the length of the output sequence one token at a time
#     for t in range(1, max_seq_len):
#       output = output.unsqueeze(0)

#       output, hidden = self.decoder(output, hidden, encoder_outputs, mask)
#       outputs[t] = output

#       if inference:
#         output = output.max(1)[1]
#       else:
#         output = output_tokens[t]

#       # If we're in inference mode, keep generating until we produce an
#       # <eos> token
#       if inference and output.item() == self.eos_idx:
#         return outputs[:t]

#     return outputs

# # extract special tokens
# pad_idx = TRG.vocab.stoi['<pad>']
# eos_idx = TRG.vocab.stoi['<eos>']
# sos_idx = TRG.vocab.stoi['<sos>']

# # Size of embedding_dim should match the dim of pre-trained word embeddings!
# embedding_dim = 100
# hidden_dim = 256
# vocab_size = len(TRG.vocab)

# model = seq2seq(embedding_dim,
#                 hidden_dim,
#                 vocab_size,
#                 device, pad_idx, eos_idx, sos_idx).to(device)
# # print model architecture
# model

# # Adam optimizer
# optimizer = optim.Adam(model.parameters())

# # cross entropy loss with softmax
# criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

# def train(model, iterator, criterion, optimizer):
#   # Put the model in training mode!
#   model.train()

#   epoch_loss = 0

#   for idx, batch in notebook.tqdm(enumerate(iterator), total=len(iterator)):
#     input_sequence = batch.rus
#     output_sequence = batch.eng

#     target_tokens = output_sequence[0]

#     # zero out the gradient for the current batch
#     optimizer.zero_grad()

#     # Run the batch through our model
#     output = model(input_sequence, output_sequence)

#     # Throw it through our loss function
#     output = output[1:].view(-1, output.shape[-1])
#     target_tokens = target_tokens[1:].view(-1)

#     loss = criterion(output, target_tokens)

#     # Perform back-prop and calculate the gradient of our loss function
#     loss.backward()

#     # Update model parameters
#     optimizer.step()

#     epoch_loss += loss.item()

#   return epoch_loss / len(iterator)

# def evaluate(model, iterator, criterion):
#   # Put the model in evaluation mode!
#   model.eval()

#   epoch_loss = 0

#   for idx, batch in notebook.tqdm(enumerate(iterator), total=len(iterator)):
#     input_sequence = batch.rus
#     output_sequence = batch.eng

#     target_tokens = output_sequence[0]

#     # Run the batch through our model
#     output = model(input_sequence, output_sequence)

#     # Throw it through our loss function
#     output = output[1:].view(-1, output.shape[-1])
#     target_tokens = target_tokens[1:].view(-1)

#     loss = criterion(output, target_tokens)

#     epoch_loss += loss.item()

#   return epoch_loss / len(iterator)

# # function to compute time taken by an epoch (in mm:ss)
# def epoch_time(start_time, end_time):
#   elapsed_time = end_time - start_time
#   elapsed_mins = int(elapsed_time / 60)
#   elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
#   return elapsed_mins, elapsed_secs




# N_EPOCHS = 10

# best_valid_loss = float('inf')

# # start model training
# for epoch in range(N_EPOCHS):

#   start_time = time.time()

#   train_loss = train(model, train_iterator, criterion, optimizer)
#   valid_loss = evaluate(model, valid_iterator, criterion)

#   end_time = time.time()

#   epoch_mins, epoch_secs = epoch_time(start_time, end_time)

#   # compare validation loss
#   if valid_loss < best_valid_loss:
#     best_valid_loss = valid_loss
#     torch.save(model.state_dict(), 'best_model.pt')

#   print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
#   print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
#   print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
# # load saved model weights
# path = 'best_model.pt'
# model.load_state_dict(torch.load(path))

# def translate_sentence(model, sentence):
#     model.eval()

#     # tokenization
#     tokenized = nlp_ru(sentence)
#     # convert tokens to lowercase
#     tokenized = [t.lower_ for t in tokenized]
#     # convert tokens to integers
#     int_tokenized = [SRC.vocab.stoi[t] for t in tokenized]

#     # convert list to tensor
#     sentence_length = torch.LongTensor([len(int_tokenized)]).to(model.device)
#     tensor = torch.LongTensor(int_tokenized).unsqueeze(1).to(model.device)

#     # get predictions
#     translation_tensor_logits = model((tensor, sentence_length), None)

#     # get token index with highest score
#     translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
#     # convert indices (integers) to tokens
#     translation = [TRG.vocab.itos[t] for t in translation_tensor]

#     # Start at the first index.  We don't need to return the <sos> token...
#     translation = translation[1:]
#     return " ".join(translation)

# sentence = "это новый"
# response = translate_sentence(model, sentence)
# print(response)