In [10]:
import re
import time
import math
import random
import numpy as np
import pandas as pd
import spacy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data

from tqdm import notebook
pd.set_option('display.max_colwidth', 200)

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'torchtext.legacy'

In [2]:
# dependency for spaCy's Russian language support (pymorphy2 morphological analyzer)
!pip3 install pymorphy2



In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


# Создаем Field объекты

In [4]:
# import spaCy's English language class (a blank pipeline, used only for tokenization)
from spacy.lang.en import English

In [5]:
# spaCy object for English: a blank English pipeline, used only for its tokenizer
nlp_en = English()

# spaCy object for Russian: a blank Russian pipeline, used only for its tokenizer.
# Russian language data is used here so that Russian text is not split with the
# English-specific tokenizer rules of an English model such as en_core_web_sm.
nlp_ru = spacy.blank("ru")

In [6]:
## functions to perform tokenization

# Russian tokenization: string in, list of token strings out
def tokenize_ru(text):
  """Split `text` with `nlp_ru.tokenizer` and return the text of each token as a list."""
  token_texts = []
  for token in nlp_ru.tokenizer(text):
    token_texts.append(token.text)
  return token_texts

# English tokenization: string in, list of token strings out
def tokenize_en(text):
  """Split `text` with `nlp_en.tokenizer` and return the text of each token as a list."""
  token_texts = []
  for token in nlp_en.tokenizer(text):
    token_texts.append(token.text)
  return token_texts

In [9]:
## Create Field objects
# NOTE(review): `data.Field` is the legacy torchtext API. This cell raised
# "module 'torchtext.data' has no attribute 'Field'" in the recorded run, so the
# installed torchtext no longer ships it — pin a torchtext release that does.

# Field for the source language (Russian). It feeds the encoder, so no
# start/end markers are added.
SRC = data.Field(tokenize = tokenize_ru,
                 include_lengths = True,
                 lower = True)

# Field for the target language (English). Sequences are wrapped in start/end
# markers because the decoder generates them token by token.
TRG = data.Field(tokenize = tokenize_en,
                 init_token = '<sos>', # "start of sequence" token
                 eos_token = '<eos>', # "end of sequence" token
                 include_lengths = True,
                 lower = True)

# Fields are bound to the file's columns by position. The attribute names must be
# `eng` and `rus`: the iterator sort key and the train/evaluate loops read
# `batch.rus` (model input) and `batch.eng` (model target).
# NOTE(review): assumes column 1 holds English and column 2 holds Russian, as the
# original field order implied — confirm against the data file.
fields = [('eng', TRG), ('rus', SRC)]

AttributeError: module 'torchtext.data' has no attribute 'Field'

# Подготовка данных

In [9]:
# load the tab-separated (tsv) parallel corpus; each column is parsed with the
# corresponding entry of `fields`
nmt_data = data.TabularDataset(path="../Rus", format='tsv', fields=fields)

In [10]:
# build the source-side vocabulary (SRC field), capped at 4000 tokens
# NOTE(review): both vocabularies are built on the full dataset, i.e. before the
# train/validation split performed further down.
SRC.build_vocab(nmt_data, max_size=4000)

# build the target-side vocabulary (TRG field), capped at 4000 tokens
TRG.build_vocab(nmt_data, max_size=4000)

In [11]:
# vocabulary sizes as (source, target); the recorded values exceed max_size
# because special tokens are stored in addition to the 4000 regular tokens
len(SRC.vocab), len(TRG.vocab)

(4002, 4004)

# Создаем загрузчики данных

In [12]:
# Split the parallel corpus into two parts: 80% for training and the remaining
# 20% for validation (no separate test split is created here).
# NOTE(review): no random_state is passed, so the split presumably differs
# between kernel runs — confirm if reproducibility matters.
train_data, val_data = nmt_data.split(split_ratio=0.8)

In [13]:
# Build one bucketed batch iterator per split (training, validation).
# Examples are ordered by the length of their `rus` field, and each batch is
# sorted by that same key.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size = 64,
    device = device,
    sort_key = lambda example: len(example.rus),
    sort_within_batch = True)

# Архитектура Encoder

In [15]:
class Encoder(nn.Module):
  """Multi-layer GRU encoder over embedded source tokens.

  Args:
    hidden_size: size of the GRU hidden state.
    embedding_size: size of each token embedding vector.
    num_layers: number of stacked GRU layers.
    dropout: dropout probability applied between GRU layers.
    input_size: number of rows in the embedding table (source vocabulary
      size). When omitted, falls back to `len(SRC.vocab)` from the notebook
      scope, which is what the original implementation always used.
  """

  def __init__(self, hidden_size, embedding_size, num_layers=2, dropout=0.3,
               input_size=None):

    super(Encoder, self).__init__()

    # Keep the notebook default, but let callers supply the vocabulary size so
    # the class does not depend on a global Field object
    if input_size is None:
      input_size = len(SRC.vocab)

    # Basic network params
    self.hidden_size = hidden_size
    self.embedding_size = embedding_size
    self.num_layers = num_layers
    self.dropout = dropout
    self.input_size = input_size

    # Source-token embedding table (the Decoder builds its own embedding)
    self.embedding = nn.Embedding(input_size, embedding_size)
    # GRU layer (sequence-first layout, since batch_first is left at its default)
    self.gru = nn.GRU(embedding_size, hidden_size,
                      num_layers=num_layers,
                      dropout=dropout)

  def forward(self, input_sequence):
    """Encode a batch of token-index sequences.

    Args:
      input_sequence: integer tensor of token indices, laid out as
        (seq_len, batch).

    Returns:
      (outputs, hidden): the GRU outputs for every time step, shaped
      (seq_len, batch, hidden_size), and the final hidden state of each layer.
    """
    # Convert input_sequence to word embeddings
    embedded = self.embedding(input_sequence)

    outputs, hidden = self.gru(embedded)

    # The output of a GRU has shape -> (seq_len, batch, hidden_size)
    return outputs, hidden

# Механизм Attention

In [17]:
class Attention(nn.Module):
  """Dot-product attention over encoder states, with masked positions suppressed."""

  def __init__(self, hidden_size):
    super(Attention, self).__init__()
    self.hidden_size = hidden_size

  def dot_score(self, hidden_state, encoder_states):
    """Multiply the two tensors element-wise (with broadcasting) and sum over dim 2."""
    elementwise = hidden_state * encoder_states
    return torch.sum(elementwise, dim=2)

  def forward(self, hidden, encoder_outputs, mask):
    """Return attention weights with a singleton dimension inserted at position 1.

    Positions where `mask` equals 0 receive a large negative score, so they get
    (near-)zero weight after the softmax.
    """
    # Score every encoder position, then swap the two score dimensions so they
    # line up with `mask`
    scores = self.dot_score(hidden, encoder_outputs).t()

    # Suppress masked (e.g. <pad>) positions before normalising
    masked_positions = mask == 0
    scores = scores.masked_fill(masked_positions, -1e5)

    # Normalise over dim 1 and add the singleton dimension
    weights = F.softmax(scores, dim=1)
    return weights.unsqueeze(1)

# Архитектура Decoder

In [18]:
class Decoder(nn.Module):
  """GRU decoder step that attends over the encoder outputs before predicting a token."""

  def __init__(self, embedding_size, hidden_size, output_size, n_layers=2, dropout=0.3):
    super(Decoder, self).__init__()

    # Hyper-parameters kept on the module for reference
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.n_layers = n_layers
    self.dropout = dropout

    # Target-token embeddings feeding a stacked GRU
    self.embedding = nn.Embedding(output_size, embedding_size)
    self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
                      dropout=dropout)

    # Merge [GRU output ; context] down to hidden_size, then project to
    # one score per output token
    self.concat = nn.Linear(hidden_size * 2, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)
    self.attn = Attention(hidden_size)

  def forward(self, current_token, hidden_state, encoder_outputs, mask):
    """Run one decoding step.

    Returns a tuple (output, hidden_state): the scores produced by the final
    linear layer and the updated GRU hidden state.
    NOTE(review): the squeeze(0) below suggests `current_token` holds a single
    time step, i.e. shape (1, batch) — confirm against the caller.
    """
    # Embed the current token(s) and advance the GRU
    token_vectors = self.embedding(current_token)
    step_output, hidden_state = self.gru(token_vectors, hidden_state)

    # Attention weights over encoder positions, then their weighted average
    weights = self.attn(step_output, encoder_outputs, mask)
    context = torch.bmm(weights, encoder_outputs.transpose(0, 1)).squeeze(1)

    # Join the GRU output with the context vector and squash through tanh
    combined = torch.cat((step_output.squeeze(0), context), 1)
    attentional = torch.tanh(self.concat(combined))

    # Project to output scores; return them with the new hidden state
    return self.out(attentional), hidden_state

# Архитектура Sequence-to-Sequence

# Тренировка Seq2Seq модели

In [21]:
# look up the indices of the special tokens in the target vocabulary
pad_idx = TRG.vocab.stoi['<pad>']
eos_idx = TRG.vocab.stoi['<eos>']
sos_idx = TRG.vocab.stoi['<sos>']

# Model hyper-parameters.
# NOTE(review): no pre-trained word embeddings are loaded in the visible cells;
# if some are added later, embedding_dim must match their dimensionality.
embedding_dim = 100
hidden_dim = 256
# number of entries in the target vocabulary
vocab_size = len(TRG.vocab)

In [22]:
# Instantiate the sequence-to-sequence model and move it to the selected device.
# NOTE(review): the `seq2seq` class is not defined in any cell visible here (the
# "Sequence-to-Sequence architecture" section is empty) — make sure it is defined
# before this cell runs, otherwise a fresh kernel raises NameError.
model = seq2seq(embedding_dim,
                hidden_dim, 
                vocab_size, 
                device, pad_idx, eos_idx, sos_idx).to(device)

In [23]:
# display the model's module tree (the cell's last expression is rendered)
model

seq2seq(
  (embedding): Embedding(4004, 100)
  (encoder): Encoder(
    (embedding): Embedding(4002, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
  )
  (decoder): Decoder(
    (embedding): Embedding(4004, 100)
    (gru): GRU(100, 256, num_layers=2, dropout=0.3)
    (concat): Linear(in_features=512, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=4004, bias=True)
    (attn): Attention()
  )
)

In [24]:
# Adam optimizer over all model parameters; no hyper-parameters are passed, so
# the optimizer's defaults are used
optimizer = optim.Adam(model.parameters())

# cross-entropy loss (applies softmax internally); positions whose target is the
# <pad> index are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [25]:
def train(model, iterator, criterion, optimizer):
  """Run one training epoch over `iterator` and return the mean per-batch loss.

  Each batch must expose `rus` (model input) and `eng` (model target); `eng` is
  indexed with [0] to obtain the target token tensor.
  """
  # Enable training-mode behaviour
  model.train()

  running_loss = 0
  n_batches = len(iterator)

  for batch in notebook.tqdm(iterator, total=n_batches):
    source = batch.rus
    target = batch.eng

    # Clear gradients left over from the previous batch
    optimizer.zero_grad()

    # Forward pass
    scores = model(source, target)

    # Drop position 0 along the first dimension of both tensors, then flatten:
    # scores -> (N, last_dim), targets -> (N,)
    flat_scores = scores[1:].view(-1, scores.shape[-1])
    flat_targets = target[0][1:].view(-1)

    loss = criterion(flat_scores, flat_targets)

    # Backward pass and parameter update
    loss.backward()
    optimizer.step()

    running_loss += loss.item()

  return running_loss / n_batches

In [26]:
def evaluate(model, iterator, criterion):
  """Compute the mean per-batch loss over `iterator` without updating the model.

  Args:
    model: callable as `model(input_sequence, output_sequence)`.
    iterator: yields batches exposing `rus` (model input) and `eng` (model
      target); `eng` is indexed with [0] to obtain the target token tensor.
    criterion: loss function applied to the flattened scores and targets.

  Returns:
    The sum of the batch losses divided by the number of batches.

  NOTE(review): the target sequence is still passed to the model, exactly as in
  training; how the model uses it is decided inside the model, which is not
  visible here.
  """
  # Put the model in evaluation mode
  model.eval()

  epoch_loss = 0

  # Gradients are not needed for evaluation; disabling them avoids building the
  # autograd graph for every batch
  with torch.no_grad():
    for batch in notebook.tqdm(iterator, total=len(iterator)):
      input_sequence = batch.rus
      output_sequence = batch.eng

      target_tokens = output_sequence[0]

      # Run the batch through our model
      output = model(input_sequence, output_sequence)

      # Drop position 0 along the first dimension, then flatten for the loss
      output = output[1:].view(-1, output.shape[-1])
      target_tokens = target_tokens[1:].view(-1)

      loss = criterion(output, target_tokens)

      epoch_loss += loss.item()

  return epoch_loss / len(iterator)

In [27]:
# Convert a pair of timestamps (in seconds) into whole minutes plus leftover whole seconds
def epoch_time(start_time, end_time):
  """Return (minutes, seconds) elapsed between two timestamps, each truncated to an int."""
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [28]:
# number of full passes over the training iterator
N_EPOCHS = 30

# lowest validation loss seen so far; starts at +inf so the first epoch always
# counts as an improvement
best_valid_loss = float('inf')

# start model training
# NOTE: in the recorded run the validation loss was lowest at epoch 12 (1.199)
# and drifted upwards afterwards while the training loss kept falling.
for epoch in range(N_EPOCHS):

  start_time = time.time()

  # one training pass followed by one validation pass
  train_loss = train(model, train_iterator, criterion, optimizer)
  valid_loss = evaluate(model, valid_iterator, criterion)

  end_time = time.time()

  # wall-clock duration of this epoch (training + validation)
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  # checkpoint the weights only when validation loss improves, so
  # 'best_model.pt' always holds the best-validating parameters
  if valid_loss < best_valid_loss:
    best_valid_loss = valid_loss
    torch.save(model.state_dict(), 'best_model.pt')

  print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss: {train_loss:.3f}')
  print(f'\t Val. Loss: {valid_loss:.3f}')

  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 01 | Time: 1m 23s
	Train Loss: 3.055
	 Val. Loss: 2.146


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 02 | Time: 1m 22s
	Train Loss: 1.867
	 Val. Loss: 1.641


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 03 | Time: 1m 23s
	Train Loss: 1.499
	 Val. Loss: 1.452


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 04 | Time: 1m 23s
	Train Loss: 1.316
	 Val. Loss: 1.358


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 05 | Time: 1m 23s
	Train Loss: 1.203
	 Val. Loss: 1.300


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 06 | Time: 1m 23s
	Train Loss: 1.123
	 Val. Loss: 1.267


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 07 | Time: 1m 24s
	Train Loss: 1.064
	 Val. Loss: 1.244


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 08 | Time: 1m 22s
	Train Loss: 1.017
	 Val. Loss: 1.235


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 09 | Time: 1m 22s
	Train Loss: 0.981
	 Val. Loss: 1.225


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 10 | Time: 1m 21s
	Train Loss: 0.948
	 Val. Loss: 1.213


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 11 | Time: 1m 22s
	Train Loss: 0.921
	 Val. Loss: 1.207


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 12 | Time: 1m 22s
	Train Loss: 0.899
	 Val. Loss: 1.199


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 13 | Time: 1m 23s
	Train Loss: 0.878
	 Val. Loss: 1.205


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 14 | Time: 1m 23s
	Train Loss: 0.862
	 Val. Loss: 1.203


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 15 | Time: 1m 24s
	Train Loss: 0.849
	 Val. Loss: 1.202


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 16 | Time: 1m 22s
	Train Loss: 0.833
	 Val. Loss: 1.202


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 17 | Time: 1m 23s
	Train Loss: 0.821
	 Val. Loss: 1.207


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 18 | Time: 1m 23s
	Train Loss: 0.811
	 Val. Loss: 1.205


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 19 | Time: 1m 23s
	Train Loss: 0.801
	 Val. Loss: 1.211


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 20 | Time: 1m 23s
	Train Loss: 0.793
	 Val. Loss: 1.213


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 21 | Time: 1m 24s
	Train Loss: 0.784
	 Val. Loss: 1.211


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 22 | Time: 1m 24s
	Train Loss: 0.776
	 Val. Loss: 1.213


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 23 | Time: 1m 24s
	Train Loss: 0.770
	 Val. Loss: 1.215


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 24 | Time: 1m 23s
	Train Loss: 0.762
	 Val. Loss: 1.216


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 25 | Time: 1m 23s
	Train Loss: 0.758
	 Val. Loss: 1.219


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 26 | Time: 1m 23s
	Train Loss: 0.752
	 Val. Loss: 1.226


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 27 | Time: 1m 23s
	Train Loss: 0.746
	 Val. Loss: 1.226


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 28 | Time: 1m 24s
	Train Loss: 0.743
	 Val. Loss: 1.227


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 29 | Time: 1m 23s
	Train Loss: 0.737
	 Val. Loss: 1.233


  0%|          | 0/2339 [00:00<?, ?it/s]

  0%|          | 0/585 [00:00<?, ?it/s]

Epoch: 30 | Time: 1m 23s
	Train Loss: 0.734
	 Val. Loss: 1.231


# Сохранение модели

In [29]:
# load the checkpoint written by the training loop
path = 'best_model.pt'
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved on a GPU can also be restored on a CPU-only machine
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

In [30]:
def translate_sentence(model, sentence):
    """Translate one source sentence with the trained model.

    Args:
        model: the trained model; must expose `model.device` and accept
            `((token_tensor, lengths), None)` as arguments.
        sentence: raw source-language string (tokenized with `nlp_ru`).

    Returns:
        The predicted tokens joined by single spaces, with the first predicted
        position dropped. An empty string is returned when the sentence
        contains no tokens.
    """
    model.eval()

    # Tokenize with nlp_ru's tokenizer only (the same call tokenize_ru makes);
    # no other pipeline component is needed. Lowercase each token.
    tokenized = [t.lower_ for t in nlp_ru.tokenizer(sentence)]

    # Nothing to translate: avoid feeding a zero-length sequence to the model
    if not tokenized:
        return ""

    # convert tokens to integers using the source vocabulary
    int_tokenized = [SRC.vocab.stoi[t] for t in tokenized]

    # build a one-sentence batch: lengths tensor of size 1, tokens as a column
    sentence_length = torch.LongTensor([len(int_tokenized)]).to(model.device)
    tensor = torch.LongTensor(int_tokenized).unsqueeze(1).to(model.device)

    # get predictions; gradients are not needed at inference time
    with torch.no_grad():
        translation_tensor_logits = model((tensor, sentence_length), None)

    # get token index with highest score at every output position
    translation_tensor = torch.argmax(translation_tensor_logits.squeeze(1), 1)
    # convert indices (integers) to tokens of the target vocabulary
    translation = [TRG.vocab.itos[t] for t in translation_tensor.tolist()]

    # Skip the first output position: it corresponds to <sos>, which is not
    # part of the translation
    return " ".join(translation[1:])

In [31]:
# quick sanity check: translate one short Russian phrase and print the result
sentence = "это новый"
response = translate_sentence(model, sentence)
print(response)

is it new


# Перевод в Test 

In [32]:
# read the file to translate into a DataFrame (its "rus" column is used below)
# NOTE(review): the path is hard-coded to a Kaggle-style input directory
test_df = pd.read_csv('../input/englishrussiansentencepairs/data/translation.csv')

In [33]:
# translate every sentence of the "rus" column with the attention model,
# one sentence at a time, showing a progress bar
attn_translations = [translate_sentence(model, sent) for sent in notebook.tqdm(test_df["rus"])]

  0%|          | 0/46668 [00:00<?, ?it/s]

In [1]:
# store the model's translations as a new column; the cell that builds
# `attn_translations` must have run in the same kernel session first
# (the recorded NameError came from running this cell on a fresh kernel)
test_df["attn_translations"] = attn_translations

NameError: name 'attn_translations' is not defined

In [None]:
# display 20 randomly sampled rows to inspect the translations
test_df.sample(20)