# HW1: seq2seq nmt

**Homework Goals**

1. Get familiar with text data preparation
2. Learn to work with RNN
3. Train the model to translate `en-->ru`.



In [392]:
import os
import re
import random
import logging
import numpy as np
import unicodedata
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from itertools import islice
from collections import defaultdict
from typing import Union, Tuple, List, Dict
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import warnings
warnings.filterwarnings("ignore")

## Naive way of texts representation:

0. Normalize spelling
1. Filter out all special characters
2. Split by spaces, do *naive tokenization*

In [2]:
class SeqPreproc:
    def __init__(self) -> None:
        self.raw_alphabet = set()
        self.alphabet = set()
    
    @staticmethod
    def normalize(seq: str):
        return "".join(char for char in unicodedata.normalize('NFD', seq) if unicodedata.category(char) != 'Mn')

    def preprocess(self, seq: str, add_sym: bool = True) -> str:
        # adding raw symbols
        if add_sym:
            self.raw_alphabet.update(seq)
        
        # clean up sequence
        seq = self.normalize(seq.lower().strip())
        seq = re.sub(r"[^a-zа-я?.,!]+", " ", seq)
        seq = re.sub(r"([.!?])", r" \1", seq)
        
        if add_sym:
            self.alphabet.update(seq)
        return seq


with open("eng-rus.txt", mode="r", encoding="utf-8") as f:
    sp = SeqPreproc()
    pairs = [tuple(map(sp.preprocess, line.split("\t"))) for line in f.readlines()]

print(f"Alphabet before preprocessing (size - {len(sp.raw_alphabet)}):")
print(*sorted(sp.raw_alphabet), "\n")
print(f"Alphabet after preprocessing (size - {len(sp.alphabet)}):")
print(*sorted(sp.alphabet), "\n")
print("Pairs (few examples):")
print(*pairs[:10])
print(f"Total pairs qantity: {len(pairs)}")

Alphabet before preprocessing (size - 174):

   ! " $ % & ' ( ) + , - . / 0 1 2 3 4 5 6 7 8 9 : ; ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z   « ° º » ã ç é ê î ï ó ö ú ü ́ Ё А Б В Г Д Е Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ь Э Ю Я а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я ё ׁ ​ – — ― ‘ ’ …   ‽ ₂ € № 

Alphabet after preprocessing (size - 62):
  ! , . ? a b c d e f g h i j k l m n o p q r s t u v w x y z а б в г д е ж з и к л м н о п р с т у ф х ц ч ш щ ъ ы ь э ю я 

Pairs (few examples):
('go .', 'иди .') ('go .', 'идите .') ('hi .', 'здравствуите .') ('hi .', 'привет !') ('hi .', 'хаи .') ('hi .', 'здрасте .') ('hi .', 'здорово !') ('run !', 'беги !') ('run !', 'бегите !') ('run .', 'беги !')
Total pairs qantity: 336666


Each word will be assigned a number + we will need special tokens for the beginning and end of the sequence and for unknown words.
`<SOS>, <EOS>, <UNK>`

We have two languages, to work with each we need functions for translating from words to numbers and vice versa.

It is proposed to implement these functions as dictionaries. Allocate the first 4 numbers for special tokens

**(1 point)** Implement the dictionary building function, the function takes a list of strings (normalized sentences, can be splited by spaces) as input. Organize the dictionary in a reasonable way so that rare words can be thrown out if necessary.

In [5]:
COMMON_TOKENS = ['PAD', 'SOS', 'EOS', 'UNK']


def build_vocabs(sents: tuple, max_size: int = 1000, special_tokens: list = None) -> Tuple[dict]:
    vocab = dict()
    
    for seq in sents:
        for token in seq.split():
            vocab[token] = vocab.get(token, 0) + 1
            
    vocab = {token: qty for token, qty in sorted(vocab.items(), key=lambda tup: tup[1], reverse=True)}
    tokens = special_tokens.copy() if special_tokens else list()
    tokens.extend(list(islice(vocab.keys(), max_size - len(special_tokens))) )
    tok2idx, idx2tok = \
        dict(zip(tokens, range(len(tokens)))), dict(zip(range(len(tokens)), tokens))
    return tok2idx, idx2tok


eng, rus = list(zip(*pairs))
rus2idx, idx2rus = build_vocabs(rus, max_size=10000, special_tokens=COMMON_TOKENS)
eng2idx, idx2eng = build_vocabs(eng, max_size=5000 , special_tokens=COMMON_TOKENS)

print(len(rus2idx), len(idx2rus))
print(len(eng2idx), len(idx2eng))

10000 10000
5000 5000


In [6]:
def sentence2idx(seq: str, tok2idx: dict) -> list():
    """
    Takes sentence as input and returns sequence of tokens indexes
    """
    tokens = SeqPreproc().preprocess(seq=seq).split()
    unk = tok2idx.get("UNK")
    return [tok2idx.get("SOS")] + [tok2idx.get(token, unk) for token in tokens] + [tok2idx.get("EOS")]


def idx2sentence(seq: list, idx2tok: dict) -> str:
    """
    Takes sequence of tokens indexes as input and returns sentence
    """
    return " ".join(idx2tok.get(idx) for idx in seq)

# check the consistency of the transformations
x = sentence2idx('Привет мир!', rus2idx)
print(x)
print(idx2sentence(seq=x, idx2tok=idx2rus))

x = sentence2idx('Hello world!', eng2idx)
print(x)
print(idx2sentence(seq=x, idx2tok=idx2eng))

[1, 2539, 1264, 83, 2]
SOS привет мир ! EOS
[1, 1960, 439, 174, 2]
SOS hello world ! EOS


## Dealing with arbitrary length sequences in pytorch

We need to be able to generate batches of `[bs, 1, seq_len]` tensors.
But in our dataset, the samples are of different lengths:

- we could cut everything down to the minimum length
- padd to maximum length
- choose some average length

**(1 point)** Split the dataset on train and validate:

In [105]:
# make a dataset with encoded pairs:
class EngRusDataset(Dataset):
    def __init__(self, pairs: List[tuple], pad_to: int = None, pad_value: int = 0, 
                 pad_left: bool = False) -> None:
        self.pairs = pairs
        self.pad_to = pad_to
        self.pad_value = pad_value
        self.pad_left = pad_left
    
    def __len__(self) -> int:
        return len(self.pairs)
    
    def transform(self, seq: list) -> torch.tensor:
        if self.pad_to is not None and len(seq) != self.pad_to:
            if len(seq) > self.pad_to:
                seq = seq[:self.pad_to]
            else:
                n_pads = self.pad_to - len(seq)
                seq = [self.pad_value]*n_pads*self.pad_left + \
                      seq + \
                      [self.pad_value]*n_pads*(not self.pad_left)       
        return torch.tensor(seq, dtype=torch.int)
        
    def __getitem__(self, item: int) -> Dict:
        eng, rus = self.pairs[item]
        eng, rus = self.transform(seq=eng), self.transform(seq=rus)
        return dict(
            eng=eng,
            rus=rus,
        )

encoded = []
for eng, rus in tqdm(pairs):
    a = sentence2idx(eng, eng2idx)
    b = sentence2idx(rus, rus2idx)
    encoded.append((a, b))


train_pairs, eval_pairs = train_test_split(encoded, train_size=0.8, random_state=42, shuffle=True)
trainset = EngRusDataset(pairs=train_pairs, pad_to=8, pad_left=True)
evalset = EngRusDataset(pairs=eval_pairs, pad_to=8, pad_left=True)

print("Train size:", trainset.__len__(), "\nEval size:", evalset.__len__())

HBox(children=(FloatProgress(value=0.0, max=336666.0), HTML(value='')))


Train size: 269332 
Eval size: 67334


Let's build a naive DataLoader and check how it makes batches:

In [106]:
trainloader = DataLoader(trainset, batch_size=8, shuffle=True)
it = iter(trainloader)

In [108]:
batch = next(it)['eng']
batch

tensor([[   0,    1,    5,  140,   95,  381,    4,    2],
        [   1,  131,  154,  476, 3074,   48,  491,  175],
        [   0,    1,   22,   11,  472,  297,    4,    2],
        [   1,    5,   41,    5,   49,  111,   13,    4],
        [   1,    6,   86,   11,   29,   23,   28,  319],
        [   1,    6,  255,   19,  311,   12,  547,    4],
        [   0,    1,    5,  222,   11,  911,    4,    2],
        [   1,    5,   22,   11,  259,   45,   63,   91]], dtype=torch.int32)

In my case, the result was:
```
[tensor([1, 1, 1, 1, 1, 1, 1, 1]),
 tensor([ 6,  7,  6, 15,  5,  6,  5, 62]),
 tensor([ 48,  34,  83,   7,  32, 221,  22,  43]),
 tensor([  5, 143,  37,  36, 129,  12,  11,  66]),
 tensor([  73, 1258,  279,    8,    6,  555,   41,   10]),
 tensor([  8, 140,   8, 628,  20,  96,  13, 270]),
 tensor([  47,    4,   15,   18,   55,  269,    6, 1287]),
 tensor([ 58,   2,  13, 140, 193, 140, 171, 140])]
```

What's weird here?
1. This is not a tensor, but a list of tensors. Accordingly, when iterating over zero dimension (`batch[i, :]`), we will get not an i-example, but i-tokens for all examples in the batch. This is not a problem, but different from the expected behavior.
2. Only one example ends with `<EOS>` (2), the others are cut off to match its length. And this is a problem.

We would like to padd all examples to the maximum length in the batch.
But at the stage of preparing the example (in the `__getitem__` function), we do not know the batch neighbors!
In order to change the batch merging logic, we need to write our own `collate_fn` function in the DataLoader constructor:

```
def collate_fn(samples):
    # samples -- list of dictionaries samples
    <...>
    return batch
```

**(1 point)** Write a `collate_fn` function that padds _correctly_ rus and eng sequences and merges them into batches, where `batch[i, :]` returns the tokens for the `i` example.

Expected output (for a sequence with left padding):

```
tensor([[   1,   10, 3429,  405,  113,  676,   10, 1031,  140,    4,    2],
        [   0,    1,   57,   18,   23,   19,   61,    7,  140,    4,    2],
        [   0,    0,    0,    1,   16,   17, 1131,  416,  140,    4,    2],
        [   0,    0,    0,    1,   13,  465,   75,  197,  140,    4,    2],
        [   0,    0,    0,    1,    6,  302,   13,  144,  140,    4,    2],
        [   0,    1,    6,   59,  205,  167,    8,   15,  140,    4,    2],
        [   0,    0,    0,    0,    1,    6,   14,  678,  140,    4,    2],
        [   0,    0,    1,    5,   29,   67,    6,   14,  140,    4,    2]])
```

In [349]:
class EngRusCollate:
    def __init__(self, padding_value: int = 0, batch_first: bool = True) -> None:
        self.padding_value = padding_value
        self.batch_first = batch_first
        
    def __call__(self, batch: List[dict]) -> Dict[str, torch.tensor]:
        # getting max length
        data = dict(
                eng=list(),
                rus=list()
            )
        
        for seq in batch:
            data["eng"].append(torch.flip(seq.get("eng"), dims=(0, )))
            data["rus"].append(seq.get("rus"))
        return {k: torch.flip(pad_sequence(v, batch_first=self.batch_first, 
                                           padding_value=self.padding_value), dims=(1, )) \
                if k =="eng" else \
                   pad_sequence(v, batch_first=self.batch_first, 
                                padding_value=self.padding_value).to(torch.int64) for k, v in data.items()}
    
    
trainset = EngRusDataset(pairs=train_pairs)
evalset = EngRusDataset(pairs=eval_pairs)

trainloader = DataLoader(trainset, batch_size=8, shuffle=True, collate_fn=EngRusCollate())
evalloader = DataLoader(evalset, batch_size=8, shuffle=False, collate_fn=EngRusCollate())

print(len(trainloader), len(evalloader))

33667 8417


In [291]:
eng, rus = next(iter(trainloader)).values()

In [292]:
eng

tensor([[   0,    0,    0,    0,    0,    1,    7,   34,  227, 4960,    4,    2],
        [   0,    0,    0,    1,    5,  360,   56,    6,   20,  725,    4,    2],
        [   0,    1,    5,   22,   11,   41,    5,   72,    8,   47,    4,    2],
        [   0,    0,    0,    0,    1,    6,   59,  262,   66,  187,    4,    2],
        [   1,   15,    7,   36,    8,   93,   23,    5,   51,  100,    9,    2],
        [   0,    0,    1,   16,  388,   33, 1471,   85,   94,  303,    4,    2],
        [   1,    5,  139,    6,   74,   19,   73,  233,   44,  123,    4,    2],
        [   0,    0,    1,    6,   60,  126,   21,   12,  330,  370,    4,    2]],
       dtype=torch.int32)

In [360]:
for idx in range(eng.shape[0]):
    sentence = eng[idx, :].tolist()
    print(idx2sentence(seq=sentence, idx2tok=idx2eng))

PAD PAD PAD PAD PAD SOS you are two faced . EOS
PAD PAD PAD SOS i wonder why tom was fired . EOS
PAD SOS i don t think i need to go . EOS
PAD PAD PAD PAD SOS tom has lost all hope . EOS
SOS do you want to see what i ve got ? EOS
PAD PAD SOS it seems my dreams never come true . EOS
SOS i asked tom where he had bought his car . EOS
PAD PAD SOS tom will leave in a few days . EOS


In [293]:
rus

tensor([[   1,   12,    3,    4,    2,    0,    0,    0,    0,    0,    0],
        [   1,  282,   45,   21, 1381,    4,    2,    0,    0,    0,    0],
        [   1,    6,   40,    9,   15,   64,  157,    4,    2,    0,    0],
        [   1,    7,  364, 8131, 2823,    4,    2,    0,    0,    0,    0],
        [   1,  116, 2327,    9,   16,   22,   42,    8,    2,    0,    0],
        [   1,  296,   79, 2885,   59,    6,    3,    4,    2,    0,    0],
        [   1,    5,  295,   16,  313,   63,   18,  177,  178,    4,    2],
        [   1,    7, 2987,  263,  193,  690,    4,    2,    0,    0,    0]],
       dtype=torch.int32)

In [359]:
for idx in range(rus.shape[0]):
    sentence = rus[idx, :].tolist()
    print(idx2sentence(seq=sentence, idx2tok=idx2rus))

SOS ты UNK . EOS PAD PAD PAD PAD PAD PAD
SOS интересно, почему тома уволили . EOS PAD PAD PAD PAD
SOS не думаю, что мне нужно идти . EOS PAD PAD
SOS том потерял всякую надежду . EOS PAD PAD PAD PAD
SOS хочешь посмотреть, что у меня есть ? EOS PAD PAD
SOS похоже, мои мечты никогда не UNK . EOS PAD PAD
SOS я спросил у тома, где он купил машину . EOS
SOS том уедет через несколько днеи . EOS PAD PAD PAD


Now we have the correct data generator, and all we have to do is write the model (encoder and decoder).


### Encoder

The input tensor contains integers and has dimensions `[bs, seq_len]`,

We will pass them through the layer with embeddings and get the tensor `[bs, seq_len, dim]`. Now we have floating point numbers that can be fed to RNN layers as input.



GRU is an RNN with a specific structure:
<img src="https://habrastorage.org/webt/xt/_q/nj/xt_qnjgfjengqoqd4gizkq4j_wk.png">

In the picture, the yellow rectangles are the line layers with the corresponding activation functions.


`nn.RNN` allows you to create and use multi-layer one- and two-way layers as one layer.
All parameters must be specified during creation, and then simply applied during the forward pass.


The order of dimensions is a bit different from the usual in convolutional networks, this is due to the inability to parallel recurrent calculations effectively.


**batch_first=True**

Such an RNN layer expects two tensors as input:
  - input with sizes `[bs, seq_len, dim]`,
  - hidden_state with dimensions `[num_layers * num_directions, bs, hidden_size]`.
 
 
The output is two tensors:
- output `[bs, seq_len, dim]`,
- hidden `[num_layers * num_directions, bs, hidden]`.

We will apply RNN in two ways:
- to the entire sequence, to translate the entire phrase in one language into one vector (EncoderRNN)
- to one tensor and input token to generate a phrase in another language (DecoderRNN)


We will put the entire input sequence into a hidden state vector.

In [184]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size: int, vocab_size: int, layers: int = 1) -> None:
        super().__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        # initialize the embeddings with "hidden_size" size for each token in the vocab
        # each token initialized from standard normal distribution N(0, 1)
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        
        # initialize RNN model with "layers" num layers of GRU cells. "Bidirectional" argument is not
        # specified -> the model is one directional
        self.rnn = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, num_layers=layers, 
                          batch_first=True)
        
    def forward(self, input_: torch.tensor, hidden: torch.tensor) -> Tuple[torch.tensor]:
        # getting tensor of shape [batch_size, n_sequences, embedings size]
        # it's X data to feed to GRU cell
        embedded = self.embeddings(input_)
        
        # feed through rnn embedings and hidden
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden
    
    def init_hidden(self, batch_size: int = 1, device: str = None) -> torch.tensor:
        """
        Method initializes the first hidden vector with shape (D∗num_layers, N, H) to feed to first GRU cell.
        D = 2 if bidirectional otherwise 1, N - batch size, H - hidden size.
        All values are equal to 0
        """
        # be aware about dimension! https://pytorch.org/docs/stable/nn.html#torch.nn.GRU
        return torch.zeros(self.layers, batch_size, self.hidden_size, device=device)

# initialize the encoder with hidden size 256
enc = EncoderRNN(hidden_size=256, vocab_size=len(eng2idx))

x = next(iter(trainloader))["eng"]
print(x.shape)

# first hidden vector is a vector of zeroes
# so, initialize this vector
hidden = enc.init_hidden(batch_size=x.shape[0])
output, hidden = enc(x, hidden)
print(output.shape, hidden.shape)

torch.Size([8, 12])
torch.Size([8, 12, 256]) torch.Size([1, 8, 256])


We want the decoder to generate a translation for us -- a sequence of tokens from another language, using the encoder's hidden state vector.

To do this, we will supply hidden and `<SOS>`token to the input.
At each step, the decoder will return hidden and output vector.
Output vector is the probability distribution for the next token (respectively, it has the size of the output language dictionary).

In [296]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size: int, vocab_size: int, layers: int = 1) -> None:
        super().__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        
        # initialize the embeddings with "hidden_size" size for each token in the vocab
        # each token initialized from standard normal distribution N(0, 1)
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=hidden_size)
        
        # initialize onedirectional RNN of decoder with GRU cells
        self.rnn = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, num_layers=layers, 
                          batch_first=True)
        
        # finally we have to classify our tokens with Linear layer at the end
        self.classifier = nn.Linear(in_features=hidden_size, out_features=vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)
        
    def forward(self, input_: torch.tensor, hidden: torch.tensor) -> Tuple[torch.tensor]:
        embedded = self.embeddings(input_)
        output, hidden = self.rnn(embedded, hidden)
        output = self.softmax(self.classifier(output))
        return output, hidden
    
    def init_hidden(self, batch_size: int = 1, device: str = None) -> torch.tensor:
        """
        Method initializes the first hidden vector with shape (D∗num_layers, N, H) to feed to first GRU cell.
        D = 2 if bidirectional otherwise 1, N - batch size, H - hidden size.
        All values are equal to 0
        """
        # be aware about dimension! https://pytorch.org/docs/stable/nn.html#torch.nn.GRU
        return torch.zeros(self.layers, batch_size, self.hidden_size, device=device)


In [297]:
dec = DecoderRNN(hidden_size=256, vocab_size=len(rus2idx))

Let's get a tensor with tokens of size `[bs, seq_len]` from the data generator and try to iterate over seq_len to generate the next token.

In [298]:
batch = next(iter(trainloader))["rus"] # get batch
bs, seq_len = batch.shape
hidden = dec.init_hidden(batch_size=bs)

for i in range(0, seq_len):
    step = batch[:, i].unsqueeze(1)  # get tokens sample for i-th step 
     # These are the correct tokens (ground truth), we could generate them
     # unsqueeze adds dimension 1 (from [bs] to [bs, 1])
    
    output, hidden = dec(step, hidden)
    print(output.shape, hidden.shape)
    # output -- this is the probability distribution for the next token
    # hidden -- this is the updated hidden state

torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])
torch.Size([8, 1, 10000]) torch.Size([1, 8, 256])


In [336]:
arr = torch.tensor([1, 2, 4], requires_grad=True, dtype=torch.float)

**(6 points)** Fill in a training part and train the encoder and decoder.

1. You need to write getting the next token (integer) from the distribution: a vector of size `len(rus2idx)`. Since we are working in batches, this should be a batchified operation. You have several options for how to do this:
 - take by argmax
 - sample from distribution (torch.multinomial)
 - during training, take tokens from ground truth (and this must be done at least sometimes so that the model converges).
 
2. You need to write a loss calculation. It is convenient to do this at each step: after the `<EOS>` occurs in the example, you do not need to count the loss for it (in the vectorized version, you can multiply the loss for `<PAD>`-tokens by zero - this is called masking). Loss is simply the sum of cross-entropy losses for each step.


In [399]:
logging.basicConfig(
    level=logging.INFO, 
    format='[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
)
LOGGER = logging.getLogger(__name__)


def train_model(model: Tuple[nn.Module], optimizer: torch.optim, dataloader: DataLoader, device: str = None, 
                teacher_forcing_ratio: float = 0.5, verbose_step: int = 100, pad_token: int = 0) -> defaultdict: 
    LOGGER.info("\tTrain loop")
    encoder, decoder = model
    encoder.to(device)
    decoder.to(device)
    
    encoder.train()
    decoder.train()
    logs = defaultdict(list)
    
    # reduction False to use loss masking
    criterion = nn.CrossEntropyLoss(reduction="none")
    
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        eng = batch['eng'].to(device)
        rus = batch['rus'].to(device)
        
        # rus sequence length and batch size
        batch_size, rus_seq_length = rus.shape
        
        encoder_hidden = encoder.init_hidden(batch_size=batch_size, device=device)
        _, hidden = encoder(eng, encoder_hidden)
        
        # write decoder application and loss calculation.
        # hint: loss must be masked, in case the sequence has already ended.
              
        # the first token for the decoder input is <SOS>
        decoder_input = rus[:, 0].unsqueeze(1)
        
        loss = 0
        for token in range(1, rus_seq_length):
            # forward single decoder GRU cell
            decoder_outputs, hidden = decoder(decoder_input, hidden)
            
            # now we have to decide, which token should we use for the next GRU cell of the decoder:
            # token from target rus sequence or output from the previous GRU cell
            teacher_force = random.random() < teacher_forcing_ratio
            decoder_input = rus[:, token].unsqueeze(1) if teacher_force else decoder_outputs.argmax(2)
            
            curr_loss = criterion(decoder_outputs.view(batch_size, -1), rus[:, token])
            curr_loss = torch.masked_select(curr_loss, rus[:, token] != pad_token).mean()
            loss += curr_loss.mean()
        
        loss = loss / (rus_seq_length - 1)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        loss = loss.cpu().item()
        logs["loss"].append(loss)
        
        if verbose_step is not None and not i % verbose_step:
            LOGGER.info("\tIteration [{}]/[{}] loss: {:.6f}".format(i, len(dataloader), loss))
            
    return logs, (encoder, decoder)

def eval_model(model: Tuple[nn.Module], dataloader: DataLoader, device: str = None, 
               verbose_step: int = 100, pad_token: int = 0):
    LOGGER.info("\tEval loop")
    encoder, decoder = model
    encoder.to(device)
    decoder.to(device)
    
    encoder.train()
    decoder.train()
    logs = defaultdict(list)
    
    # reduction False to use loss masking
    criterion = nn.CrossEntropyLoss(reduction="none")
    
    for i, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        eng = batch['eng'].to(device)
        rus = batch['rus'].to(device)

        # rus sequence length and batch size
        batch_size, rus_seq_length = rus.shape

        encoder_hidden = encoder.init_hidden(batch_size=batch_size, device=device)
        _, hidden = encoder(eng, encoder_hidden)

        # the first token for the decoder input is <SOS>
        decoder_input = rus[:, 0].unsqueeze(1)

        loss = 0
        for token in range(1, rus_seq_length):
            # forward single decoder GRU cell
            decoder_outputs, hidden = decoder(decoder_input, hidden)
            decoder_input = decoder_outputs.argmax(2)

            curr_loss = criterion(decoder_outputs.view(batch_size, -1), rus[:, token])
            curr_loss = torch.masked_select(curr_loss, rus[:, token] != pad_token).mean()
            loss += curr_loss.mean()

        loss = loss / (rus_seq_length - 1)
        loss = loss.cpu().item()
        logs["loss"].append(loss)

        if verbose_step is not None and not i % verbose_step:
            LOGGER.info("\tIteration [{}]/[{}] loss: {:.6f}".format(i, len(dataloader), loss))
                
    return logs


In [400]:
LR = 1e-2
BATCH_SIZE = 128
HIDEN_SIZE = 256
N_EPOCHS = 5
WRITER_PATH = "./runs/"
device = "cuda" if torch.cuda.is_available() else "cpu"

# initialize encoder & decoder
encoder = EncoderRNN(hidden_size=HIDEN_SIZE, vocab_size=len(eng2idx))
decoder = DecoderRNN(hidden_size=HIDEN_SIZE, vocab_size=len(rus2idx))

# initialize optimizer
opt = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=LR)

# initialize writer for tensorboard
writer = SummaryWriter(WRITER_PATH)

# initialize DataLoaders
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=EngRusCollate())
evalloader = DataLoader(evalset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=EngRusCollate())

# initialize the model and train it
model = (encoder, decoder)
LOGGER.info("Iterate over {} epochs".format(N_EPOCHS))
for epoch in range(N_EPOCHS):
    LOGGER.info("Epoch [{}]/[{}]".format(epoch + 1, N_EPOCHS))
    train_logs, model = \
        train_model(model=model, optimizer=opt, dataloader=trainloader, device=device, teacher_forcing_ratio=0.5, 
                    verbose_step=150)
    with torch.no_grad():
        eval_logs = eval_model(model=model, dataloader=evalloader, device=device, verbose_step=150)
    
    train_loss_avg, eval_loss_avg = \
        np.mean(train_logs.get("loss", 0)), np.mean(eval_logs.get("loss", 0))
    
    writer.add_scalar(tag=os.path.join(WRITER_PATH, "Loss", "Train"), 
                      scalar_value=train_loss_avg, global_step=epoch + 1)
    writer.add_scalar(tag=os.path.join(WRITER_PATH, "Eval", "Train"), 
                      scalar_value=eval_loss_avg, global_step=epoch + 1)
    
    LOGGER.info("Train loss avg:\t{:.6f}".format(train_loss_avg))
    LOGGER.info("Eval loss avg:\t{:.6f}".format(eval_loss_avg))    

[2022-12-03 18:13:14,507] {<ipython-input-400-30d0786d864b>:24} INFO - Iterate over 5 epochs
[2022-12-03 18:13:14,508] {<ipython-input-400-30d0786d864b>:26} INFO - Epoch [1]/[5]
[2022-12-03 18:13:14,509] {<ipython-input-399-3ee10b9b7cd5>:10} INFO - 	Train loop


HBox(children=(FloatProgress(value=0.0, max=2105.0), HTML(value='')))

[2022-12-03 18:13:15,099] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [0]/[2105] loss: 9.235302
[2022-12-03 18:13:31,513] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [150]/[2105] loss: 4.333659
[2022-12-03 18:13:47,906] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [300]/[2105] loss: 3.989600
[2022-12-03 18:14:05,299] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [450]/[2105] loss: 4.219264
[2022-12-03 18:14:31,480] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [600]/[2105] loss: 3.749774
[2022-12-03 18:14:52,836] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [750]/[2105] loss: 3.912883
[2022-12-03 18:15:33,419] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [900]/[2105] loss: 4.104872
[2022-12-03 18:16:42,558] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1050]/[2105] loss: 3.895092
[2022-12-03 18:17:54,791] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1200]/[2105] loss: 4.870035
[2022-12-0




HBox(children=(FloatProgress(value=0.0, max=527.0), HTML(value='')))

[2022-12-03 18:26:05,764] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [0]/[527] loss: 4.781900
[2022-12-03 18:26:36,980] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [150]/[527] loss: 6.159809
[2022-12-03 18:27:05,292] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [300]/[527] loss: 4.737787
[2022-12-03 18:27:35,925] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [450]/[527] loss: 4.768664
[2022-12-03 18:27:49,610] {<ipython-input-400-30d0786d864b>:41} INFO - Train loss avg:	4.253991
[2022-12-03 18:27:49,611] {<ipython-input-400-30d0786d864b>:42} INFO - Eval loss avg:	4.952376
[2022-12-03 18:27:49,612] {<ipython-input-400-30d0786d864b>:26} INFO - Epoch [2]/[5]
[2022-12-03 18:27:49,613] {<ipython-input-399-3ee10b9b7cd5>:10} INFO - 	Train loop





HBox(children=(FloatProgress(value=0.0, max=2105.0), HTML(value='')))

[2022-12-03 18:27:50,374] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [0]/[2105] loss: 4.212917
[2022-12-03 18:29:16,902] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [150]/[2105] loss: 4.220631
[2022-12-03 18:30:41,022] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [300]/[2105] loss: 4.123466
[2022-12-03 18:32:04,142] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [450]/[2105] loss: 4.373175
[2022-12-03 18:33:25,878] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [600]/[2105] loss: 3.183547
[2022-12-03 18:34:41,783] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [750]/[2105] loss: 3.580933
[2022-12-03 18:36:03,200] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [900]/[2105] loss: 4.275609
[2022-12-03 18:37:27,693] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1050]/[2105] loss: 4.549936
[2022-12-03 18:38:53,760] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1200]/[2105] loss: 4.319018
[2022-12-0




HBox(children=(FloatProgress(value=0.0, max=527.0), HTML(value='')))

[2022-12-03 18:47:16,205] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [0]/[527] loss: 4.836901
[2022-12-03 18:47:42,474] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [150]/[527] loss: 5.357322
[2022-12-03 18:48:15,495] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [300]/[527] loss: 5.324896
[2022-12-03 18:48:44,114] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [450]/[527] loss: 4.822260
[2022-12-03 18:49:01,500] {<ipython-input-400-30d0786d864b>:41} INFO - Train loss avg:	4.135034
[2022-12-03 18:49:01,500] {<ipython-input-400-30d0786d864b>:42} INFO - Eval loss avg:	5.043538
[2022-12-03 18:49:01,501] {<ipython-input-400-30d0786d864b>:26} INFO - Epoch [3]/[5]
[2022-12-03 18:49:01,501] {<ipython-input-399-3ee10b9b7cd5>:10} INFO - 	Train loop





HBox(children=(FloatProgress(value=0.0, max=2105.0), HTML(value='')))

[2022-12-03 18:49:02,249] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [0]/[2105] loss: 3.967948
[2022-12-03 18:50:26,212] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [150]/[2105] loss: 3.975446
[2022-12-03 18:51:51,377] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [300]/[2105] loss: 3.506905
[2022-12-03 18:53:14,052] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [450]/[2105] loss: 4.195498
[2022-12-03 18:54:40,377] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [600]/[2105] loss: 3.729006
[2022-12-03 18:56:06,580] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [750]/[2105] loss: 3.630001
[2022-12-03 18:57:26,560] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [900]/[2105] loss: 3.535708
[2022-12-03 18:58:48,561] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1050]/[2105] loss: 4.804193
[2022-12-03 19:00:18,761] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1200]/[2105] loss: 3.431057
[2022-12-0




HBox(children=(FloatProgress(value=0.0, max=527.0), HTML(value='')))

[2022-12-03 19:08:48,407] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [0]/[527] loss: 5.418152
[2022-12-03 19:09:14,460] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [150]/[527] loss: 5.640875
[2022-12-03 19:09:44,893] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [300]/[527] loss: 5.529169
[2022-12-03 19:10:16,334] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [450]/[527] loss: 5.150251
[2022-12-03 19:10:30,369] {<ipython-input-400-30d0786d864b>:41} INFO - Train loss avg:	4.065553
[2022-12-03 19:10:30,369] {<ipython-input-400-30d0786d864b>:42} INFO - Eval loss avg:	5.455361
[2022-12-03 19:10:30,370] {<ipython-input-400-30d0786d864b>:26} INFO - Epoch [4]/[5]
[2022-12-03 19:10:30,370] {<ipython-input-399-3ee10b9b7cd5>:10} INFO - 	Train loop





HBox(children=(FloatProgress(value=0.0, max=2105.0), HTML(value='')))

[2022-12-03 19:10:31,055] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [0]/[2105] loss: 4.341008
[2022-12-03 19:11:55,176] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [150]/[2105] loss: 3.420315
[2022-12-03 19:13:24,629] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [300]/[2105] loss: 3.673876
[2022-12-03 19:14:39,461] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [450]/[2105] loss: 3.882543
[2022-12-03 19:16:03,918] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [600]/[2105] loss: 3.937642
[2022-12-03 19:17:35,038] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [750]/[2105] loss: 3.630076
[2022-12-03 19:18:57,440] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [900]/[2105] loss: 3.795650
[2022-12-03 19:20:16,840] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1050]/[2105] loss: 3.844103
[2022-12-03 19:21:44,808] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1200]/[2105] loss: 4.153373
[2022-12-0




HBox(children=(FloatProgress(value=0.0, max=527.0), HTML(value='')))

[2022-12-03 19:29:47,977] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [0]/[527] loss: 4.497373
[2022-12-03 19:30:22,028] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [150]/[527] loss: 5.815016
[2022-12-03 19:30:50,630] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [300]/[527] loss: 4.584311
[2022-12-03 19:31:19,783] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [450]/[527] loss: 4.776179
[2022-12-03 19:31:36,207] {<ipython-input-400-30d0786d864b>:41} INFO - Train loss avg:	4.008509
[2022-12-03 19:31:36,208] {<ipython-input-400-30d0786d864b>:42} INFO - Eval loss avg:	4.964145
[2022-12-03 19:31:36,209] {<ipython-input-400-30d0786d864b>:26} INFO - Epoch [5]/[5]
[2022-12-03 19:31:36,210] {<ipython-input-399-3ee10b9b7cd5>:10} INFO - 	Train loop





HBox(children=(FloatProgress(value=0.0, max=2105.0), HTML(value='')))

[2022-12-03 19:31:36,976] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [0]/[2105] loss: 3.818311
[2022-12-03 19:33:02,620] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [150]/[2105] loss: 4.165160
[2022-12-03 19:34:27,990] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [300]/[2105] loss: 3.947241
[2022-12-03 19:35:45,081] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [450]/[2105] loss: 3.797295
[2022-12-03 19:37:10,302] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [600]/[2105] loss: 3.383263
[2022-12-03 19:38:27,680] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [750]/[2105] loss: 3.795603
[2022-12-03 19:39:56,061] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [900]/[2105] loss: 4.948834
[2022-12-03 19:41:18,444] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1050]/[2105] loss: 4.052854
[2022-12-03 19:42:41,785] {<ipython-input-399-3ee10b9b7cd5>:61} INFO - 	Iteration [1200]/[2105] loss: 3.354173
[2022-12-0




HBox(children=(FloatProgress(value=0.0, max=527.0), HTML(value='')))

[2022-12-03 19:50:54,546] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [0]/[527] loss: 4.628288
[2022-12-03 19:51:26,547] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [150]/[527] loss: 5.936299
[2022-12-03 19:52:00,938] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [300]/[527] loss: 4.528725
[2022-12-03 19:52:26,714] {<ipython-input-399-3ee10b9b7cd5>:107} INFO - 	Iteration [450]/[527] loss: 4.480340
[2022-12-03 19:52:44,657] {<ipython-input-400-30d0786d864b>:41} INFO - Train loss avg:	3.914209
[2022-12-03 19:52:44,658] {<ipython-input-400-30d0786d864b>:42} INFO - Eval loss avg:	4.926310





**(2 points)** Write a translation function with sampling from a distribution with temperature.

In [402]:
encoder, decoder = model
encoder.eval()
decoder.eval()
encoder = encoder.to("cpu")
decoder = decoder.to("cpu")

In [419]:
def evaluate(model: Tuple[nn.Module], sentence: str, eng2idx: Dict[str, int], idx2rus: Dict[str, int], 
             temp: Union[int, float] = 1.0, max_seq_length: int = 20, batch_size: int = 10, 
             sos_token: int = 1) -> None:
    encoder, decoder = model
    
    # encode sentence to token sequence
    encoded = sentence2idx(seq=sentence, tok2idx=eng2idx)
    print(encoded)
    
    output = []
    with torch.no_grad():
        # repeat encoded sentence to batch_size times
        z = torch.LongTensor(encoded).view(1, -1).repeat(batch_size, 1)
        
        # feed forward throught encoder
        encoder_outputs, hidden = encoder(z, encoder.init_hidden(batch_size=batch_size))
        
        # iterate over decoder till you will achive max sequence length
        decoder_input = torch.full(size=(batch_size, 1), fill_value=sos_token)
        for i in range(max_seq_length):
            decoder_outputs, hidden = decoder(decoder_input, hidden)
            decoder_input = F.softmax(decoder_outputs / temp, dim=2)
            decoder_input = decoder_input.argmax(dim=2)
            output.append(decoder_input.numpy())
    
    output = np.concatenate(output, axis=1).T
    for s in output:
        out = idx2sentence(s, idx2rus)
        print(out.replace("PAD", ""))

    
evaluate(model=(encoder, decoder), sentence="What is going on?", eng2idx=eng2idx, idx2rus=idx2rus)

[1, 23, 14, 63, 46, 9, 2]
чем чем чем чем чем чем чем чем чем чем
замечательныи замечательныи замечательныи замечательныи замечательныи замечательныи замечательныи замечательныи замечательныи замечательныи
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
? ? ? ? ? ? ? ? ? ?
EOS EOS EOS EOS EOS EOS EOS EOS EOS EOS
