# Шкарбаненко Михаил Б05-907

# Задача 3. Модель автокодировщика

Провести анализ модели автокодировщика (не вариационного) для выборки Twitter (эмбеддинги предложений). Требуется сравнить качество восстановления предложения в зависимости от:

* размера слоя;
* числа слоев;
* параметра dropout;
* добавления BatchNorm;
* размера словаря;
* токенизатора - дополнительное задание (со звездочкой).

Все выводы должны быть представлены в формате tensorboard (для каждого набора параметров — свой график; пример — второй семинар).

P.S. Выборку можно взять из [семинара 17](https://github.com/andriygav/MachineLearningSeminars/blob/master/sem17/data/dataset.csv.dvc). Пример как использовать DVC для выгрузки данных представлен в [ноутбуке](https://github.com/andriygav/MachineLearningSeminars/blob/master/sem17/main.ipynb). DVC временно не работает, можете взять документы напрямую из [яндекс диска](https://disk.yandex.ru/d/bwUVH8hR1MRNrg).

P.P.S. Рекомендуется использовать предобученный BPE-токенизатор для снижения размерности словаря (см. задачу 2).

## 1. Подготовительная часть

### 1.1 Библиотеки

In [1231]:
from copy import deepcopy

import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
from matplotlib import gridspec


from tqdm.notebook import tqdm
import numpy as np
import requests
import torch
import pandas as pd

from sklearn.model_selection import ParameterGrid
from torch.utils.tensorboard import SummaryWriter

from nltk.tokenize import RegexpTokenizer

import torch
import torch.nn as nn
import random
from torch.optim import lr_scheduler

from torch.utils.tensorboard import SummaryWriter

### 1.2 Девайс

In [1232]:
# Use a CUDA device when PyTorch reports one and fall back to the CPU
# otherwise, so the notebook runs unchanged on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

### 1.3 Датасет и Токенизация

In [1233]:
# Load the raw table and keep only the rows in which both the `tag` and the
# `message` columns are present (non-null).
dataset = pd.read_csv('twitter.csv')
dataset = dataset.loc[dataset[['tag', 'message']].notnull().all(axis=1)]
dataset.head()

Unnamed: 0,tag,message
0,0.0,is so sad for my APL friend.............
1,0.0,I missed the New Moon trailer...
2,1.0,omg its already 7:30 :O
3,0.0,.. Omgaga. Im sooo im gunna CRy. I've been at...
4,0.0,i think mi bf is cheating on me!!! T_T


In [1234]:
# Work on a fixed 10 000-row subsample of the data.
dataset = dataset.sample(10000, random_state=42)
# Draw the ~80/20 train/test mask from a dedicated seeded generator so that the
# split (and therefore the vocabulary and every reported score) is reproducible.
train_mask = np.random.RandomState(42).rand(len(dataset)) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]
print(len(dataset_train), len(dataset_test))

8016 1984


In [1235]:
class Tokenizer(object):
    """Turn raw sentences into a tensor of vocabulary ids.

    Every sentence is framed as ``[CLS] tokens... [SEP]`` and then padded with
    ``[PAD]`` (or truncated) so that all rows have ``max_length + 2`` entries.
    Words missing from ``word_to_ind`` are mapped to the ``[UNK]`` id.
    """

    def __init__(self, word_to_ind, tokenizer):
        # word_to_ind: mapping token -> id; must contain '[UNK]'.
        # tokenizer: object exposing `tokenize_sents(sentences)`.
        self.word_to_ind = word_to_ind
        self.tokenizer = tokenizer

    def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
        """Return a long tensor with one row of ids per sentence.

        When ``pad_to_max_length`` is false, ``max_length`` is lowered to the
        length of the longest sentence in this call if that is shorter.
        """
        tokenized = self.tokenizer.tokenize_sents(sentences)
        if not pad_to_max_length:
            longest = max(len(words) for words in tokenized)
            max_length = min(max_length, longest)

        rows = []
        for words in tokenized:
            if len(words) < max_length:
                # Short sentence: close it with [SEP], then pad to the width.
                padding = ['[PAD]'] * (max_length - len(words))
                framed = ['[CLS]'] + words + ['[SEP]'] + padding
            else:
                # Long (or exact-length) sentence: cut to max_length tokens.
                framed = ['[CLS]'] + words[:max_length] + ['[SEP]']
            rows.append([self.word_to_ind.get(w, self.word_to_ind['[UNK]'])
                         for w in framed])
        return torch.tensor(rows).long()

In [1236]:
# Special tokens take the first contiguous ids, so every id in
# range(len(word_to_ind)) is a valid embedding index and has a reverse entry.
word_to_ind = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
ind_to_word = {ind: word for word, ind in word_to_ind.items()}

# Build the word-level tokenizer once instead of once per sentence.
vocab_tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+')
for sent in tqdm(dataset_train.values[:, 1]):
    for word in vocab_tokenizer.tokenize(sent):
        if word not in word_to_ind:
            # Take the id *before* inserting so that both maps agree on it.
            new_ind = len(word_to_ind)
            word_to_ind[word] = new_ind
            ind_to_word[new_ind] = word
print(len(word_to_ind), len(ind_to_word))

  0%|          | 0/8016 [00:00<?, ?it/s]

18801 18801


In [1237]:
# Raw string: `\w`, `\s` and `\d` are regex classes, not Python string escapes
# (a non-raw literal triggers an "invalid escape sequence" warning).
tokenizer = Tokenizer(word_to_ind, RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+'))
# Encode the `message` column (second column) of each split into id tensors.
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [1238]:
# Autoencoder setup: the input tensor is also the reconstruction target.
dataset_train_pt = torch.utils.data.TensorDataset(
    train_data_sent, train_data_sent)
dataset_test_pt = torch.utils.data.TensorDataset(
    test_data_sent, test_data_sent)

batch_size = 64
# Reshuffle the training set every epoch; the test loader keeps a fixed order
# so that evaluation and the logged text samples stay comparable across epochs.
train_dataloader = torch.utils.data.DataLoader(
    dataset_train_pt, batch_size=batch_size, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(dataset_test_pt, batch_size=batch_size)

### 1.4 Модель

In [1239]:
def print_sentence(sent):
    """Render a sequence of token ids as text.

    Each id is looked up in the module-level ``ind_to_word`` mapping and
    followed by a single space (so a non-empty result ends with a space).
    Ids without a vocabulary entry are rendered as ``'[UNK]'`` instead of
    raising ``KeyError``: the model's argmax may produce any id below the
    vocabulary size, including ones the mapping does not define.
    """
    # int(...) normalises NumPy / tensor scalars to plain dictionary keys.
    return ''.join(ind_to_word.get(int(word), '[UNK]') + ' ' for word in sent)

def test_model(model, dataloader):
    """Return a text preview of reconstructions for the first batch.

    Takes the first batch of ``dataloader``, runs ``model`` on it in eval mode
    without gradients and formats up to 8 ``Result: ..., Truth: ...`` lines
    (one per sample, each terminated by a newline).

    The model is called with ``teacher_forcing_ratio=0`` so the preview shows
    what the model generates on its own rather than a random mix of its
    predictions and ground-truth tokens fed back to the decoder.
    """
    batch = next(iter(dataloader))
    inputs, truth = batch[0].to(device), batch[1].to(device)
    model.eval()
    with torch.no_grad():
        out = model(inputs, truth, teacher_forcing_ratio=0)
        # Most likely token per position; argmax over dim 2 (assumed to be
        # the vocabulary axis of the model output — TODO confirm).
        predicted = torch.argmax(out.detach().cpu(), 2).numpy()
        expected = truth.detach().cpu().numpy()

    lines = [
        'Result: ' + print_sentence(predicted[i])
        + ', Truth: ' + print_sentence(expected[i]) + '\n'
        for i in range(min(predicted.shape[0], 8))
    ]
    return ''.join(lines)

In [1240]:
class Encoder(nn.Module):
    """Embedding + LSTM encoder that returns the final LSTM states.

    Args:
        input_dim: size of the input vocabulary (number of embeddings).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, for
            ``n_layers > 1``, between LSTM layers.
        bidirectional: whether the LSTM is bidirectional.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional):
        super().__init__()

        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value to a single-layer LSTM has no effect and only emits a
        # UserWarning, so it is zeroed out in that case.
        rnn_dropout = dropout if n_layers > 1 else 0.0

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = rnn_dropout, bidirectional=bidirectional)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode ``src`` (token ids) and return ``(hidden, cell)``.

        The LSTM is created with the default ``batch_first=False``, so ``src``
        is consumed as ``(seq_len, batch)``. The per-step outputs are
        discarded; only the final hidden and cell states are returned.
        """
        embedded = self.dropout(self.embedding(src))

        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder producing vocabulary logits.

    Args:
        output_dim: size of the output vocabulary (also the embedding count).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, for
            ``n_layers > 1``, between LSTM layers.
        bidirectional: whether the LSTM is bidirectional; the output
            projection then takes ``2 * hid_dim`` features.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout, bidirectional):
        super().__init__()

        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value to a single-layer LSTM has no effect and only emits a
        # UserWarning, so it is zeroed out in that case.
        rnn_dropout = dropout if n_layers > 1 else 0.0

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = rnn_dropout, bidirectional=bidirectional)
        self.fc_out = nn.Linear(hid_dim*(2 if self.bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Decode one time step.

        ``input`` holds the current token id of every sequence in the batch; a
        leading length-1 time axis is added before the LSTM and removed again
        afterwards. Returns ``(prediction, hidden, cell)`` where ``prediction``
        are the logits over the vocabulary and ``hidden``/``cell`` are the
        updated LSTM states.
        """
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))
        return prediction, hidden, cell

class Autoencoder(nn.Module):
    """Sequence autoencoder: an LSTM encoder followed by a step-wise decoder.

    Args:
        vocabulary_size: number of token ids (shared by encoder and decoder).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability passed to encoder and decoder.
        bidirectional: whether both LSTMs are bidirectional.
        device: device the sub-modules and the output buffer are placed on.
    """

    def __init__(self, vocabulary_size, emb_dim, hid_dim, n_layers, dropout, bidirectional, device):
        super().__init__()

        self.device = device

        self.encoder = Encoder(input_dim=vocabulary_size, emb_dim=emb_dim,
                               hid_dim=hid_dim, n_layers=n_layers,
                               dropout=dropout, bidirectional=bidirectional).to(device)
        self.decoder = Decoder(output_dim=vocabulary_size, emb_dim=emb_dim,
                               hid_dim=hid_dim, n_layers=n_layers,
                               dropout=dropout, bidirectional=bidirectional).to(device)

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Reconstruct ``trg`` from the encoding of ``src``.

        Args:
            src: token ids of shape ``(batch, seq_len)``.
            trg: target token ids of shape ``(batch, seq_len)``.
            teacher_forcing_ratio: probability, drawn independently at every
                step, of feeding the ground-truth token to the decoder instead
                of its own previous prediction.

        Returns:
            Logits of shape ``(batch, seq_len, vocabulary_size)``. Position 0
            is left as zeros: the first target token only seeds the decoder
            and is never predicted.
        """
        # The data pipeline yields batch-first tensors, while the LSTMs use the
        # default batch_first=False layout, so move time to the leading axis.
        # Without this the recurrence runs over the samples of the batch
        # instead of over the tokens of each sentence.
        src = src.t().contiguous()
        trg = trg.t().contiguous()

        trg_len, batch_size = trg.shape
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=self.device)

        hidden, cell = self.encoder(src)
        decoder_input = trg[0]  # first token of every sentence

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            decoder_input = trg[t] if teacher_force else top1

        # Back to batch-first, the layout the training/evaluation code expects.
        return outputs.transpose(0, 1)

### 1.5 Обучение

In [1242]:
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    """Run one optimisation step on a single batch and return its loss.

    Puts the model in training mode, clears its gradients, moves the batch to
    the module-level ``device``, back-propagates the loss and steps the
    optimizer. The loss is returned as a Python float.
    """
    model.train()
    model.zero_grad()

    inputs = batch_of_x.to(device)
    target = batch_of_y.to(device)
    prediction = model(src=inputs, trg=target)

    # Swap the last two axes of the prediction before computing the loss
    # against `target`.
    batch_loss = loss_function(prediction.transpose(1, 2), target)
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

def train_epoch(train_generator, model, loss_function, optimizer):
    """Train for one pass over ``train_generator`` and return the mean loss.

    ``train_generator`` must yield ``(batch_of_x, batch_of_y)`` pairs and
    provide ``set_postfix`` (it is used to display the running batch loss).
    The returned value is the per-sample average: each batch loss is weighted
    by the number of samples in that batch.
    """
    weighted_loss_sum = 0
    samples_seen = 0
    for batch_of_x, batch_of_y in train_generator:
        batch_loss = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)
        train_generator.set_postfix({'train batch loss': batch_loss})

        batch_len = len(batch_of_x)
        weighted_loss_sum += batch_loss*batch_len
        samples_seen += batch_len

    return weighted_loss_sum/samples_seen

def trainer(count_of_epoch, 
            model,
            train_dataloader,
            test_dataloader,
            loss_function,
            optimizer, scheduler=None, writer=None):
    """Train ``model`` for ``count_of_epoch`` epochs with optional logging.

    When ``writer`` is given, the test loss and a text sample of
    reconstructions are logged before training (step 0) and after every epoch
    (step ``epoch + 1``), together with the epoch's training loss. When
    ``scheduler`` is given, it is stepped once per epoch.
    """

    def log_evaluation(step):
        # Test loss plus a text preview for the given global step.
        writer.add_scalar('Loss/test', test(model, test_dataloader, loss_function), step)
        writer.add_text('text', test_model(model, test_dataloader), step)

    epochs = tqdm(range(count_of_epoch))
    if writer is not None:
        log_evaluation(0)

    for epoch in epochs:
        batches = tqdm(train_dataloader, leave=False)

        epoch_loss = train_epoch(
            train_generator=batches,
            model=model,
            loss_function=loss_function,
            optimizer=optimizer)

        if writer is not None:
            writer.add_scalar('Loss/train', epoch_loss, epoch+1)
            log_evaluation(epoch+1)

        if scheduler is not None:
            scheduler.step()

        epochs.set_postfix({'train epoch loss': epoch_loss})

def test(model, test_dataloader, loss_function):
    """Return the per-sample average loss of ``model`` over ``test_dataloader``.

    The model is evaluated in eval mode, without gradients and with
    ``teacher_forcing_ratio=0``: feeding ground-truth tokens to the decoder at
    random steps would make the reported loss non-deterministic and would not
    measure what the model reconstructs on its own.
    """
    model.eval()
    epoch_loss = 0
    total = 0
    with torch.no_grad():
        for batch_of_x, batch_of_y in tqdm(test_dataloader, leave=False):
            inputs, target = batch_of_x.to(device), batch_of_y.to(device)
            output = model(src=inputs, trg=target, teacher_forcing_ratio=0)

            loss = loss_function(output.transpose(1, 2), target).cpu().item()
            # Weight by batch size so a smaller last batch is not over-counted.
            epoch_loss += loss*len(batch_of_x)
            total += len(batch_of_x)

    return epoch_loss/total

## 2. Эксперименты

In [1243]:
# Hyper-parameter grid: every combination of the listed values is trained once.
# The keys match the keyword arguments of `Autoencoder` (except `device`).
grid = ParameterGrid(dict(
    vocabulary_size=[len(word_to_ind)],
    emb_dim=[64, 128],
    hid_dim=[64, 128],
    n_layers=[1, 2],
    dropout=[0.0, 0.1],
    bidirectional=[True, False],
))

In [1244]:
# Final test loss of every configuration, keyed by the stringified parameters.
scores = dict()

for item in tqdm(grid):
    print(str(item))

    # One TensorBoard run directory per parameter combination.
    writer = SummaryWriter('logs/'+str(item))

    model = Autoencoder(**item, device=device)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # Padding positions do not contribute to the loss.
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=word_to_ind['[PAD]'])
    scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

    trainer(count_of_epoch = 1, model = model, 
            train_dataloader = train_dataloader, 
            test_dataloader=test_dataloader,
            loss_function = loss_function, optimizer = optimizer, 
            scheduler=scheduler, writer=writer)

    # Evaluate once and reuse the value, so `scores` and the logged hparams
    # metric always agree (and the test set is not processed twice).
    loss = test(model, test_dataloader, loss_function)
    scores[str(item)] = loss
    writer.add_hparams(item, {'hparam/Test loss': loss})
    writer.flush()
    # Release the event-file handle before the next run opens its own writer.
    writer.close()

  0%|          | 0/32 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}




  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': True, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.0, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 64, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 64, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 1, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

{'bidirectional': False, 'dropout': 0.1, 'emb_dim': 128, 'hid_dim': 128, 'n_layers': 2, 'vocabulary_size': 18801}


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

## 3. Итоги

Эксперименты получились не очень удачными. До конца не удалось реализовать оценку качества и архитектуру модели. Судя по полученным значениям функции потерь, архитектура модели не является оптимальной, так как cross-entropy loss не опускается ниже 6. Сделать выводы не представляется возможным.