# Шкарбаненко Михаил,  Б05-907

# Задача 2. Анализ модели LSTM

Провести анализ качества аппроксимации выборки NERUS (предсказание POS tag для токенов) моделью LSTM в зависимости от:

* размера слоя;

* числа слоев;

* параметра dropout;

* добавления BatchNorm;

* размера словаря;

* токенизатора - дополнительное задание (со звездочкой).

## 1. Подготовительная часть

### 1.1 Библиотеки

In [85]:
! pip install nerus



In [86]:
import warnings
# NOTE(review): this silences every warning for the whole session, including
# ones raised by the libraries used below - remove it when debugging.
warnings.filterwarnings("ignore")

In [87]:
from copy import deepcopy

import matplotlib.pyplot as plt
from matplotlib.image import imread
from mpl_toolkits import mplot3d
from matplotlib import gridspec
from PIL import Image
import io
import os
from urllib.request import urlopen
from skimage.segmentation import mark_boundaries
from nltk.tokenize import RegexpTokenizer

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import requests
from scipy.stats import norm
import torch

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

from torchvision import datasets, transforms
from nerus import load_nerus

### 1.2 Девайс

In [88]:
# Device the models are moved to via `.to(device)` further down; fixed to CPU.
device = torch.device("cpu")

### 1.3 Токенизатор

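Слова и теги из словаря взаимно однозначно отображаются в подмножество натуральных чисел; слова и теги, отсутствующие в словаре, получают общий индекс `[UNK]`, а индекс `[PAD]` используется для выравнивания длин последовательностей.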

In [89]:
# Open the Nerus corpus and show its first document to inspect the record layout
# (sentences -> tokens with `text`, `pos`, ... fields, see the output below).
docs = load_nerus('nerus_lenta.conllu.gz')
next(docs)

NerusDoc(
    id='0',
    sents=[NerusSent(
         id='0_0',
         text='Вице-премьер по социальным вопросам Татьяна Голикова рассказала, в каких регионах России зафиксирована наиболее высокая смертность от рака, сообщает РИА Новости.',
         tokens=[NerusToken(
              id='1',
              text='Вице-премьер',
              pos='NOUN',
              feats={'Animacy': 'Anim',
               'Case': 'Nom',
               'Gender': 'Masc',
               'Number': 'Sing'},
              head_id='7',
              rel='nsubj',
              tag='O'
          ),
          NerusToken(
              id='2',
              text='по',
              pos='ADP',
              feats={},
              head_id='4',
              rel='case',
              tag='O'
          ),
          NerusToken(
              id='3',
              text='социальным',
              pos='ADJ',
              feats={'Case': 'Dat',
               'Degree': 'Pos',
               'Number': 'Plur'},
          

In [90]:
def get_data_from_docs(size):
    """Collect sentences and their POS tags from the first documents of Nerus.

    Args:
        size: number of documents to read from the start of
            ``nerus_lenta.conllu.gz``.

    Returns:
        A pair ``(sents, tags)`` of parallel lists with one entry per
        sentence: ``sents[k]`` holds the token texts of sentence ``k`` and
        ``tags[k]`` the ``pos`` labels of the same tokens.

    Raises:
        StopIteration: if the corpus yields fewer than ``size`` documents.
    """
    corpus = load_nerus('nerus_lenta.conllu.gz')

    sents = []
    tags = []
    for _ in range(size):
        document = next(corpus)
        # Two parallel passes keep sents[k] and tags[k] aligned token by token.
        sents.extend([tok.text for tok in sentence.tokens] for sentence in document.sents)
        tags.extend([tok.pos for tok in sentence.tokens] for sentence in document.sents)

    return sents, tags

In [91]:
class Tokenizer(object):
    """Word/tag vocabulary that converts sentences into padded id tensors.

    Two dictionaries are built at construction time, ``word_tokens`` and
    ``tag_tokens``. Both reserve id 0 for ``'[PAD]'`` and id 1 for
    ``'[UNK]'``; every other item receives the next free id in order of
    first appearance.
    """

    def __init__(self, sents, tags):
        """Build the vocabularies.

        Args:
            sents: iterable of sentences, each an iterable of word strings.
            tags: iterable of tag sequences, each an iterable of tag strings.
        """
        self.word_tokens = self._build_vocab(sents)
        self.tag_tokens = self._build_vocab(tags)

    @staticmethod
    def _build_vocab(sequences):
        """Return an item -> id mapping with ids assigned by first appearance."""
        vocab = {'[PAD]': 0, '[UNK]': 1}
        for sequence in sequences:
            for item in sequence:
                # len(vocab) is evaluated before the insertion, i.e. the next free id.
                vocab.setdefault(item, len(vocab))
        return vocab

    @staticmethod
    def _encode(sequences, vocab):
        """Map every item to its id and right-pad all sequences to equal length."""
        unk_id = vocab['[UNK]']
        # The dtype is explicit so that an empty sequence still gives an integer
        # tensor; without it torch.tensor([]) is float and can make the whole
        # padded batch float.
        encoded = [
            torch.tensor([vocab.get(item, unk_id) for item in sequence], dtype=torch.long)
            for sequence in sequences
        ]
        return pad_sequence(encoded, batch_first=True, padding_value=vocab['[PAD]'])

    def __call__(self, sents, tags):
        """Encode sentences and tag sequences.

        Items missing from the vocabulary are mapped to ``'[UNK]'``.

        Args:
            sents: non-empty list of word sequences.
            tags: non-empty list of tag sequences.

        Returns:
            A pair of ``torch.long`` tensors (word ids, tag ids), each of shape
            ``(number of sequences, longest sequence)``, padded with the
            ``'[PAD]'`` id.
        """
        return self._encode(sents, self.word_tokens), self._encode(tags, self.tag_tokens)

### 1.4 Датасет

In [92]:
class NerusDataset(Dataset):
    """Dataset of parallel (sentence ids, tag ids) pairs.

    Args:
        tokenized_sents: indexable collection of encoded sentences
            (a 2-D tensor in this notebook; nested lists also work).
        tokenized_tags: indexable collection of encoded tag sequences,
            aligned with ``tokenized_sents``.
    """

    def __init__(self, tokenized_sents, tokenized_tags):
        super().__init__()
        self.tokenized_sents = tokenized_sents
        self.tokenized_tags = tokenized_tags

    def __len__(self):
        """Return the number of sentences."""
        return len(self.tokenized_sents)

    def __getitem__(self, idx):
        """Return the ``idx``-th (sentence, tags) pair as tensors.

        ``torch.as_tensor`` is used instead of ``torch.tensor``: the latter
        copy-constructs when given an existing tensor and emits a UserWarning
        on every access, while ``as_tensor`` passes tensors through untouched
        and still converts list inputs.
        """
        sent = torch.as_tensor(self.tokenized_sents[idx])
        tags = torch.as_tensor(self.tokenized_tags[idx])
        return sent, tags

In [93]:
def get_data_loaders_and_parameters(size, test_size=0.1, batch_size=64, random_state=17):
    """Build train/test data loaders for the first ``size`` Nerus documents.

    Args:
        size: number of documents to read from the corpus.
        test_size: fraction of sentences held out for testing (default 0.1).
        batch_size: batch size of both loaders (default 64).
        random_state: seed of the train/test split (default 17).

    Returns:
        A tuple ``(train_dataloader, test_dataloader, vocab_dim, num_classes)``
        where ``vocab_dim`` is the size of the word vocabulary and
        ``num_classes`` the size of the tag vocabulary; both counts include
        the ``[PAD]`` and ``[UNK]`` entries.
    """
    sents, tags = get_data_from_docs(size)

    # NOTE(review): the vocabulary is built from all sentences before the
    # split, so words that occur only in the test part are also in it.
    tokenizer = Tokenizer(sents, tags)
    sents_train, sents_test, tags_train, tags_test = train_test_split(
        sents, tags, test_size=test_size, random_state=random_state)

    tokenized_sents_train, tokenized_tags_train = tokenizer(sents_train, tags_train)
    tokenized_sents_test, tokenized_tags_test = tokenizer(sents_test, tags_test)

    train_dataset = NerusDataset(tokenized_sents_train, tokenized_tags_train)
    test_dataset = NerusDataset(tokenized_sents_test, tokenized_tags_test)

    # NOTE(review): the training loader is created without shuffle=True, so
    # batches are visited in the same order in every epoch.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    vocab_dim, num_classes = len(tokenizer.word_tokens), len(tokenizer.tag_tokens)

    return train_dataloader, test_dataloader, vocab_dim, num_classes

### 1.5 Модель

#### 1.5.1 Архитектура

In [94]:
class LSTMClassifier(torch.nn.Module):
    """Per-token classifier: embedding -> multi-layer LSTM -> linear layer.

    Args:
        vocab_dim: number of rows in the embedding table.
        emb_dim: embedding size (LSTM input size).
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        dropout: dropout value passed to ``torch.nn.LSTM``.
        output_dim: number of output features of the final linear layer.
    """

    def __init__(self, vocab_dim, emb_dim, hidden_dim, num_layers, dropout, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(num_embeddings=vocab_dim, embedding_dim=emb_dim)
        self.encoder = torch.nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=dropout,
            batch_first=True,
        )
        self.linear = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)

    @property
    def device(self):
        """Device of the model's first parameter."""
        first_parameter = next(self.parameters())
        return first_parameter.device

    def forward(self, input):
        """Return one score vector of size ``output_dim`` per input position.

        ``input`` is expected to hold token ids with the batch dimension
        first (the LSTM is created with ``batch_first=True``).
        """
        hidden_states, _ = self.encoder(self.embedding(input))
        logits = self.linear(hidden_states)
        return logits

#### 1.5.2 Логгер

In [95]:
class Callback():
    """TensorBoard logger invoked after every training step.

    The training loss is logged on every call. Every ``delimeter`` calls the
    model is also evaluated on the test loader, and the mean test loss and a
    classification report are logged.

    Args:
        writer: object with ``add_scalar`` and ``add_text`` methods
            (a ``SummaryWriter`` in this notebook).
        loss_function: callable ``(scores, targets) -> scalar tensor``; it is
            assumed to return a per-batch mean - TODO confirm for other losses.
        delimeter: evaluate on the test set every ``delimeter`` steps.
    """

    def __init__(self, writer, loss_function, delimeter=50):

        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function

    def forward(self, model, dataloader_test, loss):
        """Log the train loss and, on evaluation steps, the test metrics.

        Args:
            model: model exposing ``device``, ``linear.out_features`` and the
                usual ``torch.nn.Module`` interface.
            dataloader_test: iterable of ``(x_batch, y_batch)`` pairs.
            loss: training loss of the current step (a number).
        """
        self.step += 1
        self.writer.add_scalar('LOSS/train', loss, self.step)

        if self.step % self.delimeter != 0:
            return

        pred = []
        real = []
        test_loss = 0.0
        total = 0

        # Remember the mode so evaluation does not leave the model in eval mode.
        was_training = model.training
        model.eval()
        with torch.no_grad():
            for x_batch, y_batch in dataloader_test:
                x_batch = x_batch.to(model.device)
                y_batch = y_batch.to(model.device)
                output = model(x_batch)

                batch_size = len(x_batch)
                # Scores are moved to (batch, classes, seq), the layout
                # CrossEntropyLoss expects for sequence targets.
                batch_loss = self.loss_function(output.transpose(-1, 1),
                                                y_batch.transpose(-1, 1)).cpu().item()
                test_loss += batch_loss * batch_size
                total += batch_size

                pred.extend(torch.argmax(output, dim=2).cpu().numpy().flatten())
                real.extend(y_batch.cpu().numpy().flatten())
        model.train(was_training)

        # Turn the size-weighted sum into a mean so that LOSS/test is on the
        # same scale as LOSS/train and does not grow with the test set size.
        if total > 0:
            test_loss /= total

        self.writer.add_scalar('LOSS/test', test_loss, self.step)
        # Label 0 is the padding id and is left out of the report.
        self.writer.add_text('REPORT/test',
                             str(classification_report(real, pred, labels=range(1, model.linear.out_features))),
                             self.step)

    def __call__(self, model, dataloader_test, loss):
        """Alias for :meth:`forward`."""
        return self.forward(model, dataloader_test, loss)

#### 1.5.3 Обучение

In [96]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        model: module exposing a ``device`` attribute.
        x_batch: input batch, moved to the model's device before the forward pass.
        y_batch: target batch, moved to the model's device as well.
        optimizer: optimiser instance bound to the model's parameters.
        loss_function: callable ``(scores, targets) -> scalar tensor``.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    target_device = model.device
    scores = model(x_batch.to(target_device))
    targets = y_batch.to(target_device)

    # The last axis of the scores is swapped with axis 1 before the loss call.
    batch_loss = loss_function(scores.transpose(-1, 1), targets.transpose(-1, 1))
    batch_loss.backward()
    optimizer.step()

    return batch_loss.cpu().item()

def train_epoch(model, loss_function, optimizer, dataloader_train, dataloader_test, callback=None):
    """Train the model for one pass over ``dataloader_train``.

    Args:
        model: model to train.
        loss_function: callable ``(scores, targets) -> scalar tensor``.
        optimizer: optimiser instance bound to the model's parameters.
        dataloader_train: iterable of ``(x_batch, y_batch)`` training pairs.
        dataloader_test: test loader handed to ``callback``.
        callback: optional callable ``(model, dataloader_test, batch_loss)``
            invoked after every training step.

    Returns:
        Mean training loss of the epoch, weighted by batch size;
        ``0.0`` when the loader yields no batches.
    """
    epoch_loss = 0.0
    total = 0
    for batch_of_x, batch_of_y in tqdm(dataloader_train, leave=False, total=len(dataloader_train)):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
        if callback is not None:
            callback(model, dataloader_test, batch_loss)
        epoch_loss += batch_loss * len(batch_of_x)
        total += len(batch_of_x)

    # The sample count starts at 0 (not 0.1) so the mean is unbiased; the empty
    # case is handled explicitly instead of through the offset.
    if total == 0:
        return 0.0
    return epoch_loss / total

def trainer(count_of_epoch,
            model,
            loss_function,
            optimizer,
            dataloader_train,
            dataloader_test,
            lr=0.001,
            callback=None):
    """Train ``model`` for ``count_of_epoch`` epochs with a progress bar.

    Args:
        count_of_epoch: number of epochs to run.
        model: model to train.
        loss_function: callable ``(scores, targets) -> scalar tensor``.
        optimizer: optimiser *class* (or factory) called as
            ``optimizer(model.parameters(), lr=lr)``.
        dataloader_train: training data loader.
        dataloader_test: test data loader, forwarded to ``callback``.
        lr: learning rate given to the optimiser.
        callback: optional per-step callback, see ``train_epoch``.
    """
    optimizer_instance = optimizer(model.parameters(), lr=lr)

    progress = tqdm(range(count_of_epoch), desc='epoch')
    # No epoch has finished yet, so the bar starts with NaN as the loss.
    progress.set_postfix({'train epoch loss': np.nan})

    for _ in progress:
        mean_loss = train_epoch(model,
                                loss_function,
                                optimizer_instance,
                                dataloader_train,
                                dataloader_test,
                                callback=callback)
        progress.set_postfix({'train epoch loss': mean_loss})

## 2. Анализ модели

### Базовая конфигурация

In [97]:
def make_experiment(log_dir, num_docs=3000, emb_dim=50, hidden_dim=10, num_layers=2, dropout=0.0):
    """Train a fresh LSTMClassifier for 3 epochs and log the run to ``log_dir``.

    Args:
        log_dir: TensorBoard log directory of this run.
        num_docs: number of Nerus documents used to build the dataset.
        emb_dim: embedding size.
        hidden_dim: LSTM hidden size.
        num_layers: number of LSTM layers.
        dropout: dropout value passed to the LSTM.
    """
    train_dataloader, test_dataloader, vocab_dim, output_dim = get_data_loaders_and_parameters(num_docs)
    model = LSTMClassifier(vocab_dim, emb_dim, hidden_dim, num_layers, dropout, output_dim).to(device)

    # NOTE(review): the loss has no ignore_index, so positions padded with
    # id 0 by the Tokenizer contribute to the training and test loss.
    loss_function = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam

    writer = SummaryWriter(log_dir=log_dir)
    try:
        call = Callback(writer, loss_function)
        trainer(count_of_epoch=3,
                model=model,
                loss_function=loss_function,
                optimizer=optimizer,
                dataloader_train=train_dataloader,
                dataloader_test=test_dataloader,
                callback=call)
    finally:
        # Flush buffered events and release the file handle even if training
        # fails; otherwise the last points of the run may never reach disk.
        writer.close()

In [98]:
# Baseline configuration: data from the first 3000 documents and a 2-layer
# LSTM with hidden size 10 on top of 50-dimensional embeddings.
num_docs = 3000
train_dataloader, test_dataloader, vocab_dim, output_dim = get_data_loaders_and_parameters(num_docs)

model_conf = dict(
    vocab_dim=vocab_dim,
    emb_dim=50,
    hidden_dim=10,
    num_layers=2,
    dropout=0.0,
    output_dim=output_dim,
)
model = LSTMClassifier(**model_conf)
model = model.to(device)

### 2.0 Перфоманс до и после обучения

In [99]:
def quality_report(model, dataloader_test):
    """Print a classification report of ``model`` on ``dataloader_test``.

    The model is switched to eval mode and stays in it afterwards. Label 0
    (the padding id) is excluded from the report.

    Args:
        model: model exposing ``device`` and ``linear.out_features``.
        dataloader_test: iterable of ``(x_batch, y_batch)`` pairs.
    """
    predicted_labels = []
    true_labels = []

    model.eval()
    with torch.no_grad():
        for inputs, targets in dataloader_test:
            inputs = inputs.to(model.device)
            targets = targets.to(model.device)

            scores = model(inputs)

            # The arg-max over the last axis picks the predicted class per position.
            predicted_labels += scores.argmax(dim=2).cpu().reshape(-1).tolist()
            true_labels += targets.cpu().reshape(-1).tolist()

    report = classification_report(true_labels,
                                   predicted_labels,
                                   labels=range(1, model.linear.out_features))
    print(report)

#### До

In [100]:
# Report for the freshly initialised model, before any training step.
quality_report(model, test_dataloader)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00     17054
           3       0.12      0.00      0.00      7188
           4       0.00      0.00      0.00      5671
           5       0.00      0.00      0.00      4756
           6       0.00      0.00      0.00      7368
           7       0.00      0.00      0.00     11386
           8       0.00      0.00      0.00      1018
           9       0.00      0.00      0.00      1727
          10       0.00      0.00      0.00      2287
          11       0.00      0.00      0.00      1610
          12       0.00      0.00      0.00      1055
          13       0.00      0.00      0.00      1232
          14       0.00      0.00      0.00       848
          15       0.00      0.00      0.00       405
          16       0.00      0.00      0.00       571
          17       0.00      0.00      0.00        19
          18       0.00    

#### После

In [101]:
# Train the baseline model for 3 epochs, logging to its own TensorBoard run.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam
writer = SummaryWriter(log_dir='task 2 logs/exp 2.0 - basic conf')
call = Callback(writer, loss_function)
trainer(count_of_epoch=3,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        dataloader_train=train_dataloader,
        dataloader_test=test_dataloader,
        callback=call)
# Push buffered events to disk so the complete run is visible in TensorBoard.
writer.flush()

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

In [102]:
# Same report for the model after the 3 training epochs run above.
quality_report(model, test_dataloader)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
           2       0.42      1.00      0.59     17054
           3       0.93      0.97      0.95      7188
           4       0.00      0.00      0.00      5671
           5       0.00      0.00      0.00      4756
           6       0.02      0.01      0.02      7368
           7       0.95      0.99      0.97     11386
           8       0.00      0.00      0.00      1018
           9       0.00      0.00      0.00      1727
          10       0.00      0.00      0.00      2287
          11       0.00      0.00      0.00      1610
          12       0.00      0.00      0.00      1055
          13       0.00      0.00      0.00      1232
          14       0.00      0.00      0.00       848
          15       0.00      0.00      0.00       405
          16       0.00      0.00      0.00       571
          17       0.00      0.00      0.00        19
          18       0.00    

### 2.1 Размер скрытого слоя в LSTM

In [103]:
# One run per hidden size; every run logs to its own TensorBoard directory.
for hidden_dim in (20, 30, 40):
    log_dir = f'task 2 logs/exp 2.1 - hidden_dim/hidden_dim = {hidden_dim}'
    make_experiment(log_dir, hidden_dim=hidden_dim)

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

### 2.2 Размерность пространства эмбеддинга

In [104]:
# One run per embedding size; every run logs to its own TensorBoard directory.
for emb_dim in (100, 200, 300):
    log_dir = f'task 2 logs/exp 2.2 - emb_dim/emb_dim = {emb_dim}'
    make_experiment(log_dir, emb_dim=emb_dim)

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

### 2.3 Число слоев

In [105]:
# One run per LSTM depth; every run logs to its own TensorBoard directory.
for num_layers in (3, 4, 5):
    log_dir = f'task 2 logs/exp 2.3 - num_layers/num_layers = {num_layers}'
    make_experiment(log_dir, num_layers=num_layers)

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

### 2.4 Дропаут

In [106]:
# One run per dropout value; every run logs to its own TensorBoard directory.
for dropout in (0.1, 0.5):
    log_dir = f'task 2 logs/exp 2.4 - dropout/dropout = {dropout}'
    make_experiment(log_dir, dropout=dropout)

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

### 2.5 Размер словаря

Исследуется зависимость не от размера словаря напрямую, а от параметра, сильно коррелирующего с размером словаря, — числа документов в датасете.

In [107]:
# One run per corpus size; every run logs to its own TensorBoard directory.
for num_docs in (6000, 9000):
    log_dir = f'task 2 logs/exp 2.5 - num_docs/num_docs = {num_docs}'
    make_experiment(log_dir, num_docs=num_docs)

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/999 [00:00<?, ?it/s]

  0%|          | 0/999 [00:00<?, ?it/s]

  0%|          | 0/999 [00:00<?, ?it/s]

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

  0%|          | 0/1508 [00:00<?, ?it/s]

## 3. Итоги

* Скрытый слой LSTM

Значительный прирост качества при увеличении размера вектора короткой памяти: HiddenSize = {10, 20, 30, 40} -> WeightedF1 = {0.43, 0.61, 0.74, 0.78}. Ожидаемый результат.

* Эмбеддинг

Незначительный прирост качества при увеличении размерности пространства эмбеддинга в несколько раз: EmbeddingDim = {50, 100, 200, 300} -> WeightedF1 = {0.43, 0.45, 0.46, 0.51}. Результат не вполне предсказуем. Скорее всего, изначальной размерности эмбеддинг-векторов, равной 50, уже достаточно, чтобы эффективно сжимать датасет небольшого размера.

* Слои LSTM

Падение качества при переходе от двух слоев к трем: NumLayers = {2, 3, 4, 5} -> WeightedF1 = {0.43, 0.30, 0.31, 0.30}. Результат не вполне предсказуемый, но объяснимый. Запоминание информации о слишком длинных последовательностях при классификации частей речи скорее вредно, чем полезно.

* Dropout

Падение качества при добавлении ненулевой вероятности дропаута. Результат не вполне предсказуемый, но объяснимый. Dropout = {0.0, 0.1, 0.5} -> WeightedF1 = {0.43, 0.32, 0.30}. Добавление дропаут-слоя в сети небольшого размера часто приводит к понижению качества, так как нейронов слишком мало для замещения друг друга.

* Словарь

Значительный прирост качества при увеличении размера датасета (количества документов) в несколько раз и, как следствие, размера словаря: NumDocs = {3000, 6000, 9000} -> WeightedF1 = {0.43, 0.72, 0.83}. Ожидаемый результат.

In [108]:
%tensorboard --logdir 'task 2 logs/'

UsageError: Line magic function `%tensorboard` not found.
