In [1]:
from copy import deepcopy

import matplotlib.pyplot as plt
from matplotlib.image import imread
from mpl_toolkits import mplot3d
from matplotlib import gridspec
from PIL import Image
import io
from urllib.request import urlopen
from lime import lime_image
from skimage.segmentation import mark_boundaries

from tqdm.notebook import tqdm
import numpy as np
import requests
import torch

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets, transforms
import pandas as pd
from nerus import load_nerus
from pathlib import Path


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%load_ext tensorboard
%tensorboard --logdir experiment/

In [3]:
NERUS_PATH = "nerus_lenta.conllu.gz"
N_DOCS = 1000
dataset_words = []
dataset_tags = []
ctr = 0
for doc in tqdm(load_nerus(NERUS_PATH), total=N_DOCS):
    if ctr == N_DOCS:
        break
    for sent in doc.sents:
        dataset_words.append([x.text for x in sent.tokens])
        dataset_tags.append([x.pos for x in sent.tokens])
    ctr += 1
    


train_words, train_tags = dataset_words[:int(len(dataset_words) * 0.9)], dataset_tags[:int(len(dataset_words) * 0.9)]
# val_s, val_tags = dataset_words[int(len(dataset_words) * 0.75):int(len(dataset_words) * 0.85)], dataset_tags[int(len(dataset_words) * 0.75):int(len(dataset_words) * 0.85)]
test_words, test_tags = dataset_words[int(len(dataset_words) * 0.9):], dataset_tags[int(len(dataset_words) * 0.9):]
print('Train size:', len(train_words))
# print('Val size:', len(val_sents))
print('Test size:', len(test_words))

  0%|          | 0/1000 [00:00<?, ?it/s]

Train size: 10480
Test size: 1165


In [4]:
word2idx = {'<PAD>':0, '<UNK>': 1, '<START>': 2, '<FINISH>': 3}
idx2char = {0: '<PAD>', 1: '<UNK>', 2: '<START>', 3: '<FINISH>'}
for words in train_words:
    for word in words:
        word2idx[word] = len(word2idx)
        idx2char[word2idx[word]] = word

tag2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2}
for tags in train_tags:
    for tag in tags:
        tag2idx[tag] = tag2idx.get(tag, tag2idx.__len__())

In [5]:
class callback():
    def __init__(self, writer, dataset, loss_function, delimeter = 10, batch_size=64, exp_number = 0, description = ""):
        self.step = 0
        self.writer = writer
        self.delimeter = delimeter
        self.loss_function = loss_function
        self.batch_size = batch_size
        self.exp_number = exp_number
        self.description = description
        self.dataset = dataset

    def forward(self, model, loss):
        self.step += 1
        self.writer.add_scalar(f'LOSS/train{self.exp_number}', loss, self.step)
        
        if self.step % self.delimeter == 0:
            
            loss, percent = check_quality(model, self.dataset, self.loss_function)
            
            self.writer.add_scalar(f'LOSS/test{self.exp_number}', loss, self.step)
            self.writer.add_text(f'REPORT/test{self.exp_number}', f'score {percent}, {self.description}', self.step)
          
    def __call__(self, model, loss):
        return self.forward(model, loss)

def check_quality(model, test, loss_function):
    loss = 0
    cnt = 0
    success = 0
    encoder, decoder = model
    generator = batch_generator(test, 64)
    
    # with open("log", "w") as out:
    try:
        for it, (batch_of_x, batch_of_y) in enumerate(generator):
            d, h, c = encoder(batch_of_x.to(encoder.device))
            output = decoder(
                batch_of_y.to(decoder.device), 
            h=h.to(decoder.device)[:, -decoder.num_layers:, :],
            c=c.to(decoder.device)[:, -decoder.num_layers:, :])
            
            pred = torch.argmax(output, dim=-1).cpu().numpy()
            real = batch_of_y.cpu().numpy()
            
            for i in range(np.shape(real)[0]):
                for j in range(len(real[i])):
                    # out.write(f"{i}, {j}, cnt {cnt}, success {success}")
                    if real[i][j] == 0:
                        break
                    elif real[i][j] == pred[i][j]:
                        success += 1
                    cnt += 1

            loss += loss_function(output[:, :-1, :].transpose(1, 2), batch_of_y.to(decoder.device)[:, 1:])
    except Exception as inst:
        print(type(inst))

    loss /= 64
    percent = 0
    if cnt > 0:
        percent = success / cnt * 100
    return loss, percent

def batch_generator(dataset, batch_size=64, shuffle=False):
    sents, sents_tag = dataset  #tokens, tags
    
    PAD = word2idx['<PAD>']
    n_samples = len(sents)

    n_batches = n_samples // batch_size
    if n_samples % batch_size != 0:
        n_batches += 1
        
    # For each k yield pair x and y
    for k in range(n_batches):
# указываем текущии размер батча
        this_batch_size = batch_size
    
# если мы выдаем последний батч, то его нужно обрезать
        if k == n_batches - 1:
            if n_samples % batch_size > 0:
                this_batch_size = n_samples % batch_size
                
        this_sents = sents[k * batch_size : k * batch_size + this_batch_size]
        this_tags = sents_tag[k * batch_size : k * batch_size + this_batch_size]
        
        token_words = [
                       [word2idx.get(word, 0) for word in sent]\
                       for sent in this_sents]
        token_tags = [
                    #    [tag2idx.get('<START>', 0)]\
                       [tag2idx.get(tag, 0) for tag in tags]\
                    #    + [tag2idx.get('<FINISH>', 0)]\
                       for tags in this_tags]

        List_of_length_x = [len(sent) for sent in token_words]
        length_of_sentence_x = max(List_of_length_x)

        x_arr = np.ones(shape=[this_batch_size, length_of_sentence_x])*PAD
        y_arr = np.ones(shape=[this_batch_size, length_of_sentence_x])*PAD

        for i in range(this_batch_size):
            x_arr[i, : len(token_words[i])] = token_words[i]
            y_arr[i, : len(token_words[i])] = token_tags[i]

        x = torch.LongTensor(x_arr)
        y = torch.LongTensor(y_arr)
        # lengths = torch.LongTensor(List_of_length_x)

        yield x, y
        
def train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function):
    encoder, decoder = model
    encoder.train()
    decoder.train()
    encoder.zero_grad()
    decoder.zero_grad()
    
    # print("AAAAAAAAAAAAAAAAAAAa")
    d, h, c = encoder(batch_of_x.to(encoder.device))
    output = decoder(
        batch_of_y.to(decoder.device), 
        h=h.to(decoder.device)[:, -decoder.num_layers:, :], 
        c=c.to(decoder.device)[:, -decoder.num_layers:, :])

    loss = loss_function(output[:, :-1, :].transpose(1, 2), batch_of_y.to(decoder.device)[:, 1:])
    
    loss.backward()
    optimizer.step()
    
    return loss.cpu().item()

def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        # print('QQQ')
        local_loss = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)
        train_generator.set_postfix({'train batch loss' : local_loss})
        if callback is not None:
            callback(model, local_loss)

        epoch_loss += local_loss*len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss / total
def trainer(count_of_epoch, 
            batch_size,
            model,
            dataset,
            loss_function,
            optimizer,
            callback):
    iterations = tqdm(range(count_of_epoch))

    for it in iterations:
        optima = optimizer

        number_of_batch = len(dataset[0]) // batch_size + (len(dataset[0]) % batch_size > 0)
        generator = tqdm(
            batch_generator(dataset, batch_size), 
            leave = False, total = number_of_batch)
            
        epoch_loss = train_epoch(
            train_generator = generator, model = model, 
            loss_function = loss_function, 
            optimizer = optima,
            callback = callback)

        iterations.set_postfix({'train epoch loss': epoch_loss})



In [19]:

class Encoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device
        
    def __init__(self,
                 vocab_dim,
                 emb_dim = 64,
                 hidden_dim = 128,
                 num_layers = 1,
                 bidirectional = False,
                 p=0,
                 is_batch_norm = False):
        super(Encoder, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.is_batch_norm = is_batch_norm
        self.batch_norm = torch.nn.BatchNorm1d(emb_dim)
        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)
        self.dropout = torch.nn.Dropout(p)
        self.encoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers, bidirectional = bidirectional)

        
    def forward(self, input):
        if self.is_batch_norm:
            input = self.embedding(input)
            # print("before",input.size())
            input = torch.transpose(input, 1, 2)
            input = self.batch_norm(input)
            input = torch.transpose(input, 1, 2)

            # print("after",input.size())
            input = self.dropout(input)
            # input = self.dropout(self.batch_norm(self.embedding(input)))
        else:
            input = self.dropout(self.embedding(input))
            
        input = torch.transpose(input, 0, 1)
        # input = torch.transpose(input, 0, 1)
        d, (h, c) = self.encoder(input)
        return d, torch.transpose(h, 0, 1) , torch.transpose(c, 0, 1)
    
    
    
class Decoder(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self,
                 vocab_dim,
                 output_dim,
                 emb_dim = 10, 
                 hidden_dim = 10,
                 num_layers = 3,
                 bidirectional = False,
                 p = 0,
                 is_batch_norm = False):
        super(Decoder, self).__init__()
        
        self.num_direction = int(bidirectional + 1)
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.dropout = torch.nn.Dropout(p)
        self.embedding = torch.nn.Embedding(vocab_dim, self.emb_dim)

        self.decoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers, bidirectional = bidirectional)

        self.linear = torch.nn.Linear(
            self.num_direction*hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(p)
        self.is_batch_norm = is_batch_norm
        self.batch_norm =torch.nn.BatchNorm1d(self.num_direction*hidden_dim)

    def forward(self, real=None, h = None, c = None, max_len = 50):
        batch_size = 1
        if h is not None:
            batch_size = h.shape[0]
        if c is not None:
            batch_size = c.shape[0]
        if real is not None:
            batch_size = real.shape[0]


        if real is not None:
            input = self.dropout(self.embedding(real))

            if h is None:
                h = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )
            if c is None:
                c = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )

            input = torch.transpose(input, 0, 1)
            h = torch.transpose(h, 0, 1)
            c = torch.transpose(c, 0, 1)
            d, _ = self.decoder(input, (h, c))
            if self.is_batch_norm:
                # print("before", d.size())
                d = torch.transpose(d, 1, 2)
                d = self.batch_norm(d)
                d = torch.transpose(d, 1, 2)
                # print("after", d.size())
            answers = self.linear(d)
        else:
            input = self.embedding(
                torch.tensor(
                    [[word2idx['<START>']] for _ in range(
                        batch_size)]).long().to(
                        self.device
                    )
                )

            if h is None:
                h = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )
            if c is None:
                c = torch.randn(
                    (batch_size, self.num_layers, self.num_direction*self.hidden_dim)).to(
                        self.device
                    )

            input = torch.transpose(input, 0, 1)
            h = torch.transpose(h, 0, 1)
            c = torch.transpose(c, 0, 1)

            answers = torch.zeros(
                (max_len, input.shape[1], self.output_dim)).to(
                    self.device)
                
            for i in range(max_len):
                d, (h, c) = self.decoder(input, (h, c))
                answers[i, :, :] = self.linear(d)[0]
                input = self.embedding(
                    torch.argmax(answers[i:i+1, :, :], dim=-1))

        return torch.transpose(answers, 0, 1)

In [7]:
def experiment(model, 
               exp_num, count_of_epoch = 1, description = ""): 
    
    writer = SummaryWriter(log_dir = 'experiment')
    loss_function = torch.nn.CrossEntropyLoss(ignore_index=word2idx['<PAD>'])
    call = callback(writer, (test_words, test_tags), loss_function, 
                    delimeter = 10, exp_number = exp_num, description = description)
        
    const = 10
    encoder, decoder = model
    # encoder = EncoderClass(vocab_dim = len(word2idx) + const, 
    #                 num_layers=2, emb_dim=100, hidden_dim=100)
    encoder.to(device)
    # decoder = DecoderClass(vocab_dim=len(word2idx) + const, 
    #                 output_dim=len(tag2idx), num_layers=2, emb_dim=100, hidden_dim=100)
    decoder.to(device)

    optimizer = torch.optim.Adam(
        list(encoder.parameters()) + list(decoder.parameters()), lr=1e-3)

    trainer(count_of_epoch = count_of_epoch,
        batch_size = 64,
        model = (encoder, decoder),
        dataset = (train_words, train_tags), 
        loss_function = loss_function,
        optimizer = optimizer,
        callback = call)
    
    loss, percent = check_quality((encoder, decoder), (test_words, test_tags), loss_function)
    print(f"loss: {loss}, percent: {percent}")


В эксперименте 2 добавили dropout. Размер словаря взят по количеству токенов слов.


In [8]:
vocab_dim = len(word2idx) + 10
layers_dim = 64
# hidden_dim = 128,
num_layers = 2
dropout = 0.4
# hidden_layers = 1
epoch = 2
exp_num = 2
encoder = Encoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

loss: 0.5350184440612793, percent: 12.894022364364272


В эксперименте 1 убрали dropout, качество снизилось

In [9]:
vocab_dim = len(word2idx) + 10
layers_dim = 64
# hidden_dim = 128,
num_layers = 2
dropout = 0.
# hidden_layers = 1
epoch = 10
exp_num = 1
encoder = Encoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

loss: 0.48940831422805786, percent: 13.878790667709723


В эксперименте 0 меньше размерность слоев.
Ничего не изменилось

In [14]:
vocab_dim = len(word2idx) + 10
layers_dim = 10
# hidden_dim = 128,
num_layers = 2
dropout = 0.
# hidden_layers = 1
epoch = 2
exp_num = 0
encoder = Encoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

loss: 0.6479489803314209, percent: 26.60715107450186


В эксперименте 3 больше слоев.
Обучение доходит до счета в 26% и стопорится 

In [12]:
vocab_dim = len(word2idx) + 10
layers_dim = 64
# hidden_dim = 128,
num_layers = 2
dropout = 0.
# hidden_layers = 1
epoch = 10
exp_num = 3
encoder = Encoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

loss: 0.6446879506111145, percent: 26.570337306152503


В эксперименте 4 словарь больше в 2 раза + batchnorm у encoder

In [22]:
vocab_dim = len(word2idx) * 2 + 10
layers_dim = 64
# hidden_dim = 128,
num_layers = 2
dropout = 0.
# hidden_layers = 1
epoch = 2
exp_num = 4
encoder = Encoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

RuntimeError: running_mean should contain 39 elements not 72446

Эксперимент 6
Cловарь в 2 раза больше, без batchnorm
Качество падает. Видимо, избыточность лишняя

In [17]:
vocab_dim = len(word2idx) * 2 + 10
layers_dim = 64
# hidden_dim = 128,
num_layers = 2
dropout = 0.
# hidden_layers = 1
epoch = 2
exp_num = 6
batch_norm=False
encoder = Encoder(is_batch_norm=batch_norm, num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(is_batch_norm=batch_norm, num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

KeyboardInterrupt: 

Эксперимент 7
batch norm

Качество ниже в 2 раза, хотя loss даже меньше, интересно

In [20]:
vocab_dim = len(word2idx) + 10
layers_dim = 64
# hidden_dim = 128,
num_layers = 2
dropout = 0.
# hidden_layers = 1
epoch = 2
exp_num = 7
batch_norm=True
encoder = Encoder(is_batch_norm=True, num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, p=dropout)
decoder = Decoder(is_batch_norm=True, num_layers=num_layers, vocab_dim=vocab_dim, emb_dim=layers_dim, hidden_dim=layers_dim, output_dim=len(tag2idx), p=dropout)
experiment((encoder, decoder), exp_num, epoch, 
           f"{epoch} эпох, dropout {dropout}, {num_layers} слоев, размерностью {layers_dim}. "
           f"Словарь размером {vocab_dim}")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

  0%|          | 0/164 [00:00<?, ?it/s]

loss: 0.5146270990371704, percent: 13.998435414845153
