In [19]:
import numpy as np

import torch
import torch.nn as nn
from torch.optim import Adam 
from torchtext import data, vocab
from torch.utils.tensorboard import SummaryWriter

from importlib import reload

import sys
sys.path.insert(0, '../')
import training_utils as tu
import models

from params import SEED, N_SPLITS

from nltk.corpus import stopwords
# Russian stop-word list from NLTK; later cells pass it on as the `stop_words`
# entry of the text-processing options.
try:
    russian_stopwords = stopwords.words("russian")
except LookupError:
    # The stopwords corpus is not installed on this machine: fetch it once and
    # retry, so the notebook also runs in a fresh environment.
    import nltk
    nltk.download("stopwords")
    russian_stopwords = stopwords.words("russian")


In [15]:
# Fix the PyTorch seed and request deterministic cuDNN kernels for repeatable runs.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# GPU this notebook was originally pinned to. Selecting it unconditionally
# crashes on CPU-only or single-GPU hosts, so fall back gracefully instead.
PREFERRED_GPU = 1
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU if torch.cuda.device_count() > PREFERRED_GPU else 0
    torch.cuda.set_device(gpu_index)
    # An index-less 'cuda' device refers to the current device selected above.
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [16]:
# Pretrained word vectors read from a local .vec file, with '../embeddings'
# given as the cache location. `unk_init` is the initializer handed to
# torchtext for tokens that have no pretrained vector.
custom_embeddings = vocab.Vectors(
    name='../embeddings/ft_native_300_ru_wiki_lenta_lemmatize.vec',
    cache='../embeddings',
    unk_init=torch.Tensor.normal_,
)

In [17]:
# Field for the extra numeric inputs: no vocabulary lookup, values kept as floats.
NUMS = data.Field(
    use_vocab=False,
    dtype=torch.float,
)

In [36]:
# Experiment 1 configuration: pretrained embeddings plus the extra numeric
# 'jacs' field. These names are consumed by the cross-validation call in the
# next cell.
model_kwargs = {
    'embedding_dim': custom_embeddings.dim,
    'hidden_dim': 100,
    'n_layers': 2,
    'bidirectional': True,
    'mlp_sizes': [54, 100],
    'dropout': 0.5,
}

# Where trained models are stored and where the k-fold data is read from.
model_path = '../trained_models/rnn/pretrained_embs_jacs'
data_path = '../data/json/kfolds/'
vocab_kwargs = {
    'unk_init': torch.Tensor.normal_,
}
label_field = 'category_id'
text_field = 'lemmatized'
# Maps the 'jacs' data key to the ('nums', NUMS) field pair.
other_fields = {'jacs': ('nums', NUMS)}

process_text = {
    'stop_words': russian_stopwords,
    'include_lengths': True,
}
# No extra options for label processing or for the optimizer.
process_labels = {}
optimizer_kwargs = {}
criterion = nn.CrossEntropyLoss()
batch_size = 64
n_epochs = 10

# TensorBoard log directory (plain string: there is nothing to interpolate).
writer_path = '../runs/rnn_pretrained_embs_jacs'
writer = SummaryWriter(writer_path)

In [38]:
# Re-import the helper modules so edits made since kernel start are picked up.
reload(tu)
reload(models)
from models import RNN_plus_MLP as RNN

# Cross-validate experiment 1. The second result is stored as `final_1`
# (previously misspelled `final_2`, which let the next experiment overwrite it).
try:
    best_1, final_1 = tu.cross_val_score(
        RNN, model_kwargs, model_path,
        custom_embeddings, vocab_kwargs,
        data_path,
        label_field, text_field, other_fields,
        process_text, process_labels,
        Adam, optimizer_kwargs, criterion,
        batch_size, n_epochs, writer, device,
    )
finally:
    # Flush pending TensorBoard events and release the log file handle.
    writer.close()

In [39]:
# Experiment 2: RNN with pretrained embeddings and no extra fields.
model_kwargs = {
    'embedding_dim': custom_embeddings.dim,
    'hidden_dim': 100,
    'n_layers': 2,
    'bidirectional': True,
    'dropout': 0.5,
}

# Where trained models are stored and where the k-fold data is read from.
model_path = '../trained_models/rnn/pretrained_embs'
data_path = '../data/json/kfolds/'
vocab_kwargs = {
    'unk_init': torch.Tensor.normal_,
}
label_field = 'category_id'
text_field = 'lemmatized'
other_fields = {}

process_text = {
    'stop_words': russian_stopwords,
    'include_lengths': True,
}
# No extra options for label processing or for the optimizer.
process_labels = {}
optimizer_kwargs = {}
criterion = nn.CrossEntropyLoss()
batch_size = 64
n_epochs = 8

# TensorBoard log directory (plain string: there is nothing to interpolate).
writer_path = '../runs/rnn_pretrained_embs'
writer = SummaryWriter(writer_path)

# Re-import the helper modules so edits made since kernel start are picked up.
reload(tu)
reload(models)
from models import RNN

try:
    best_2, final_2 = tu.cross_val_score(
        RNN, model_kwargs, model_path,
        custom_embeddings, vocab_kwargs,
        data_path,
        label_field, text_field, other_fields,
        process_text, process_labels,
        Adam, optimizer_kwargs, criterion,
        batch_size, n_epochs, writer, device,
    )
finally:
    # Flush pending TensorBoard events and release the log file handle.
    writer.close()

In [40]:
# Experiment 3: RNN without pretrained embeddings (None is passed in their
# place), using a fixed 200-dimensional embedding size.
model_kwargs = {
    'embedding_dim': 200,
    'hidden_dim': 100,
    'n_layers': 2,
    'bidirectional': True,
    'dropout': 0.5,
}

# Where trained models are stored and where the k-fold data is read from.
model_path = '../trained_models/rnn/vanila'
data_path = '../data/json/kfolds/'
# Unlike the pretrained-embedding runs, no 'unk_init' is supplied here
# (presumably it only matters when pretrained vectors are used; confirm in training_utils).
vocab_kwargs = {}
label_field = 'category_id'
text_field = 'lemmatized'
other_fields = {}

process_text = {
    'stop_words': russian_stopwords,
    'include_lengths': True,
}
# No extra options for label processing or for the optimizer.
process_labels = {}
optimizer_kwargs = {}
criterion = nn.CrossEntropyLoss()
batch_size = 64
n_epochs = 8

# TensorBoard log directory (plain string: there is nothing to interpolate).
writer_path = '../runs/rnn_vanila'
writer = SummaryWriter(writer_path)

# Re-import the helper modules so edits made since kernel start are picked up.
reload(tu)
reload(models)
from models import RNN

try:
    best_3, final_3 = tu.cross_val_score(
        RNN, model_kwargs, model_path,
        None, vocab_kwargs,
        data_path,
        label_field, text_field, other_fields,
        process_text, process_labels,
        Adam, optimizer_kwargs, criterion,
        batch_size, n_epochs, writer, device,
    )
finally:
    # Flush pending TensorBoard events and release the log file handle.
    writer.close()

In [45]:
# Average of the `best_*` results for the three experiments, shown side by side.
tuple(np.mean(scores) for scores in (best_1, best_2, best_3))

(0.8860179137115448, 0.8862751726935876, 0.879361325386947)

In [47]:
# Average of the `final_*` results for experiments 2 and 3.
tuple(np.mean(scores) for scores in (final_2, final_3))

(0.8862584871970384, 0.879361325386947)

Тут случился некоторый факап, и финальная точность первой модели потерялась, но это не особо важно: по лучшей точности (0.8860 с числовыми фичами против 0.8863 без них) мы видим, что добавление числовых фичей несколько ухудшает результаты. Так что давайте дальше рассматривать модель без доп. фичей и с pretrained embeddings.

In [48]:
# Experiment 4: a larger RNN (hidden size 200, 3 layers) with pretrained
# embeddings and no extra fields.
model_kwargs = {
    'embedding_dim': custom_embeddings.dim,
    'hidden_dim': 200,
    'n_layers': 3,
    'bidirectional': True,
    'dropout': 0.5,
}

# Where trained models are stored and where the k-fold data is read from.
model_path = '../trained_models/rnn/pretrained_embs_deeper'
data_path = '../data/json/kfolds/'
vocab_kwargs = {
    'unk_init': torch.Tensor.normal_,
}
label_field = 'category_id'
text_field = 'lemmatized'
other_fields = {}

process_text = {
    'stop_words': russian_stopwords,
    'include_lengths': True,
}
# No extra options for label processing or for the optimizer.
process_labels = {}
optimizer_kwargs = {}
criterion = nn.CrossEntropyLoss()
batch_size = 64
n_epochs = 8

# TensorBoard log directory (plain string: there is nothing to interpolate).
writer_path = '../runs/rnn_pretrained_embs_deeper'
writer = SummaryWriter(writer_path)

# Re-import the helper modules so edits made since kernel start are picked up.
reload(tu)
reload(models)
from models import RNN

# NOTE(review): this rebinds best_2 / final_2, replacing the scores of the
# earlier, smaller pretrained-embeddings run; the cell below reads these names.
try:
    best_2, final_2 = tu.cross_val_score(
        RNN, model_kwargs, model_path,
        custom_embeddings, vocab_kwargs,
        data_path,
        label_field, text_field, other_fields,
        process_text, process_labels,
        Adam, optimizer_kwargs, criterion,
        batch_size, n_epochs, writer, device,
    )
finally:
    # Flush pending TensorBoard events and release the log file handle.
    writer.close()

In [50]:
# Average `best` and `final` results of the most recent run stored in best_2 / final_2.
tuple(np.mean(scores) for scores in (best_2, final_2))

(0.8877118044353111, 0.8869308800056757)