In [1]:
import numpy as np

import torch
import torch.nn as nn
from torch.optim import Adam 
from torchtext import data, vocab
from torch.utils.tensorboard import SummaryWriter

from importlib import reload

import sys
sys.path.insert(0, '../')
import training_utils as tu
import models

from params import SEED, N_SPLITS


In [2]:
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda')
torch.cuda.set_device(1)

In [3]:
custom_embeddings = vocab.Vectors(name = '../embeddings/custom/custom.vec',
                                  cache = '../embeddings',
                                  unk_init = torch.Tensor.normal_)

  0%|          | 0/97568 [00:00<?, ?it/s]Skipping token b'97568' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 97313/97568 [00:40<00:00, 3746.89it/s]

In [4]:
NUMS = data.Field(use_vocab=False,dtype=torch.float)

In [9]:
model_kwargs = {
    'embedding_dim': custom_embeddings.dim,
    'hidden_dim': 100,
    'n_layers': 2,
    'bidirectional': True,
    'mlp_sizes' : [54, 100],
    'dropout': 0.5,
    
}


model_path = '../trained_models/rnn/my_embs'
data_path = '../data/json/kfolds/'
vocab_kwargs = {
    'unk_init': torch.Tensor.normal_
}
label_field = 'category_id'
text_field = 'tokens'
other_fields = {'jacs': ('nums', NUMS)}

process_text = {
    'include_lengths': True,
}
process_labels = {
    
}
optimizer_kwargs = {
    
}
criterion = nn.CrossEntropyLoss()
batch_size = 128
n_epochs = 10

writer_path = f'../runs/rnn_my_embs_jacs'
writer = SummaryWriter(writer_path)

In [10]:
reload(tu)
reload(models)
from models import RNN_plus_MLP as RNN

best_1, final_1 = tu.cross_val_score(RNN, model_kwargs, model_path,
                   custom_embeddings, vocab_kwargs,
                   data_path,
                   label_field,text_field, other_fields,
                   process_text, process_labels,
                   Adam, optimizer_kwargs, criterion,
                   batch_size, n_epochs, writer, device
                  )

In [None]:
model_kwargs = {
    'embedding_dim': custom_embeddings.dim,
    'hidden_dim': 100,
    'n_layers': 2,
    'bidirectional': True,
    'dropout': 0.5,
    
}


model_path = '../trained_models/rnn/pretrained_embs'
data_path = '../data/json/kfolds/'
vocab_kwargs = {
    'unk_init': torch.Tensor.normal_
}
label_field = 'category_id'
text_field = 'lemmatized'
other_fields = {}

process_text = {
    'stop_words': russian_stopwords,
    'include_lengths': True,
}
process_labels = {
    
}
optimizer_kwargs = {
    
}
criterion = nn.CrossEntropyLoss()
batch_size = 64
n_epochs = 8

writer_path = f'../runs/rnn_pretrained_embs'
writer = SummaryWriter(writer_path)

reload(tu)
reload(models)
from models import RNN as RNN

best_2, final_2 = tu.cross_val_score(RNN, model_kwargs, model_path,
                   custom_embeddings, vocab_kwargs,
                   data_path,
                   label_field,text_field, other_fields,
                   process_text, process_labels,
                   Adam, optimizer_kwargs, criterion,
                   batch_size, n_epochs, writer, device
                  )

In [None]:
model_kwargs = {
    'embedding_dim': 200,
    'hidden_dim': 100,
    'n_layers': 2,
    'bidirectional': True,
    'dropout': 0.5,
    
}


model_path = '../trained_models/rnn/vanila'
data_path = '../data/json/kfolds/'
vocab_kwargs = {
    #'unk_init': torch.Tensor.normal_
}
label_field = 'category_id'
text_field = 'lemmatized'
other_fields = {}

process_text = {
    'stop_words': russian_stopwords,
    'include_lengths': True,
}
process_labels = {
    
}
optimizer_kwargs = {
    
}
criterion = nn.CrossEntropyLoss()
batch_size = 64
n_epochs = 8

writer_path = f'../runs/rnn_vanila'
writer = SummaryWriter(writer_path)

reload(tu)
reload(models)
from models import RNN as RNN

best_3, final_3 = tu.cross_val_score(RNN, model_kwargs, model_path,
                   None, vocab_kwargs,
                   data_path,
                   label_field,text_field, other_fields,
                   process_text, process_labels,
                   Adam, optimizer_kwargs, criterion,
                   batch_size, n_epochs, writer, device
                  )

In [11]:
best_1.mean()

[0.8894652324632952,
 0.8924999108501509,
 0.8867909121357538,
 0.8911381487558091,
 0.8914828430982976]

In [None]:
np.mean(best_1), np.mean(best_2), np.mean(best_3)

In [None]:
np.mean(final_1), np.mean(final_2), np.mean(final_3)