In [1]:
# Put the parent directory first on the module search path.
# NOTE(review): presumably this is what makes the project-local imports
# (`models`, `training_utils`, `params`) resolvable — confirm the layout.
import sys
sys.path.insert(0, '../')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data
import time
from pathlib import Path

from importlib import reload
import models
import training_utils as tu

from params import SEED
from torch.utils.tensorboard import SummaryWriter

import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter

In [3]:
# Seed PyTorch's RNG and ask cuDNN for deterministic kernels so runs are repeatable.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Index of the GPU this notebook prefers to run on.
GPU_INDEX = 2

# Fall back to CPU when CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only select the preferred GPU when it actually exists. Calling
# torch.cuda.set_device unconditionally raises on CPU-only machines and on
# machines with fewer than GPU_INDEX + 1 GPUs; in those cases the default
# device chosen above is used instead.
if torch.cuda.is_available() and GPU_INDEX < torch.cuda.device_count():
    torch.cuda.set_device(GPU_INDEX)

In [4]:
from torchtext import vocab

In [5]:
# Load the pre-trained 300-d word vectors from the .vec file; torchtext keeps
# its cache in '../embeddings'. Words without a pre-trained vector are
# initialised in place with torch.Tensor.normal_.
custom_embeddings = vocab.Vectors(
    name='../embeddings/ft_native_300_ru_wiki_lenta_lemmatize.vec',
    cache='../embeddings',
    unk_init=torch.Tensor.normal_,
)

In [6]:
# Russian stop-word list taken from NLTK's `stopwords` corpus.
# NOTE(review): `russian_stopwords` is not referenced by any other cell
# visible in this notebook — confirm it is still needed.
from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

In [7]:
from torch.optim import Adam

In [8]:
# Field for already-numeric input: no vocabulary lookup (use_vocab=False),
# values become float tensors. It is attached to the 'jacs' JSON key in the
# `fields` mapping below.
NUM = data.Field(use_vocab=False, dtype=torch.float)

In [9]:
def make_padder(min_len=4):
    """Build a function that right-pads short token lists with '<unk>'.

    Args:
        min_len: minimum length the returned function guarantees.

    Returns:
        A callable ``pad(seq)``. If ``seq`` already has at least ``min_len``
        items it is returned unchanged (the very same object); otherwise a
        new list is returned consisting of ``seq`` followed by enough
        '<unk>' tokens to reach ``min_len``.
    """
    def pad(seq):
        missing = min_len - len(seq)
        # Guard clause: long-enough sequences pass through untouched.
        if missing <= 0:
            return seq
        return seq + ['<unk>'] * missing
    return pad

In [11]:
# Text field: every token list is passed through a padder so that it holds
# at least 4 tokens.
TEXT = data.Field(preprocessing=make_padder(min_len=4))
# Target field for the class label.
LABEL = data.LabelField()

# JSON key -> (attribute name on each example, Field that processes it).
fields = {
    'category_id': ('label', LABEL),
    'lemmatized': ('text', TEXT),
    'jacs': ('num', NUM),
}

In [12]:
# Read the train / validation splits from JSON files under '../data/json',
# mapping JSON keys to example attributes according to `fields`.
train_data, valid_data = data.TabularDataset.splits(
    path='../data/json',
    train='train.json',
    validation='valid.json',
    format='json',
    fields=fields,
)


In [20]:
from torchtext import vocab

# Load the pre-trained vectors from the .vec file (cache dir '../embeddings');
# missing words are initialised with torch.Tensor.normal_.
# NOTE(review): this repeats the identical `vocab.Vectors(...)` load performed
# in an earlier cell of this notebook — one of the two is probably redundant.
custom_embeddings = vocab.Vectors(
    name='../embeddings/ft_native_300_ru_wiki_lenta_lemmatize.vec',
    cache='../embeddings',
    unk_init=torch.Tensor.normal_,
)

In [73]:
# NOTE(review): MAX_VOCAB_SIZE is defined here but never passed to
# build_vocab, so the vocabulary is not capped — confirm whether
# `max_size=MAX_VOCAB_SIZE` was intended.
MAX_VOCAB_SIZE = 25_000
# Build the token vocabulary from the training split and attach the
# pre-trained vectors to it.
TEXT.build_vocab(train_data, vectors=custom_embeddings)
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)
# Report the resulting vocabulary and label-set sizes.
print(f'Vocab size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocab size: 296604
Number of classes: 54


In [22]:
BATCH_SIZE = 64

# Batch iterators over both splits. Examples are keyed by token count for
# bucketing; batches themselves are not sorted internally. Tensors are
# created on `device`.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.text),
    sort_within_batch=False,
    device=device,
)

In [79]:
# Re-import the project `models` module so that edits made to it since the
# first import are picked up, then bind the CNN_jaccard class as `CNN`.
reload(models)
from models import CNN_jaccard as CNN
# Hyper-parameters passed positionally to the model constructor below.
INPUT_DIM = len(TEXT.vocab)            # one embedding row per vocabulary entry
EMBEDDING_DIM = custom_embeddings.dim  # taken from the pre-trained vectors
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
OUTPUT_DIM = len(LABEL.vocab)          # one output per label
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
# NOTE(review): the meaning of LINEAR_SIZES is defined by CNN_jaccard, which is
# not visible here; the hard-coded 54 equals the number of classes printed
# above — confirm whether it should be derived instead of hard-coded.
LINEAR_SIZES = [54, 200] 
model = CNN(INPUT_DIM, OUTPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, LINEAR_SIZES,  DROPOUT, PAD_IDX)

In [80]:
# Pre-trained vectors aligned with the TEXT vocabulary, and the index of the
# unknown-word token.
embeddings = TEXT.vocab.vectors
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Overwrite the embedding layer's weights in place with the pre-trained
# vectors, then zero out the rows for the unknown and padding tokens.
embedding_weights = model.embedding.weight.data
embedding_weights.copy_(embeddings)
for special_idx in (UNK_IDX, PAD_IDX):
    embedding_weights[special_idx] = torch.zeros(EMBEDDING_DIM)

In [81]:
# Adam with its default settings over every model parameter, and
# cross-entropy as the classification loss.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [82]:
# Move the model and the loss module onto the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [84]:
import shutil

log_dir = '../runs/cnn_jacs'
# Start every run from a clean log directory. Path.rmdir() only removes
# *empty* directories, so it raises OSError as soon as a previous run has
# written files there; shutil.rmtree removes the directory with its contents.
if Path(log_dir).exists():
    shutil.rmtree(log_dir)

# TensorBoard writer logging into the fresh directory.
writer = SummaryWriter(log_dir)

In [None]:
# Run training through the project's training_utils helper, passing the model,
# both iterators, the optimizer, the loss and the TensorBoard writer.
# NOTE(review): the meaning of the positional arguments 'cnn_jacs', 10 and '_'
# is defined by training_utils.train_model, which is not visible here —
# presumably a run/model name and an epoch count; confirm.
tu.train_model(model, train_iterator, valid_iterator, optimizer,criterion,'cnn_jacs', 10, '_',writer)

	Epoch: 1 | Epoch Time: 5m 36:s 
	Epoch: 2 | Epoch Time: 5m 38:s 


'ᶦ'