In [1]:
import numpy as np
import pandas as pd

import sys
sys.path.insert(0,'../')

import torch
import torch.nn as nn
from torch.optim import Adam
from torchtext import data
from torchtext import vocab
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

from params import SEED
from models import RNN
import training_utils as tu

from nltk.corpus import stopwords
russian_stopwords = stopwords.words("russian")

In [2]:
train = pd.read_json('../data/json/train.json', orient='records', lines=True)
valid = pd.read_json('../data/json/valid.json', orient='records', lines=True)

In [196]:
column_trans = ColumnTransformer(
    [
        ('tfidf', TfidfVectorizer(tokenizer=lambda x:x, lowercase=False, ngram_range=(1,3)), 'lemmatized'), 
    ])


pipe_lsvc = make_pipeline(column_trans, LinearSVC())

In [197]:
pipe_lsvc.fit(train, train['category_id'])

Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

Accuracy на тренировочной выборке

In [198]:
pred_train_lsvc = pipe_lsvc.predict(train)
print(accuracy_score(pred_train_lsvc, train['category_id']))

0.9977758654590119


Accuracy на отложенной выборке

In [199]:
pred_valid_lsvc = pipe_lsvc.predict(valid)
print(accuracy_score(pred_valid_lsvc, valid['category_id']))

0.8933751429972218


In [11]:
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda')
torch.cuda.set_device(2)

In [12]:
TEXT = data.Field(stop_words=russian_stopwords, include_lengths=True)
LABEL = data.LabelField()
train_fields = {'category_id': ('label', LABEL), 'tokens': ('text', TEXT)}

data_path = '../data/json'

custom_embeddings = vocab.Vectors(name = '../embeddings/custom/custom.vec',
                                  cache = '../embeddings',
                                  unk_init = torch.Tensor.normal_)

train_data = data.TabularDataset(
    path=Path(data_path, f'train.json'),
    format='json',
    fields=train_fields,
)

ID = data.Field(sequential=False, use_vocab=False)
test_fields = {'category_id': ('label', LABEL), 'item_id': ('id', ID), 'tokens': ('text', TEXT)}
test_data = data.TabularDataset(
    path=Path(data_path, f'valid.json'),
    format='json',
    fields=test_fields,
)

TEXT.build_vocab(train_data, vectors=custom_embeddings, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [13]:
model_path = '../trained_models/final_valid'
n_epochs = 10
batch_size = 64 

input_dim = len(TEXT.vocab)
output_dim = len(LABEL.vocab)
embedding_dim = custom_embeddings.dim
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

hidden_dim = 100
n_layers = 2
bidirectional = True
dropout = 0.5


model = RNN(input_dim,
            output_dim,
            embedding_dim,
            hidden_dim,
            n_layers,
            bidirectional,
            dropout,
            pad_idx)

In [14]:
embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(embeddings)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

In [15]:
train_iterator = data.BucketIterator(train_data,
                                     batch_size=batch_size,
                                     sort_key=lambda ex:len(ex.text),
                                     sort_within_batch=True,
                                     device=device
                                    )

test_iterator = data.BucketIterator(test_data,
                                     batch_size=batch_size,
                                     sort_key=lambda ex:len(ex.text),
                                     sort_within_batch=True,
                                     device=device
                                    )

In [16]:
optimizer = Adam(model.parameters())
critertion = nn.CrossEntropyLoss().to(device)
model = model.to(device)

In [17]:
writer = SummaryWriter('../runs/final_valid')

In [18]:
best, final = tu.train_model(model,
            train_iterator,
            test_iterator,
            optimizer,
            critertion,
            model_path,
            n_epochs,
            '_',
            writer)

In [19]:
best, final

(0.8936683006535948, 0.8898828975905001)

In [20]:
model.load_state_dict(torch.load(model_path))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [124]:
def predict(model, iterator):

    result = torch.LongTensor()
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            ids = batch.id.unsqueeze(1)

            predictions = model.forward(text, text_lengths).squeeze(1)
            predictions = predictions.argmax(dim=1, keepdim=True)
            batch_result = torch.cat([ids, predictions], dim=1).to('cpu')
            result = torch.cat([result, batch_result], dim=0)
    result = result.numpy()

    result = pd.DataFrame(result, columns=['item_id', 'category_id'])
    return result

In [125]:
preds = predict(model, test_iterator)

In [156]:
itos = {i:LABEL.vocab.itos[i] for i in range(len(LABEL.vocab))}
preds['category_id'] = preds['category_id'].map(itos)

In [160]:
results = preds.merge(valid[['item_id','category_id']],
                      left_on='item_id', 
                      right_on='item_id',
                      suffixes=('_pred','_true'))

Accuracy на отложенной выборке:

In [193]:
accuracy_score(results['category_id_true'].values, results['category_id_pred'])

0.8873079751593398

Посчитаем теперь accuracy  на остальных уровнях иерархии на отложенно выборке

In [168]:
import json
with open('../data/label_converter.json') as f:
    label_converter = json.load(f)
    
label_converter = {int(k): {int(k1):v1 for k1,v1 in v.items()}
 for k, v in label_converter.items()
}

def accuracy_on_level(results, level):
    true = results['category_id_true'].map(label_converter[level]).values
    preds = results['category_id_pred'].map(label_converter[level]).values
    return accuracy_score(true, preds)

In [191]:
accuracy_on_level(results, 0)

0.9636787056708612

In [190]:
accuracy_on_level(results, 1)

0.9450686386664487

In [192]:
accuracy_on_level(results, 2)

0.8911586860598137

Тоже самое для SVC

In [204]:
results_lsvc = pd.DataFrame({'category_id_true':valid.category_id.values,
                             'category_id_pred':pred_valid_lsvc
                            } )

In [205]:
accuracy_on_level(results_lsvc, 0)

0.9655478836411179

In [207]:
accuracy_on_level(results_lsvc, 1)

0.9490419186141527

In [206]:
accuracy_on_level(results_lsvc, 2)

0.8969807158032358

In [208]:
accuracy_score(pred_valid_lsvc,valid.category_id.values)

0.8933751429972218