In [52]:
from os.path import join as pathjoin
from data_processing import *
from interpretation import *
from models import *
from training import *
import tqdm

In [2]:
import os

# Root directories of the evaluation data and of the trained model artefacts.
# The defaults are the original machine-specific locations; set the
# NB_DATA_DIR / NB_MODELS_DIR environment variables to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get('NB_DATA_DIR', '/home/mlepekhin/data')
MODELS_DIR = os.environ.get('NB_MODELS_DIR', '/home/mlepekhin/models')

In [64]:
# Name of the pretrained transformer handed to the model / dataset-reader
# builders further down, and a token-count constant.
transformer_model = 'DeepPavlov/rubert-base-cased'
MAX_TOKENS = 512

# Sub-directory names (under MODELS_DIR) of the three trained models.
BERT_MODEL_ID, CNN_MODEL_ID, LSTM_MODEL_ID = (
    'allennlp_rubert',
    'allennlp_simple_cnn_ru',
    'allennlp_simple_lstm_ru',
)


def _best_checkpoint(model_id):
    """Return the path of the ``best.th`` checkpoint stored for ``model_id``."""
    return pathjoin(MODELS_DIR, model_id, 'checkpoints', 'best.th')


BERT_BEST_MODEL = _best_checkpoint(BERT_MODEL_ID)
CNN_BEST_MODEL = _best_checkpoint(CNN_MODEL_ID)
LSTM_BEST_MODEL = _best_checkpoint(LSTM_MODEL_ID)

In [65]:
# Rebuild the transformer classifier on top of the vocabulary stored next to
# the model, then restore the weights of its best checkpoint.
# `from_files` is invoked on the class itself: creating an empty Vocabulary
# instance first only to call the loader on it is unnecessary.
vocab = Vocabulary.from_files(pathjoin(MODELS_DIR, BERT_MODEL_ID, 'vocab'))
model = build_transformer_model(vocab, transformer_model)
# map_location='cpu' allows a checkpoint that was saved from a GPU to be read
# on a machine without CUDA; load_state_dict then copies the values into the
# parameters the model already owns.
model.load_state_dict(torch.load(BERT_BEST_MODEL, map_location='cpu'))

# Predictor wrapper used below to obtain per-class probabilities for raw text.
bert_predictor = TextClassifierPredictor(
    model,
    dataset_reader=build_transformer_dataset_reader(transformer_model, lower=True)
)

Building the model


In [14]:
# Rebuild the CNN classifier with the hyper-parameters given here and the
# vocabulary stored next to the model, then restore its best checkpoint.
# `from_files` is invoked on the class itself: creating an empty Vocabulary
# instance first only to call the loader on it is unnecessary.
cnn_vocab = Vocabulary.from_files(pathjoin(MODELS_DIR, CNN_MODEL_ID, 'vocab'))
cnn_model = build_simple_cnn_model(
    cnn_vocab, emb_size=256, output_dim=128, num_filters=32, ngram_filter_sizes=(2, 3, 4, 5)
)
# map_location='cpu' allows a checkpoint that was saved from a GPU to be read
# on a machine without CUDA; load_state_dict then copies the values into the
# parameters the model already owns.
cnn_model.load_state_dict(torch.load(CNN_BEST_MODEL, map_location='cpu'))

# Predictor wrapper used below to obtain per-class probabilities for raw text.
cnn_predictor = TextClassifierPredictor(
    cnn_model,
    dataset_reader=build_dataset_reader(None, lower=True)
)

Building the model


In [17]:
# Read the 'ru_test' CSV from DATA_DIR and preview its first rows.
test_path = pathjoin(DATA_DIR, 'ru_test')
test_df = pd.read_csv(test_path)
test_df.head()

Unnamed: 0.1,Unnamed: 0,target,text
0,726,A7,Глава 1 Приступая к работе 1.1 Знакомство с те...
1,1871,A17,Kawasaki D-Tracker С недавних пор Kawasaki d-t...
2,1265,A17,"По моему , вполне достойные книги , может и не..."
3,205,A11,Тест-драйв Lada Granta : новая надежда автогра...
4,141,A8,"среда , 2 декабря 2009 года , 12.33 Бумага всё..."


In [112]:
# Label <-> index lookups taken from the 'labels' namespace of the BERT
# vocabulary, plus the CNN vocabulary's index -> label map for comparison.
index_to_token = vocab.get_index_to_token_vocabulary('labels')
token_to_index = vocab.get_token_to_index_vocabulary('labels')
cnn_index_to_token = cnn_vocab.get_index_to_token_vocabulary('labels')
print(index_to_token)
print(cnn_index_to_token)

# The ensembling cells add BERT and CNN probability vectors position by
# position and decode the result with `index_to_token`, which is only valid
# if both vocabularies number the labels identically. Fail loudly instead of
# relying on a visual comparison of the two printed dicts.
assert index_to_token == cnn_index_to_token, (
    'BERT and CNN vocabularies index the labels differently; '
    'their probability vectors cannot be mixed position-wise'
)

{0: 'A8', 1: 'A12', 2: 'A1', 3: 'A14', 4: 'A11', 5: 'A17', 6: 'A16', 7: 'A4', 8: 'A9', 9: 'A7'}
{0: 'A8', 1: 'A12', 2: 'A1', 3: 'A14', 4: 'A11', 5: 'A17', 6: 'A16', 7: 'A4', 8: 'A9', 9: 'A7'}


In [72]:
# Collect, for every test text, the 'probs' vector returned by each of the
# two neural predictors.
probs_bert, probs_cnn = [], []

# Only the texts are needed here (the targets were never used in the loop).
# Iterating over the array directly also gives tqdm a sized iterable, so the
# progress bar can show a total and an ETA instead of a bare counter.
for text in tqdm.tqdm(test_df.text.values):
    probs_bert.append(bert_predictor.predict(text)['probs'])
    probs_cnn.append(cnn_predictor.predict(text)['probs'])

# One row per test text in both arrays.
probs_bert = np.array(probs_bert)
probs_cnn = np.array(probs_cnn)

483it [02:02,  3.94it/s]


In [97]:
# Equal-weight blend of the BERT and CNN probability vectors.
probs_mix = 0.5 * probs_bert + 0.5 * probs_cnn

# Decode every probability matrix into a list of label names by looking up
# the highest-scoring column of each row in `index_to_token`.
predicted_bert, predicted_cnn, predicted_mix = (
    [index_to_token[best_column] for best_column in np.argmax(prob_matrix, axis=1)]
    for prob_matrix in (probs_bert, probs_cnn, probs_mix)
)

In [98]:
# Report the metrics printed by calc_classifier_metrics for the BERT
# predictions against the test targets.
# NOTE(review): in the recorded outputs the per-label "precision" denominators
# are the same for every model and add up to the test-set size (483), which is
# how recall behaves. The (predicted, true) argument order used here may be
# the reverse of what calc_classifier_metrics expects, i.e. the precision and
# recall columns may be swapped - TODO confirm against its definition.
calc_classifier_metrics(np.array(predicted_bert), test_df.target.values)

label (argum) f1_score 0.7080745341614907 precision 0.7402597402597403 recall 0.6785714285714286
label (fictive) f1_score 0.7999999999999999 precision 0.8695652173913043 recall 0.7407407407407407
label (instruct) f1_score 0.8275862068965517 precision 0.7058823529411765 recall 1.0
label (reporting) f1_score 0.9483568075117371 precision 0.9805825242718447 recall 0.9181818181818182
label (legal) f1_score 0.8148148148148148 precision 0.8461538461538461 recall 0.7857142857142857
label (personal) f1_score 0.6516853932584269 precision 0.5918367346938775 recall 0.725
label (commercial) f1_score 0.9467455621301775 precision 0.9411764705882353 recall 0.9523809523809523
label (research) f1_score 0.8627450980392157 precision 0.8979591836734694 recall 0.8301886792452831
label (info) f1_score 0.4680851063829786 precision 0.3333333333333333 recall 0.7857142857142857
label (eval) f1_score 0.6582278481012658 precision 0.7647058823529411 recall 0.5777777777777777
accuracy 0.8095238095238095


In [99]:
# Report the metrics printed by calc_classifier_metrics for the CNN
# predictions against the test targets.
# NOTE(review): the per-label "precision" denominators in the recorded output
# equal the true class counts (they sum to 483), so the precision and recall
# columns may be swapped because of the (predicted, true) argument order -
# TODO confirm against calc_classifier_metrics. The recorded accuracy of 0.7
# is also not a multiple of 1/483 and disagrees with the per-label rows, so
# this output looks stale; re-run to verify.
calc_classifier_metrics(np.array(predicted_cnn), test_df.target.values)

label (argum) f1_score 0.6107784431137725 precision 0.6623376623376623 recall 0.5666666666666667
label (fictive) f1_score 0.2926829268292683 precision 0.2608695652173913 recall 0.3333333333333333
label (instruct) f1_score 0.6666666666666667 precision 0.5882352941176471 recall 0.7692307692307693
label (reporting) f1_score 0.9299999999999999 precision 0.9029126213592233 recall 0.9587628865979382
label (legal) f1_score 0.7096774193548387 precision 0.8461538461538461 recall 0.6111111111111112
label (personal) f1_score 0.577319587628866 precision 0.5714285714285714 recall 0.5833333333333334
label (commercial) f1_score 0.9101796407185628 precision 0.8941176470588236 recall 0.926829268292683
label (research) f1_score 0.7526881720430106 precision 0.7142857142857143 recall 0.7954545454545454
label (info) f1_score 0.46874999999999994 precision 0.45454545454545453 recall 0.4838709677419355
label (eval) f1_score 0.6315789473684211 precision 0.7058823529411765 recall 0.5714285714285714
accuracy 0.7

In [100]:
# Report the metrics printed by calc_classifier_metrics for the equal-weight
# BERT + CNN blend against the test targets.
# NOTE(review): the per-label "precision" denominators in the recorded output
# equal the true class counts (they sum to 483), so the precision and recall
# columns may be swapped because of the (predicted, true) argument order -
# TODO confirm against calc_classifier_metrics.
calc_classifier_metrics(np.array(predicted_mix), test_df.target.values)

label (argum) f1_score 0.7204968944099378 precision 0.7532467532467533 recall 0.6904761904761905
label (fictive) f1_score 0.8260869565217391 precision 0.8260869565217391 recall 0.8260869565217391
label (instruct) f1_score 0.8000000000000002 precision 0.7058823529411765 recall 0.9230769230769231
label (reporting) f1_score 0.9569377990430622 precision 0.970873786407767 recall 0.9433962264150944
label (legal) f1_score 0.7586206896551724 precision 0.8461538461538461 recall 0.6875
label (personal) f1_score 0.6956521739130435 precision 0.6530612244897959 recall 0.7441860465116279
label (commercial) f1_score 0.9341317365269461 precision 0.9176470588235294 recall 0.9512195121951219
label (research) f1_score 0.8712871287128713 precision 0.8979591836734694 recall 0.8461538461538461
label (info) f1_score 0.5098039215686274 precision 0.3939393939393939 recall 0.7222222222222222
label (eval) f1_score 0.7 precision 0.8235294117647058 recall 0.6086956521739131
accuracy 0.8178053830227743


In [101]:
import pickle
from os.path import join as pathjoin

def save_model(predictor, vectorizer, model_dir):
    """Pickle ``predictor`` and ``vectorizer`` into ``model_dir``.

    Two files are written inside ``model_dir``: ``predictor`` and
    ``vectorizer``. The directory, including missing parents, is created when
    it does not exist; an existing directory is reused and its two files are
    overwritten.

    Args:
        predictor: any picklable object, stored in the file ``predictor``.
        vectorizer: any picklable object, stored in the file ``vectorizer``.
        model_dir: path of the target directory.
    """
    # Local import keeps this cell runnable on its own. os.makedirs replaces
    # the former `!mkdir` shell magic, which only works inside IPython and
    # complains when the directory is already there.
    import os

    os.makedirs(model_dir, exist_ok=True)
    for file_name, obj in (('predictor', predictor), ('vectorizer', vectorizer)):
        with open(pathjoin(model_dir, file_name), 'wb') as fout:
            pickle.dump(obj, fout)
        
def load_model(model_dir):
    """Load the ``(predictor, vectorizer)`` pair stored in ``model_dir``.

    Reads and unpickles the files ``predictor`` and ``vectorizer`` from
    ``model_dir`` and returns them as a 2-tuple, in that order. The classes
    of the pickled objects have to be importable (or already defined in the
    running session) at the time of the call.

    WARNING: unpickling can execute arbitrary code - only point this at
    directories whose contents you trust.
    """
    loaded = []
    for file_name in ('predictor', 'vectorizer'):
        # `with` closes each file handle deterministically.
        with open(pathjoin(model_dir, file_name), 'rb') as fin:
            loaded.append(pickle.load(fin))
    return tuple(loaded)

In [106]:
from scipy import sparse

class BigVectorizer:
    """Concatenation of word-level and character-level TF-IDF features.

    Holds two vectorizers, ``vect_word`` and ``vect_char``; every transform
    runs both on the same input and horizontally stacks the two sparse
    results (word features first, character features second).

    ``X`` is passed to both vectorizers in turn, so it has to be re-iterable
    (a list or a pandas Series, not a one-shot generator).
    """

    def __init__(self, max_word_features=5000, max_char_features=10000):
        """Create the two unfitted TF-IDF vectorizers.

        Args:
            max_word_features: ``max_features`` of the word n-gram vectorizer.
            max_char_features: ``max_features`` of the char n-gram vectorizer.
        """
        # Word 1- to 3-grams with the Russian stop-word list removed.
        self.vect_word = TfidfVectorizer(
            max_features=max_word_features, lowercase=True, analyzer='word',
            stop_words=stopwords.words('russian'), ngram_range=(1, 3), dtype=np.float32
        )
        # Character 3- to 6-grams. No stop_words here: assuming this is
        # scikit-learn's TfidfVectorizer, stop words are applied only when
        # analyzer == 'word', so passing them had no effect on the features
        # and merely triggered an "unused parameter" warning.
        self.vect_char = TfidfVectorizer(
            max_features=max_char_features, lowercase=True, analyzer='char',
            ngram_range=(3, 6), dtype=np.float32
        )

    @staticmethod
    def _stack(word_features, char_features):
        """Horizontally stack the word and character feature matrices."""
        return sparse.hstack([word_features, char_features])

    def fit(self, X):
        """Fit both vectorizers on ``X`` and return ``self``."""
        self.vect_word.fit(X)
        self.vect_char.fit(X)
        return self

    def fit_transform(self, X):
        """Fit both vectorizers on ``X`` and return the stacked features."""
        return self._stack(
            self.vect_word.fit_transform(X),
            self.vect_char.fit_transform(X),
        )

    def transform(self, X):
        """Return the stacked features of ``X`` using the fitted vectorizers."""
        return self._stack(
            self.vect_word.transform(X),
            self.vect_char.transform(X),
        )

In [103]:
# Restore the pickled predictor / vectorizer pair from the directory
# 'simple_lr' (presumably a logistic-regression model - verify).
# NOTE(review): the path is relative to the current working directory, unlike
# the MODELS_DIR-based paths used for the neural models - confirm intended.
# NOTE(review): if the pickled vectorizer is a BigVectorizer, that class has
# to be defined in the session before this call; the recorded execution
# counts (this cell 103, the class cell 106) suggest out-of-order runs.
lr_predictor, vectorizer = load_model('simple_lr')

In [109]:
# Vectorize the test texts, then ask the LR predictor for per-class
# probabilities (columns follow lr_predictor's own class order).
lr_test_features = vectorizer.transform(test_df.text)
lr_proba = lr_predictor.predict_proba(lr_test_features)

In [110]:
# Vectorize the test texts, then ask the LR predictor for hard label
# predictions.
lr_inputs = vectorizer.transform(test_df.text)
predicted_lr = lr_predictor.predict(lr_inputs)

In [111]:
# Report the metrics printed by calc_classifier_metrics for the LR
# predictions against the test targets.
# NOTE(review): the per-label "precision" denominators in the recorded output
# equal the true class counts (they sum to 483), so the precision and recall
# columns may be swapped because of the (predicted, true) argument order -
# TODO confirm against calc_classifier_metrics.
calc_classifier_metrics(np.array(predicted_lr), test_df.target.values)

label (argum) f1_score 0.7100591715976331 precision 0.7792207792207793 recall 0.6521739130434783
label (fictive) f1_score 0.7567567567567568 precision 0.6086956521739131 recall 1.0
label (instruct) f1_score 0.6923076923076924 precision 0.5294117647058824 recall 1.0
label (reporting) f1_score 0.9124423963133641 precision 0.9611650485436893 recall 0.868421052631579
label (legal) f1_score 0.9230769230769231 precision 0.9230769230769231 recall 0.9230769230769231
label (personal) f1_score 0.6732673267326732 precision 0.6938775510204082 recall 0.6538461538461539
label (commercial) f1_score 0.9239766081871345 precision 0.9294117647058824 recall 0.9186046511627907
label (research) f1_score 0.8200000000000001 precision 0.8367346938775511 recall 0.803921568627451
label (info) f1_score 0.39999999999999997 precision 0.2727272727272727 recall 0.75
label (eval) f1_score 0.7567567567567567 precision 0.8235294117647058 recall 0.7
accuracy 0.7971014492753623


In [116]:
# Re-order the LR probability columns so that they follow the label indexing
# of the neural models' vocabulary and can be blended with their outputs.
# Column j of `lr_proba` belongs to label lr_predictor.classes_[j];
# `token_to_index` gives the column that label occupies in the vocabulary.
index_to_index = {
    class_id: token_to_index[cl]
    for class_id, cl in enumerate(lr_predictor.classes_)
}

lr_proba_matrix = np.asarray(lr_proba)
target_columns = [index_to_index[i] for i in range(lr_proba_matrix.shape[1])]

# One vectorised column scatter instead of a Python loop over every cell.
# The width follows the vocabulary rather than the LR class count, so the
# result always lines up with the neural probability matrices; a label the
# LR model does not know simply keeps probability 0.
lr_proba_transformed = np.zeros((lr_proba_matrix.shape[0], len(token_to_index)))
lr_proba_transformed[:, target_columns] = lr_proba_matrix

In [131]:
# Weighted blend of the three models: 0.3 BERT, 0.4 LR (columns already
# re-ordered to the vocabulary indexing) and 0.3 CNN.
# NOTE(review): the weights appear hand-picked and are scored on the same
# test set below - confirm they were not tuned on it.
bert_part = 0.3 * probs_bert
lr_part = 0.4 * lr_proba_transformed
cnn_part = 0.3 * probs_cnn
probs_mix_lr = bert_part + lr_part + cnn_part

# Decode the highest-scoring column of every row into its label name, then
# report the metrics for the blended predictions.
predicted_mix_lr = [
    index_to_token[best_column] for best_column in np.argmax(probs_mix_lr, axis=1)
]
calc_classifier_metrics(np.array(predicted_mix_lr), test_df.target.values)

label (argum) f1_score 0.7272727272727272 precision 0.7792207792207793 recall 0.6818181818181818
label (fictive) f1_score 0.8292682926829269 precision 0.7391304347826086 recall 0.9444444444444444
label (instruct) f1_score 0.8387096774193549 precision 0.7647058823529411 recall 0.9285714285714286
label (reporting) f1_score 0.9573459715639812 precision 0.9805825242718447 recall 0.9351851851851852
label (legal) f1_score 0.8148148148148148 precision 0.8461538461538461 recall 0.7857142857142857
label (personal) f1_score 0.7368421052631581 precision 0.7142857142857143 recall 0.7608695652173914
label (commercial) f1_score 0.9404761904761904 precision 0.9294117647058824 recall 0.9518072289156626
label (research) f1_score 0.8627450980392157 precision 0.8979591836734694 recall 0.8301886792452831
label (info) f1_score 0.5306122448979591 precision 0.3939393939393939 recall 0.8125
label (eval) f1_score 0.7532467532467532 precision 0.8529411764705882 recall 0.6744186046511628
accuracy 0.8322981366459