In [None]:
import csv
import fasttext
import hashlib
import os
import pickle
import re
import scipy
import sklearn.metrics

import numpy as np
import pandas as pd

In [None]:
# Root of the repository checkout. The default is the original author's local
# path; set the NGHACK_REPO_DIR environment variable to run the notebook on
# another machine without editing this cell.
REPO_DIR = os.environ.get('NGHACK_REPO_DIR', '/home/lyubanenko/data/nghack')

In [None]:
# Pickled input splits inside the repository checkout; both are read with
# pickle.load in the next cell.
TRAIN_FILE = os.path.join(REPO_DIR, 'intent/data/train.bin')
# NOTE(review): the file name is 'text.bin', not 'test.bin' — confirm this is intentional.
TEST_FILE = os.path.join(REPO_DIR, 'intent/data/text.bin')

# Plain-text dumps of the `output` column for the full training set and the
# test set (written with np.savetxt further down). Absolute, machine-specific paths.
FASTTEXT_FULL_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_full.txt'
FASTTEXT_TEST_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_test.txt'

# Plain-text dumps of the hash-based train / validation split of the training set.
FASTTEXT_TRAIN_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_train.txt'
FASTTEXT_VALID_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_valid.txt'

In [None]:
# Load the pickled train / test splits. The files are opened with `with` so the
# handles are closed as soon as each load finishes.
# SECURITY: pickle.load can execute arbitrary code embedded in the file —
# only load dumps that come from a trusted source.
with open(TRAIN_FILE, "rb") as train_fh:
    train_ = pickle.load(train_fh)
with open(TEST_FILE, "rb") as test_fh:
    test_ = pickle.load(test_fh)

# Sanity check on the loaded objects (presumably pandas DataFrames, given how
# later cells index them — TODO confirm).
print(train_.shape, test_.shape)

In [None]:
def clean_text(text):
    """Normalise one utterance.

    The value is converted to ``str``, stripped of surrounding whitespace and
    lower-cased; newlines become spaces; quotes, spaces and the punctuation
    ``? ! . ; :`` are trimmed from both ends; finally runs of spaces are
    collapsed to a single space.
    """
    edge_chars = "“ ”‘ ’«»\"'?!.;: "
    lowered = str(text).strip().lower()
    single_line = lowered.replace('\n', ' ')
    trimmed = single_line.strip(edge_chars)
    return re.sub(' +', ' ', trimmed)

def process_df(data):
    """Return a cleaned copy of ``data`` with missing texts removed.

    Rows whose ``text`` is missing (NaN/None) or is the literal string
    ``'nan'`` are dropped, then ``clean_text`` is applied to the remaining
    ``text`` values. The input frame is never modified.

    Args:
        data: DataFrame with a ``text`` column.

    Returns:
        A new DataFrame containing only the rows with usable text.
    """
    # Real NaN values would pass a plain `!= 'nan'` comparison and then be
    # turned into the text 'nan' by clean_text's str() call, so filter both.
    has_text = data['text'].notna() & (data['text'] != 'nan')
    # Work on an explicit copy: assigning a column on a filtered slice
    # triggers SettingWithCopyWarning and may not write where expected.
    cleaned = data.loc[has_text].copy()
    cleaned['text'] = cleaned['text'].apply(clean_text)
    return cleaned

# Run the same cleaning over both splits, train first and then test.
train_, test_ = (process_df(frame) for frame in (train_, test_))

# Shapes after filtering, for comparison with the shapes printed after loading.
print(train_.shape, test_.shape)

In [None]:
def add_fasttext_output(frame):
    """Return a copy of ``frame`` with an ``output`` column: "<fasttext_label> <text>".

    Built with vectorised string concatenation instead of a row-wise
    ``apply(axis=1)``: it is faster, still yields a Series for an empty frame,
    and ``assign`` returns a new frame so no column is written onto a slice.
    """
    lines = frame['fasttext_label'].astype(str) + ' ' + frame['text'].astype(str)
    return frame.assign(output=lines)


train_ = add_fasttext_output(train_)
test_ = add_fasttext_output(test_)

In [None]:
# Write one "<label> <text>" line per row. The encoding is pinned to UTF-8
# because the texts may contain non-ASCII characters (clean_text strips «» and
# curly quotes) and fastText expects UTF-8 input; without it the result depends
# on NumPy's / the platform's default encoding.
np.savetxt(FASTTEXT_FULL_FILE, train_.output.values, fmt='%s', delimiter=' ', encoding='utf-8')
np.savetxt(FASTTEXT_TEST_FILE, test_.output.values, fmt='%s', delimiter=' ', encoding='utf-8')

In [None]:
# Split by `hash` value rather than by row, so that all rows sharing a hash
# land on the same side of the train / validation split.
hashs = sorted(train_['hash'].unique())
hashs_len = len(hashs)

# Fraction of the distinct hashes that goes to the training part.
thresh = 0.8
split_at = int(thresh * hashs_len)
# The validation slice starts exactly at `split_at`. Starting at `split_at + 1`
# would silently drop one hash (and all of its rows) from both parts.
(train_hashs, valid_hashs) = (hashs[:split_at], hashs[split_at:])

train = train_[train_['hash'].isin(train_hashs)]
valid = train_[train_['hash'].isin(valid_hashs)]

print(train.shape, valid.shape, test_.shape)

In [None]:
# Dump the train / validation parts, one "<label> <text>" line per row.
# UTF-8 is requested explicitly so non-ASCII text is written the way fastText
# expects it instead of relying on NumPy's / the platform's default encoding.
np.savetxt(FASTTEXT_TRAIN_FILE, train.output.values, fmt='%s', delimiter=' ', encoding='utf-8')
np.savetxt(FASTTEXT_VALID_FILE, valid.output.values, fmt='%s', delimiter=' ', encoding='utf-8')

In [None]:
def eval_model(model, test):
    """Macro-averaged F1 of ``model``'s first predicted label per text.

    Args:
        model: object whose ``predict(list_of_texts)`` returns a sequence whose
            first element holds, for every text, a sequence of predicted labels
            (presumably a fastText model — confirm against the caller).
        test: frame exposing ``text`` and ``fasttext_label`` columns.

    Returns:
        ``sklearn.metrics.f1_score(..., average='macro')`` between
        ``test.fasttext_label`` and the first predicted label of each text.
    """
    label_lists = model.predict(test.text.tolist())[0]
    first_choice = [candidates[0] for candidates in label_lists]
    expected = test.fasttext_label.values
    return sklearn.metrics.f1_score(expected, first_choice, average='macro')

In [None]:
# Load the fastText model from the repository checkout. The path is derived
# from REPO_DIR (like TRAIN_FILE / TEST_FILE) instead of repeating the absolute
# path, so the notebook only needs one location changed to run elsewhere.
model_tune = fasttext.load_model(os.path.join(REPO_DIR, 'solution/models/intent.ftz'))
# Macro-F1 of the fastText model alone on the test split.
print('f1_score', eval_model(model_tune, test_))

In [None]:
# Load the pickled classifier and the two vectorizers used for its features.
# Paths are derived from REPO_DIR, and every file is opened with `with` so its
# handle is closed right after loading.
# SECURITY: pickle.load can execute arbitrary code embedded in the file —
# only load artefacts that come from a trusted source.
MODELS_DIR = os.path.join(REPO_DIR, 'solution/models')

with open(os.path.join(MODELS_DIR, 'intent_tfidf.bin'), "rb") as model_fh:
    tf_model = pickle.load(model_fh)
with open(os.path.join(MODELS_DIR, 'char_vectorizer.bin'), "rb") as char_fh:
    tf_char_vectorizer = pickle.load(char_fh)
with open(os.path.join(MODELS_DIR, 'word_vectorizer.bin'), "rb") as word_fh:
    tf_word_vectorizer = pickle.load(word_fh)

In [None]:
# Vectorise the test texts with both vectorizers (char first, then word).
X_val_chars, X_val_words = (
    vectorizer.transform(test_['text'].tolist())
    for vectorizer in (tf_char_vectorizer, tf_word_vectorizer)
)

# Stack the two sparse blocks side by side, chars before words.
# NOTE(review): this column order presumably has to match the one used when
# tf_model was fitted — confirm against the training code.
X_val = scipy.sparse.hstack([X_val_chars, X_val_words])
# Per-class probabilities, one row per test text.
tf_preds = tf_model.predict_proba(X_val)

In [None]:
# Load the fastText model; the path is derived from REPO_DIR like the data files.
ft_model = fasttext.load_model(os.path.join(REPO_DIR, 'solution/models/intent.ftz'))
# Predict with the model loaded above. The previous code called `best_model`,
# which is not defined anywhere in this notebook and raises NameError on a
# fresh kernel.
# NOTE(review): k=32 presumably equals the number of intent labels; if the
# model has a different number of labels the rows below will not line up with
# tf_preds — confirm.
ft_preds_raw = ft_model.predict(list(test_.text.values), k=32)

ft_preds = list()
for labels, probs in zip(ft_preds_raw[0], ft_preds_raw[1]):
    # predict() returns labels ordered by score; reorder each row by the integer
    # suffix of '__label__<n>' so position i holds the probability of label i.
    d = sorted(zip(labels, probs), key=lambda x: int(x[0].replace('__label__', '')))
    (labels, probs) = zip(*d)
    ft_preds.append(probs)

In [None]:
# Ensemble: average the two probability rows for each text and keep the index
# of the largest mean, formatted as a fastText label.
# NOTE(review): assumes both rows list the classes in the same order 0..N-1 — confirm.
res = [
    f'__label__{np.argmax(((pr1 + pr2) / 2))}'
    for pr1, pr2 in zip(tf_preds, ft_preds)
]

In [None]:
# Macro-F1 of the averaged ensemble predictions against the test labels.
print(
    'f1_score',
    sklearn.metrics.f1_score(
        y_true=test_['fasttext_label'].values,
        y_pred=res,
        average='macro',
    ),
)