In [None]:
import csv
import fasttext
import hashlib
import os
import pickle
import re
import scipy
import sklearn.metrics
import joblib

import numpy as np
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

In [None]:
# Base directory under which the pickled intent train/test files are looked up.
REPO_DIR = '/home/lyubanenko/data/nghack'

In [None]:
# Pickled train / test inputs, resolved relative to REPO_DIR.
# NOTE(review): TEST_FILE points at 'text.bin' while its sibling is 'train.bin' —
# confirm this is the real file name and not a typo for 'test.bin'.
TRAIN_FILE = os.path.join(REPO_DIR, 'intent/data/train.bin')
TEST_FILE = os.path.join(REPO_DIR, 'intent/data/text.bin')

# Paths for fastText-style text files (full set / test set).
# They are not referenced by any of the cells visible in this notebook section.
FASTTEXT_FULL_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_full.txt'
FASTTEXT_TEST_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_test.txt'

# Presumably the train / validation split files for fastText — TODO confirm; unused here as well.
FASTTEXT_TRAIN_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_train.txt'
FASTTEXT_VALID_FILE = '/home/lyubanenko/data/nghack_tmp/_intent_valid.txt'

In [None]:
# Load the pickled train / test frames.
# Context managers guarantee the file handles are closed even if unpickling fails
# (the bare `pickle.load(open(...))` form leaves them open until garbage collection).
# NOTE: pickle.load can execute arbitrary code — only load files from a trusted source.
with open(TRAIN_FILE, "rb") as train_fh:
    train_ = pickle.load(train_fh)
with open(TEST_FILE, "rb") as test_fh:
    test_ = pickle.load(test_fh)

# Sanity check: shapes of the raw frames before any filtering.
print(train_.shape, test_.shape)

In [None]:
def clean_text(text):
    """Normalise one utterance: lower-cased, single-line, trimmed, single-spaced.

    The value is coerced with ``str`` first, so non-string inputs are accepted.
    Returns the cleaned string.
    """
    edge_chars = "“ ”‘ ’«»\"'?!.;: "
    normalised = str(text).strip().lower().replace('\n', ' ')
    # Peel quotes, spaces and closing punctuation off both ends only;
    # the same characters inside the text are kept.
    normalised = normalised.strip(edge_chars)
    # Squash every run of spaces down to a single space.
    return re.sub(' +', ' ', normalised)

def process_df(data):
    """Filter out rows without usable text, clean the text and derive an integer target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'text' column and a 'fasttext_label' column whose values
        are strings of the form '__label__<int>'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) containing only rows with
        usable text, with 'text' passed through ``clean_text`` and an integer
        'target' column parsed from 'fasttext_label'.
    """
    # Drop the literal string 'nan' *and* real missing values: a genuine NaN
    # compares unequal to 'nan', so it would otherwise survive the filter and
    # be turned into the text "nan" by clean_text's str() coercion.
    has_text = data['text'].notna() & (data['text'] != 'nan')
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the caller's frame (SettingWithCopyWarning).
    data = data[has_text].copy()
    data['text'] = data['text'].apply(clean_text)
    data['target'] = data['fasttext_label'].apply(
        lambda label: int(label.replace('__label__', ''))
    )
    return data

# Run the same filtering / cleaning / target extraction over both splits.
train_ = process_df(train_)
test_ = process_df(test_)

# Shapes after processing, for comparison with the shapes printed at load time.
print(*(frame.shape for frame in (train_, test_)))

In [None]:
# Two TF-IDF views of the same text: character n-grams of length 1-5 ...
char_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
)
# ... and word uni-/bi-grams.
word_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
)
# Classifier trained on the concatenation of both feature sets.
model = LogisticRegression(
    C=10,
)

In [None]:
# Materialise the training texts once and fit each vectorizer on them.
train_texts = train_['text'].tolist()
X_chars = char_vectorizer.fit_transform(train_texts)
X_words = word_vectorizer.fit_transform(train_texts)

In [None]:
# Column-wise concatenation: char features first, word features second.
feature_blocks = [X_chars, X_words]
X = scipy.sparse.hstack(feature_blocks)

# Fit the classifier on the stacked features against the integer targets.
train_targets = train_['target'].tolist()
model.fit(X, train_targets)

In [None]:
# Featurise the held-out texts with the already-fitted vectorizers (transform only).
test_texts = test_['text'].tolist()
X_val_chars = char_vectorizer.transform(test_texts)
X_val_words = word_vectorizer.transform(test_texts)

# Same block order as at training time: char features, then word features.
val_blocks = [X_val_chars, X_val_words]
X_val = scipy.sparse.hstack(val_blocks)
val_preds = model.predict(X_val)

In [None]:
# Persist the fitted classifier and both vectorizers.
# The target directory is derived from REPO_DIR instead of repeating the
# absolute path, so the resulting file paths are unchanged.
MODELS_DIR = os.path.join(REPO_DIR, 'solution/models')

for artifact, filename in (
    (model, 'intent_tfidf.bin'),
    (char_vectorizer, 'char_vectorizer.bin'),
    (word_vectorizer, 'word_vectorizer.bin'),
):
    # `with` closes (and therefore flushes) each file deterministically;
    # `pickle.dump(obj, open(...))` leaves the handle open.
    with open(os.path.join(MODELS_DIR, filename), "wb") as fh:
        pickle.dump(artifact, fh)