In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply

In [2]:
# scikit-learn vendored joblib as `sklearn.externals.joblib` only up to 0.22
# (deprecated in 0.21, removed in 0.23). Try the vendored copy first so that
# behaviour is unchanged in the environment the pickles were written in, and
# fall back to the standalone package on newer installations.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artefacts produced by a trusted source.
# NOTE(review): the vectorizer is read from `sgd_classifier_v2/` while the
# classifiers loaded further down come from `sgd_classifier_v3/` - confirm the
# v3 classifiers were trained on features from this exact vectorizer.
vect = joblib.load('sgd_classifier_v2/tfidf.pkl')

In [3]:
from tqdm import tqdm_notebook

# Load the 1999 persisted classifiers (files sgd_0.pkl ... sgd_1998.pkl) in
# index order; the position in `sgd_list` is the classifier's index.
# SECURITY: joblib.load unpickles its input - only load trusted files.
sgd_path_template = 'sgd_classifier_v3/sgd_{}.pkl'
sgd_list = [
    joblib.load(sgd_path_template.format(clf_index))
    for clf_index in tqdm_notebook(range(1999))
]




In [4]:
%%time
test_questions = pd.read_csv('./question_eval_set.txt', sep='\t', names=['id', 'c', 'w', 'd_c', 'd_w'])
X_test = vect.transform(test_questions.w.astype('U'))

CPU times: user 15.5 s, sys: 1.3 s, total: 16.8 s
Wall time: 17.7 s


In [6]:
def predict(clf_list, length=2999967, start=0, word_tfidf=X_test):
    """Score one slice of a feature matrix with every classifier in ``clf_list``.

    Parameters
    ----------
    clf_list : iterable
        Classifiers exposing ``predict_proba``.
    length : int
        Maximum number of rows to score. The slice is simply shorter when
        fewer rows are available.
    start : int
        Index of the first row to score.
    word_tfidf : matrix-like
        Feature matrix supporting row slicing. The default is whatever
        ``X_test`` referred to when this ``def`` was executed; rebinding
        ``X_test`` later does not change it.

    Returns
    -------
    list of 1-D numpy arrays
        One array per classifier, in ``clf_list`` order, holding
        ``1 - predict_proba(...)[:, 0]`` for every row of the slice. A
        classifier whose ``predict_proba`` raises contributes an all-zero
        array of the same length and a warning is emitted.
    """
    import warnings

    # The slice does not depend on the classifier, so take it once.
    batch = word_tfidf[start:start + length]
    batch_shape = getattr(batch, 'shape', None)
    n_rows = batch_shape[0] if batch_shape is not None else len(batch)

    predict_list = []
    for position, clf in enumerate(tqdm_notebook(clf_list)):
        try:
            # Column 0 is the probability of the classifier's first class.
            result = 1 - np.asarray(clf.predict_proba(batch))[:, 0]
        except Exception as exc:
            # Best-effort fallback kept from the original, but no longer
            # silent and no longer swallowing KeyboardInterrupt/SystemExit.
            warnings.warn(
                'predict_proba failed for classifier #{} ({!r}); '
                'using zeros'.format(position, exc)
            )
            # Size the fallback to the actual slice, not to `length`, so all
            # per-classifier results have the same length.
            result = np.zeros(n_rows)
        predict_list.append(result)
    return predict_list

In [8]:
%%time
questions = pd.read_csv('./question_train_word.csv')
questions_topics = questions.topics.apply(lambda s: s.split(','))
# question_titles = questions.title.astype('U').apply(lambda s: s.split(','))

CPU times: user 27.2 s, sys: 5.58 s, total: 32.7 s
Wall time: 34.3 s


In [9]:
%%time

from collections import Counter
topic_count = Counter([t for ts in questions_topics for t in ts])
topic_most_common = np.array(topic_count.most_common())
topic_map = {i: v[0] for i, v in enumerate(topic_most_common)}

CPU times: user 1.52 s, sys: 52.3 ms, total: 1.57 s
Wall time: 1.58 s


In [None]:
# Drop the reference to the raw training frame; only `questions_topics`
# (derived from it above) is used by the later cells shown here.
del questions

In [10]:
def transform(predict_list):
    """Convert ``predict_list`` to a numpy array and return its transpose.

    For a list with one score sequence per classifier this yields one row
    per sample, with one column per classifier.
    """
    scores = np.array(predict_list)
    return np.transpose(scores)

# import heapq
def top_five(predict_list):
    """Return the indices of the five largest values, largest first.

    Fewer than five indices are returned when the input is shorter.
    (A heapq.nlargest-based variant was tried previously and dropped in
    favour of argsort.)
    """
    ascending = np.argsort(np.asarray(predict_list))
    largest_first = ascending[::-1]
    return largest_first[:5]


def transform_predict(predict_list):
    """Map per-classifier scores to the five best topic ids for every sample.

    ``predict_list`` is transposed with ``transform`` so that each row holds
    one sample's scores; the indices picked by ``top_five`` are then looked
    up in the module-level ``topic_map``. Returns a list with one list of
    topic ids per sample, best-scoring first.
    """
    return [
        [topic_map[clf_index] for clf_index in top_five(sample_scores)]
        for sample_scores in transform(predict_list)
    ]

In [11]:
%%time
# Score the first 217360 rows of X_test with every loaded classifier.
# NOTE(review): 217360 is presumably the number of evaluation questions -
# confirm it equals X_test.shape[0].
predict_list = predict(sgd_list, 217360, word_tfidf=X_test)


CPU times: user 5min 40s, sys: 1min 18s, total: 6min 59s
Wall time: 7min 32s


In [12]:
# Turn the per-classifier scores into five topic ids per question.
%time result = transform_predict(predict_list)

CPU times: user 1min 3s, sys: 1min 22s, total: 2min 26s
Wall time: 3min 2s


In [14]:
# One output line per question: "<question id>,<topic 1>,...,<topic 5>".
# The id is looked up by label, which matches the row position for the
# default integer index of `test_questions`.
result_t = [
    '{},{}'.format(test_questions.id[row_no], ','.join(topics))
    for row_no, topics in enumerate(result)
]
result_t[0]
# '6215603645409872328,6006627476560013656,-5872443091340192918,4610596224687453206,2339809570377332086,-7506384235581390893'

'6215603645409872328,4610596224687453206,-7506384235581390893,1797701934329285597,6756871902441838709,8852526557827312102'

In [15]:
# Write one pre-formatted line per question, without header or index.
# Each value already contains commas; sep=' ' presumably keeps pandas from
# quoting them as it would with the default ',' separator - TODO confirm.
pd.Series(result_t).to_csv('sgd_result_v3.csv', header=False, index=False, sep=' ')

In [16]:
# Display the first loaded classifier to inspect its hyper-parameters.
sgd_list[0]

SGDClassifier(alpha=1e-07, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=1, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [8]:
# Score the first 10**4 rows of predict()'s default matrix.
# NOTE(review): no `word_tfidf` is passed, so this uses the default bound in
# the `def predict(..., word_tfidf=X_test)` signature, even though the result
# is named "train". The execution count shows the cell was run out of order;
# confirm which matrix was actually scored.
predict_train_list = predict(sgd_list, 10**4)




In [12]:
# Turn the scores in `predict_train_list` into five topic ids per row.
%time result_train = transform_predict(predict_train_list)

CPU times: user 1.99 s, sys: 73.5 ms, total: 2.06 s
Wall time: 2.08 s


In [14]:
# `evaluate` comes from a project-local module that is not shown here.
from evaluate import evaluate
# zip stops at the shorter input: the labels are sliced to 10**3 entries, so
# at most the first 1000 predictions are scored even though
# `predict_train_list` was requested with 10**4 rows.
# NOTE(review): `questions_topics` holds training labels, so `result_train`
# must come from the matching training rows for this score to be meaningful
# - confirm.
evaluate(zip(result_train, questions_topics[:10**3]))

(0.5247243359089663, 1.830051516061564, 0.7356566085847854, 1731, 2353)

In [None]:
%%time
# Same call as the earlier prediction cell (In [11]); this copy has no
# execution count, and running it overwrites `predict_list`.
predict_list = predict(sgd_list, 217360, word_tfidf=X_test)