In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply

In [2]:
most_common_tag = '4697014490911193675'

In [3]:
%%time
# Load the training questions from the CSV in the working directory.
questions = pd.read_csv('./question_train_word.csv')
# `topics` is a comma-separated string per question; split it into a list of topic ids.
questions_topics = questions.topics.apply(lambda s: s.split(','))
# Titles are cast to str first (presumably so missing titles do not break `.split`),
# then split on commas into a list of word tokens per question.
question_titles = questions.title.astype('U').apply(lambda s: s.split(','))

CPU times: user 27.9 s, sys: 7.59 s, total: 35.5 s
Wall time: 40.6 s


In [4]:
question_words = [w for ws in question_titles for w in ws]

In [5]:
print(len(question_words))

38726571


In [6]:
%%time
from collections import Counter
word_counter = Counter(question_words)

CPU times: user 8.36 s, sys: 1.29 s, total: 9.65 s
Wall time: 10.6 s


In [7]:
word_most_common = word_counter.most_common()
print(len(word_most_common))
print(word_most_common[:10])
print(word_most_common[-10:])

324960
[('w111', 2976600), ('w6', 1921785), ('w11', 1474827), ('w109', 552983), ('w54', 547054), ('w25', 526597), ('w471', 433385), ('w4016', 419135), ('w1110', 389917), ('w10147', 351932)]
[('w1138549', 1), ('w195062', 1), ('w344968', 1), ('w543817', 1), ('w1138560', 1), ('w99918', 1), ('w219988', 1), ('w451129', 1), ('w268154', 1), ('w1138582', 1)]


In [8]:
word_low_frequency = [w[0] for w in word_most_common if w[1] <= 5]
print(len(word_low_frequency))

204962


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
# Bag-of-words counts followed by TF-IDF weighting.
# The words listed in `word_low_frequency` are passed as stop words, so they
# are excluded from the vocabulary.
vector = Pipeline([('vect', CountVectorizer(stop_words=word_low_frequency)),
                   ('tfidf', TfidfTransformer())])

In [10]:
input_question_titles = [' '.join(t) for t in question_titles]
print(input_question_titles[0])

w305 w13549 w22752 w11 w7225 w2565 w1106 w16 w31389 w6 w1019 w69288 w111 w3332 w109 w11 w25 w1110 w111


In [11]:
%%time
word_tfidf = vector.fit_transform(input_question_titles)

CPU times: user 1min 1s, sys: 4.84 s, total: 1min 6s
Wall time: 1min 10s


In [12]:
print(word_tfidf.shape)

(2999967, 119998)


In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
%%time
# Binary target: 1 when the question carries `most_common_tag`, else 0.
y_most_common_tag = [
    1 if most_common_tag in topics else 0
    for topics in questions_topics
]

CPU times: user 1.01 s, sys: 1.03 s, total: 2.04 s
Wall time: 2.41 s


In [15]:
sum(y_most_common_tag[:1000])

18

In [16]:
%%time
rf = RandomForestClassifier(n_estimators=30, verbose=1, n_jobs=-1)
rf.fit(word_tfidf[:1000], y_most_common_tag[:1000])

CPU times: user 780 ms, sys: 42 ms, total: 822 ms
Wall time: 426 ms


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.3s finished


In [17]:
rf.score(word_tfidf[1000:2000], y_most_common_tag[1000:2000])

[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.0s finished


0.98999999999999999

In [18]:
%%time
rf_list = [RandomForestClassifier(n_estimators=30, n_jobs=-1) for _ in range(1000)]

CPU times: user 9.83 ms, sys: 1.85 ms, total: 11.7 ms
Wall time: 11.5 ms


In [19]:
topics_info = pd.read_csv('./question_topic_train_set.txt', sep='\t', names=['q_id', 't_ids'])
topic_count = Counter([t for ts in topics_info.t_ids.astype('U') for t in ts.split(',')])
print(topic_count.most_common(3))

[('7476760589625268543', 66259), ('4697014490911193675', 49428), ('-4653836020042332281', 45482)]


In [20]:
topic_most_common = topic_count.most_common()

In [166]:
from tqdm import tqdm_notebook

In [21]:
%%time
y_tag_list = []
def tag_index(i, length=len(questions_topics)):
    """Build the binary label vector for the i-th most frequent topic.

    Parameters
    ----------
    i : int
        Position of the topic in ``topic_most_common`` (0 = most frequent).
    length : int, optional
        Number of leading questions to label. The default is evaluated once,
        when this definition runs, as ``len(questions_topics)``.

    Returns
    -------
    list of int
        One entry per question in ``questions_topics[:length]``: 1 when the
        topic id appears among that question's topics, otherwise 0.
    """
    wanted = topic_most_common[i][0]
    return [1 if wanted in topic_ids else 0
            for topic_ids in questions_topics[:length]]

from tqdm import tqdm

for i in tqdm(range(1000)):
    y_tag_list.append(tag_index(i, 10000))

100%|██████████| 1000/1000 [00:02<00:00, 402.38it/s]

CPU times: user 2.39 s, sys: 78.4 ms, total: 2.47 s
Wall time: 2.5 s





In [55]:
for i in tqdm(range(1000)):
    _rf = rf_list[i]
    _rf.fit(word_tfidf[:1000], y_tag_list[i][:1000])


  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 1/1000 [00:00<07:30,  2.22it/s][A
  0%|          | 2/1000 [00:00<06:29,  2.56it/s][A
  0%|          | 3/1000 [00:00<05:18,  3.13it/s][A
  0%|          | 4/1000 [00:01<04:58,  3.34it/s][A
  0%|          | 5/1000 [00:01<04:43,  3.51it/s][A
  1%|          | 6/1000 [00:01<04:30,  3.67it/s][A
  1%|          | 7/1000 [00:01<04:57,  3.33it/s][A
  1%|          | 8/1000 [00:02<04:40,  3.53it/s][A
  1%|          | 9/1000 [00:02<04:28,  3.69it/s][A
  1%|          | 10/1000 [00:02<04:49,  3.42it/s][A
  1%|          | 11/1000 [00:03<04:36,  3.58it/s][A
  1%|          | 12/1000 [00:03<03:55,  4.20it/s][A
  1%|▏         | 13/1000 [00:03<03:58,  4.14it/s][A
  1%|▏         | 14/1000 [00:03<04:03,  4.05it/s][A
  2%|▏         | 15/1000 [00:03<03:36,  4.55it/s][A
  2%|▏         | 16/1000 [00:04<03:44,  4.37it/s][A
  2%|▏         | 17/1000 [00:04<03:19,  4.94it/s][A
  2%|▏         | 18/1000 [00:04<03:01,  5.40it/s][A
  2%|▏    

In [30]:
rf_list[1].score(word_tfidf[:1000], y_tag_list[1])

0.996

In [56]:
# For each of the 1000 forests, score the first 1000 training rows.
# The score kept per row is 1 minus the first column of predict_proba.
predict_list = []
for i in tqdm(range(1000)):
    # Fixme: rf_list[2]
    predict_list.append([1-j[0] for j in rf_list[i].predict_proba(word_tfidf[:1000])])

100%|██████████| 500/500 [00:54<00:00,  9.17it/s]


In [24]:
def transform(predict_list):
    """Transpose per-classifier score rows into per-sample score rows.

    Parameters
    ----------
    predict_list : sequence of sequences
        ``predict_list[j][i]`` is the score classifier ``j`` gave sample ``i``.
        The number of samples is taken from the first row.

    Returns
    -------
    list of list
        ``result[i][j] == predict_list[j][i]``.
    """
    n_samples = len(predict_list[0])
    n_models = len(predict_list)
    return [[predict_list[m][s] for m in range(n_models)]
            for s in range(n_samples)]

import heapq
def top_five(predict_list):
    """Return the positions of the five largest scores, best first.

    Scores are paired with their position as ``(score, position)`` before
    ranking, so equal scores are ordered by the larger position first.
    Fewer than five positions are returned when the input is shorter.
    """
    scored = ((score, pos) for pos, score in enumerate(predict_list))
    best = heapq.nlargest(5, scored)
    return [pos for _, pos in best]

topic_map = {i: v[0] for i, v in enumerate(topic_most_common)}

def transform_predict(predict_list):
    """Turn per-classifier scores into the top topic ids for every sample.

    The scores are transposed with ``transform`` so each row holds one
    sample's scores across classifiers, the best positions are picked with
    ``top_five``, and each position is mapped to its topic id via
    ``topic_map``.

    Returns
    -------
    list of list
        For every sample, the topic ids of its highest-scoring classifiers,
        best first.
    """
    return [
        [topic_map[rank] for rank in top_five(sample_scores)]
        for sample_scores in transform(predict_list)
    ]

In [59]:
%pdb
result = transform_predict(predict_list)

Automatic pdb calling has been turned ON


In [60]:
result[0]

['3738968195649774859',
 '7739004195693774975',
 '-965420152559594547',
 '-3174907002942471215',
 '-7965124443634034243']

In [62]:
from evaluate import evaluate
evaluate(zip(result, questions_topics[:1000]))

(0.5223471575440352, 1.9062320934397274, 0.7195070123246918, 1693, 2353)

In [70]:
def train_predict_range(X):
    """Score ``X`` with each of the first 1000 forests in ``rf_list``.

    Returns a list with one entry per forest; each entry is a list holding,
    for every row of ``X``, 1 minus the first column of ``predict_proba``.
    """
    per_model_scores = []
    for model_idx in tqdm(range(1000)):
        # Fixme: rf_list[2]
        class_probs = rf_list[model_idx].predict_proba(X)
        per_model_scores.append([1 - row[0] for row in class_probs])
    return per_model_scores

In [71]:
_predict = train_predict_range(word_tfidf[1000:2000])
evaluate(zip(transform_predict(_predict), questions_topics[1000:2000]))

100%|██████████| 1000/1000 [01:47<00:00,  9.17it/s]


(0.06295013090174784, 0.22488706142708415, 0.08742088607594936, 221, 2528)

### check overall time cost

In [25]:
%%time
y_tag = tag_index(0)

CPU times: user 784 ms, sys: 1.04 s, total: 1.82 s
Wall time: 2.28 s


In [None]:
%%time
rf = RandomForestClassifier(n_estimators=30, verbose=2, n_jobs=-1)
rf.fit(word_tfidf, y_tag)

building tree 1 of 30
building tree 3 of 30building tree 2 of 30building tree 4 of 30




In [119]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss="log", penalty="l2")

In [120]:
%%time
clf.fit(word_tfidf[:10000], y_tag[:10000])

CPU times: user 24.6 ms, sys: 6.41 ms, total: 31.1 ms
Wall time: 34.7 ms


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [122]:
from evaluate import evaluate
print(clf.score(word_tfidf[:10000], y_tag[:10000]))
print(clf.score(word_tfidf[10000:20000], y_tag[10000:20000]))

0.9999
0.9994


In [35]:
clf.predict_proba(word_tfidf[:10])

array([[ 0.99574532,  0.00425468],
       [ 0.99306338,  0.00693662],
       [ 0.99206804,  0.00793196],
       [ 0.99414264,  0.00585736],
       [ 0.99617903,  0.00382097],
       [ 0.9952333 ,  0.0047667 ],
       [ 0.99768908,  0.00231092],
       [ 0.9934937 ,  0.0065063 ],
       [ 0.99420678,  0.00579322],
       [ 0.99000853,  0.00999147]])

In [37]:
y_tag_list[0][:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [64]:
sgd_list = [SGDClassifier(loss="modified_huber", penalty="l2") for _ in range(1000)]

In [65]:
# sgd_list[i].fit(word_tfidf[:1000], y_tag_list[i][:1000])
# Refit the single `clf` on the labels of topic index 3, first 1000 rows only.
clf.fit(word_tfidf[:1000], y_tag_list[3][:1000])
# NOTE(review): `i` is not assigned in this cell; it is whatever value an
# earlier loop left behind. Confirm whether index 3 was intended here.
len(y_tag_list[i][:1000])

1000

In [66]:
# Fit one SGD classifier per topic on the first 10000 rows.
# Topics with no positive label in that slice are not fitted; their index is
# recorded in `filter_list` instead.
filter_list = []
for i in tqdm(range(1000)):
    _sgd = sgd_list[i]
    # skip topics whose labels are all zero in the training slice
    if not sum(y_tag_list[i][:10000]):
        filter_list.append(i)
        continue
    _sgd.fit(word_tfidf[:10000], y_tag_list[i][:10000])

100%|██████████| 1000/1000 [00:11<00:00, 68.18it/s]


In [58]:
print(len(filter_list))
sgd_list_work = [s for i, s in enumerate(sgd_list) if i not in filter_list]
print(len(sgd_list_work))

27
973


In [170]:

def predict(clf_list, length=2999967, start=0, word_tfidf=word_tfidf):
    """Score a slice of the feature matrix with every classifier.

    Parameters
    ----------
    clf_list : iterable
        Classifiers exposing ``predict_proba``.
    length : int, optional
        Maximum number of rows to score.
    start : int, optional
        Index of the first row to score.
    word_tfidf : matrix, optional
        Feature matrix to slice. The default is bound to the global
        ``word_tfidf`` that exists when this definition runs.

    Returns
    -------
    list
        One entry per classifier. Each entry holds, for every scored row,
        1 minus the first column of ``predict_proba``. A classifier whose
        ``predict_proba`` call fails (for example because it was never
        fitted) contributes a zero vector of the same height.
    """
    # The slice is the same for every classifier, so take it once.
    X = word_tfidf[start:start + length]
    # Use the real slice height: it can be smaller than `length` at the end
    # of the matrix, and every row must have the same size for `transform`.
    n_rows = X.shape[0]
    predict_list = []
    for clf in tqdm_notebook(clf_list):
        try:
            result = [1 - j[0] for j in clf.predict_proba(X)]
        except Exception:
            # Best effort: keep going with an all-zero score. `Exception`
            # (not a bare except) lets Ctrl-C interrupt a long run.
            result = np.zeros(n_rows)
        predict_list.append(result)
    return predict_list

In [67]:
predict_list = predict(sgd_list, 10000)

100%|██████████| 1000/1000 [00:07<00:00, 127.00it/s]


## train result is good

In [68]:
result = transform_predict(predict_list)
from evaluate import evaluate
evaluate(zip(result, questions_topics[:10000]))
# log: (0.320640219340169, 1.17715423397403, 0.4406734569778048, 11595, 26312)

(0.5516961015800961, 2.1273217689302975, 0.744869261173609, 19599, 26312)

## `modified_huber`: great score
but maybe **overfitting**

In [71]:
predict_list = predict(sgd_list, 1000, 10000)
result = transform_predict(predict_list)
evaluate(zip(result, questions_topics[10000:11000]))

100%|██████████| 1000/1000 [00:01<00:00, 935.24it/s]


(0.179914290747037, 0.6888552071915998, 0.2435152922957801, 629, 2583)

In [78]:
%%time
clf = SGDClassifier(loss="modified_huber", penalty="l2")
clf.fit(word_tfidf, y_tag)

CPU times: user 6.61 s, sys: 569 ms, total: 7.18 s
Wall time: 7.53 s


## 8s per topic

In [96]:
sgd_list = [SGDClassifier(loss="modified_huber", penalty="l2", n_jobs=-1) for _ in range(1999)]

In [86]:
%%time
test_questions = pd.read_csv('./question_eval_set.txt', sep='\t', names=['id', 'c', 'w', 'd_c', 'd_w'])

CPU times: user 3.26 s, sys: 568 ms, total: 3.83 s
Wall time: 4.15 s


In [92]:
%%time
test_title = [t.replace(',', ' ') for t in test_questions.w.astype('U')]
print(test_title[0])

w1340 w1341 w55 w1344 w58 w6 w24178 w26959 w471 w111 w642 w471 w3228 w20104 w19234 w6 w18505 w111
CPU times: user 137 ms, sys: 14.8 ms, total: 151 ms
Wall time: 151 ms


In [93]:
%%time
X_test = vector.transform(test_title)

CPU times: user 4.29 s, sys: 1.24 s, total: 5.53 s
Wall time: 6.31 s


In [94]:
X_test.shape

(217360, 119998)

## Training, real!

In [None]:
from sklearn.externals import joblib

# Fit one classifier per topic on the full training matrix and save each one.
for i in tqdm(range(1999)):
    sgd = sgd_list[i]
    y_tag = tag_index(i)
    sgd.fit(word_tfidf, y_tag)
    # Save the model that was just fitted. The earlier code dumped the
    # unrelated global `clf`, so every file held the same wrong classifier.
    joblib.dump(sgd, 'sgd_classifier/sgd_{}.pkl'.format(i))


  0%|          | 0/1999 [00:00<?, ?it/s][A
 81%|████████  | 1612/1999 [8:31:04<44:34,  6.91s/it]     

In [None]:
%%time
predict_list = predict(sgd_list, 217360, word_tfidf=X_test)

In [105]:
%%time
result = transform_predict(predict_list)

CPU times: user 5min 6s, sys: 1min 12s, total: 6min 18s
Wall time: 7min 6s


In [100]:
# Completion marker; the recorded output of this cell is "done".
print('done')

done


In [101]:
len(result)

217360

In [106]:
# Build one submission line per test question: "<question id>,<topic id>,...".
# NOTE(review): this rebinds `result` from lists of topic ids to strings, so
# running the cell a second time would join single characters instead.
# Recompute `result` with transform_predict before re-running.
result = [str(test_questions.id[i])+','+','.join(v) for i, v in enumerate(result)]
result[0]

'6215603645409872328,6006627476560013656,-5872443091340192918,4610596224687453206,2339809570377332086,-7506384235581390893'

In [111]:
pd.Series(result).to_csv('sgd_result.csv', header=False, index=False, sep=' ')
# test_questions['result']
# with open('sgd_result.csv', 'w') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerow(result)

## Score: 0.35762

## Optimise Params

In [112]:
from sklearn.model_selection import GridSearchCV

In [113]:
params = {'alpha': 10.0**-np.arange(1,7)}

In [115]:
text_clf = SGDClassifier(loss="modified_huber", penalty="l2")
gs_clf = GridSearchCV(text_clf, params, n_jobs=-1)

In [116]:
%%time
_ = gs_clf.fit(word_tfidf[:10000], y_tag_list[0][:10000])

CPU times: user 10.4 s, sys: 1min 12s, total: 1min 22s
Wall time: 2min 50s


In [117]:
for param_name in sorted(params.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

alpha: 0.10000000000000001


In [123]:
print(gs_clf.best_score_)
print(gs_clf.best_params_)

0.977
{'alpha': 0.10000000000000001}


In [138]:
sgd_test = SGDClassifier(loss="modified_huber", penalty="l2", alpha=0.001)
sgd_test.fit(word_tfidf[:10000], tag_index(0, 10000)[:10000])

SGDClassifier(alpha=0.001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [151]:
sgd_list_opt = [SGDClassifier(loss="modified_huber", penalty="l2", alpha=0.001) for _ in range(1000)]
def svg_fit(sgd_list, length):
    """Fit each classifier on the first ``length`` rows for its topic.

    Labels come from the precomputed ``y_tag_list`` (classifier ``k`` uses
    ``y_tag_list[k]``). A classifier whose label slice has no positive
    example is left unfitted and its index is appended to the global
    ``filter_list``.
    """
    for idx in tqdm(range(len(sgd_list))):
        labels = y_tag_list[idx][:length]
        if sum(labels) == 0:
            # Nothing to learn for this topic in the slice.
            filter_list.append(idx)
        else:
            sgd_list[idx].fit(word_tfidf[:length], labels)
svg_fit(sgd_list_opt, 10000)


  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 4/1000 [00:00<00:36, 27.21it/s][A
  1%|          | 9/1000 [00:00<00:32, 30.68it/s][A
  1%|▏         | 14/1000 [00:00<00:29, 33.11it/s][A
  2%|▏         | 18/1000 [00:00<00:28, 34.30it/s][A
  2%|▏         | 24/1000 [00:00<00:25, 37.59it/s][A
  3%|▎         | 28/1000 [00:00<00:25, 37.91it/s][A
  4%|▎         | 35/1000 [00:00<00:22, 42.65it/s][A
  4%|▍         | 40/1000 [00:00<00:23, 41.59it/s][A
  4%|▍         | 45/1000 [00:01<00:22, 42.24it/s][A
  5%|▌         | 50/1000 [00:01<00:21, 43.81it/s][A
  6%|▌         | 55/1000 [00:01<00:21, 44.53it/s][A
  6%|▌         | 60/1000 [00:01<00:22, 41.76it/s][A
  6%|▋         | 65/1000 [00:01<00:22, 41.85it/s][A
  7%|▋         | 71/1000 [00:01<00:20, 45.43it/s][A
  8%|▊         | 76/1000 [00:01<00:20, 44.22it/s][A
  8%|▊         | 81/1000 [00:01<00:22, 41.69it/s][A
  9%|▊         | 86/1000 [00:02<00:23, 39.63it/s][A
  9%|▉         | 91/1000 [00:02<00:22, 40.44it/s][A
 10

In [133]:
def evl(sgd_list):
    """Evaluate the classifiers on the first 10000 training questions.

    Scores the rows with ``predict``, converts them to top topic ids with
    ``transform_predict``, and returns whatever ``evaluate`` yields for the
    (predicted topics, true topics) pairs.
    """
    train_scores = predict(sgd_list, 10000)
    top_topics = transform_predict(train_scores)
    pairs = zip(top_topics, questions_topics[:10000])
    return evaluate(pairs)

In [152]:
evl(sgd_list_opt)


  0%|          | 0/1000 [00:00<?, ?it/s][A
  1%|          | 7/1000 [00:00<00:15, 65.37it/s][A
  1%|▏         | 14/1000 [00:00<00:14, 66.26it/s][A
  2%|▏         | 21/1000 [00:00<00:14, 67.12it/s][A
  3%|▎         | 29/1000 [00:00<00:13, 70.51it/s][A
  4%|▍         | 41/1000 [00:00<00:11, 80.07it/s][A
  5%|▌         | 54/1000 [00:00<00:10, 89.83it/s][A
  7%|▋         | 70/1000 [00:00<00:09, 102.51it/s][A
  9%|▊         | 86/1000 [00:00<00:08, 113.57it/s][A
 10%|█         | 102/1000 [00:00<00:07, 122.69it/s][A
 12%|█▏        | 118/1000 [00:01<00:06, 129.90it/s][A
 13%|█▎        | 134/1000 [00:01<00:06, 135.29it/s][A
 15%|█▌        | 150/1000 [00:01<00:06, 136.54it/s][A
 16%|█▋        | 165/1000 [00:01<00:06, 124.78it/s][A
 18%|█▊        | 178/1000 [00:01<00:06, 123.80it/s][A
 19%|█▉        | 191/1000 [00:01<00:06, 124.84it/s][A
 20%|██        | 204/1000 [00:01<00:06, 125.70it/s][A
 22%|██▏       | 217/1000 [00:01<00:06, 122.28it/s][A
 23%|██▎       | 230/1000 [00:01<00

(0.5458860089130148, 2.0812873690832965, 0.7399665551839465, 19470, 26312)

In [141]:
# predict_list = predict(sgd_list, 1000, 10000)
result_t = transform_predict(predict(sgd_list_opt, 1000, 10000))
evaluate(zip(result_t, questions_topics[10000:11000]))


  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 1/1000 [00:00<01:53,  8.80it/s][A
  2%|▏         | 17/1000 [00:00<01:20, 12.28it/s][A
  3%|▎         | 33/1000 [00:00<00:56, 16.98it/s][A
  5%|▌         | 52/1000 [00:00<00:40, 23.34it/s][A
  7%|▋         | 73/1000 [00:00<00:29, 31.77it/s][A
 10%|▉         | 96/1000 [00:00<00:21, 42.77it/s][A
 13%|█▎        | 132/1000 [00:00<00:14, 58.02it/s][A
 17%|█▋        | 166/1000 [00:00<00:10, 77.15it/s][A
 20%|█▉        | 198/1000 [00:00<00:08, 99.33it/s][A
 23%|██▎       | 226/1000 [00:01<00:06, 117.97it/s][A
 25%|██▌       | 252/1000 [00:01<00:06, 109.05it/s][A
 27%|██▋       | 273/1000 [00:01<00:05, 126.57it/s][A
 29%|██▉       | 294/1000 [00:01<00:05, 141.14it/s][A
 32%|███▏      | 323/1000 [00:01<00:04, 166.66it/s][A
 35%|███▌      | 353/1000 [00:01<00:03, 191.83it/s][A
 40%|███▉      | 395/1000 [00:01<00:02, 229.04it/s][A
 44%|████▍     | 440/1000 [00:01<00:02, 267.99it/s][A
 49%|████▊     | 486/1000 [00:02<00:

(0.19492309164982197, 0.7416286640663764, 0.26442121564072785, 683, 2583)

In [145]:
sgd_list_opt = [SGDClassifier(loss="modified_huber", penalty="l2", alpha=0.01) for _ in range(1000)]
svg_fit(sgd_list_opt, 10000)


  0%|          | 0/1000 [00:00<?, ?it/s][A
  0%|          | 3/1000 [00:00<00:44, 22.44it/s][A
  1%|          | 8/1000 [00:00<00:37, 26.17it/s][A
  1%|          | 12/1000 [00:00<00:34, 28.27it/s][A
  2%|▏         | 16/1000 [00:00<00:31, 30.89it/s][A
  2%|▏         | 22/1000 [00:00<00:27, 35.27it/s][A
  3%|▎         | 27/1000 [00:00<00:25, 38.03it/s][A
  3%|▎         | 31/1000 [00:00<00:27, 35.65it/s][A
  4%|▎         | 36/1000 [00:00<00:25, 37.82it/s][A
  4%|▍         | 41/1000 [00:01<00:23, 40.37it/s][A
  5%|▍         | 46/1000 [00:01<00:23, 41.39it/s][A
  5%|▌         | 51/1000 [00:01<00:23, 40.02it/s][A
  6%|▌         | 56/1000 [00:01<00:22, 42.39it/s][A
  6%|▌         | 62/1000 [00:01<00:20, 45.46it/s][A
  7%|▋         | 67/1000 [00:01<00:20, 46.21it/s][A
  7%|▋         | 72/1000 [00:01<00:21, 42.84it/s][A
  8%|▊         | 77/1000 [00:01<00:22, 41.91it/s][A
  8%|▊         | 83/1000 [00:01<00:20, 44.98it/s][A
  9%|▉         | 88/1000 [00:02<00:19, 45.65it/s][A
  9

In [146]:
evl(sgd_list_opt)


  0%|          | 0/1000 [00:00<?, ?it/s][A
  1%|          | 8/1000 [00:00<00:12, 76.68it/s][A
  2%|▏         | 15/1000 [00:00<00:13, 73.87it/s][A
  2%|▏         | 21/1000 [00:00<00:14, 68.84it/s][A
  3%|▎         | 29/1000 [00:00<00:13, 69.67it/s][A
  4%|▍         | 38/1000 [00:00<00:13, 72.88it/s][A
  5%|▍         | 46/1000 [00:00<00:13, 73.02it/s][A
  6%|▌         | 55/1000 [00:00<00:12, 75.52it/s][A
  6%|▋         | 65/1000 [00:00<00:11, 80.86it/s][A
  7%|▋         | 74/1000 [00:00<00:11, 82.02it/s][A
  8%|▊         | 84/1000 [00:01<00:10, 86.31it/s][A
  9%|▉         | 94/1000 [00:01<00:10, 88.57it/s][A
 10%|█         | 104/1000 [00:01<00:09, 91.14it/s][A
 11%|█▏        | 114/1000 [00:01<00:10, 81.11it/s][A
 12%|█▏        | 123/1000 [00:01<00:11, 76.43it/s][A
 13%|█▎        | 131/1000 [00:01<00:11, 74.67it/s][A
 14%|█▍        | 139/1000 [00:01<00:11, 75.26it/s][A
 15%|█▍        | 147/1000 [00:01<00:11, 76.56it/s][A
 16%|█▌        | 155/1000 [00:01<00:11, 75.70it/s

(0.4150651017581139, 1.442769560939586, 0.5826999087868653, 15332, 26312)

In [165]:
result_t = transform_predict(predict(sgd_list_opt, 1000, 10000))
evaluate(zip(result_t, questions_topics[10000:11000]))



  0%|          | 0/1000 [00:00<?, ?it/s][A[A

  5%|▌         | 54/1000 [00:00<00:01, 524.80it/s][A[A

  8%|▊         | 85/1000 [00:00<00:02, 427.89it/s][A[A

 11%|█▏        | 113/1000 [00:00<00:02, 368.37it/s][A[A

 14%|█▍        | 138/1000 [00:00<00:02, 319.70it/s][A[A

 17%|█▋        | 169/1000 [00:00<00:02, 315.87it/s][A[A

 20%|██        | 204/1000 [00:00<00:02, 323.25it/s][A[A

 23%|██▎       | 234/1000 [00:00<00:02, 314.60it/s][A[A

 26%|██▋       | 263/1000 [00:00<00:02, 303.64it/s][A[A

 29%|██▉       | 292/1000 [00:00<00:02, 278.60it/s][A[A

 32%|███▏      | 319/1000 [00:01<00:02, 260.57it/s][A[A

 34%|███▍      | 345/1000 [00:01<00:02, 239.25it/s][A[A

 37%|███▋      | 371/1000 [00:01<00:02, 243.21it/s][A[A

 40%|████      | 400/1000 [00:01<00:02, 254.43it/s][A[A

 43%|████▎     | 429/1000 [00:01<00:02, 259.00it/s][A[A

 46%|████▌     | 456/1000 [00:01<00:02, 247.72it/s][A[A

 49%|████▉     | 488/1000 [00:01<00:01, 260.18it/s][A[A

 52%|███

(0.1944314759031781, 0.7375663964230609, 0.26403406891211767, 682, 2583)

#### alpha = 0.0001
(0.5516961015800961, 2.1273217689302975, 0.744869261173609, 19599, 26312)

(0.179914290747037, 0.6888552071915998, 0.2435152922957801, 629, 2583)

#### alpha = 0.001
(0.5460423915463721, 2.0814560755329556, 0.740232593493463, 19477, 26312)

(0.19492309164982197, 0.7416286640663764, 0.26442121564072785, 683, 2583)

#### alpha = 0.01
(0.4150651017581139, 1.442769560939586, 0.5826999087868653, 15332, 26312)

(0.12570731087455866, 0.4413920715704383, 0.17576461478900504, 454, 2583)

In [164]:
from tqdm import tqdm_notebook

In [167]:
# One classifier per topic, using the alpha value chosen in the experiments above.
sgd_list = [SGDClassifier(loss="modified_huber", penalty="l2", n_jobs=-1, alpha=0.001) for _ in range(1999)]

for i in tqdm_notebook(range(1999)):
    sgd = sgd_list[i]
    y_tag = tag_index(i)
    sgd.fit(word_tfidf, y_tag)
    # Save the model that was just fitted. The earlier code dumped the
    # unrelated global `clf`, so every file held the same wrong classifier.
    joblib.dump(sgd, 'sgd_classifier_v2/sgd_{}.pkl'.format(i))




In [168]:
%%time
predict_list_v2 = predict(sgd_list, 217360, word_tfidf=X_test)



  0%|          | 0/1999 [00:00<?, ?it/s][A[A

  0%|          | 1/1999 [00:00<14:21,  2.32it/s][A[A

  0%|          | 2/1999 [00:00<11:55,  2.79it/s][A[A

  0%|          | 3/1999 [00:00<10:37,  3.13it/s][A[A

  0%|          | 4/1999 [00:01<09:41,  3.43it/s][A[A

  0%|          | 5/1999 [00:01<09:05,  3.66it/s][A[A

  0%|          | 6/1999 [00:01<08:51,  3.75it/s][A[A

  0%|          | 7/1999 [00:01<07:59,  4.15it/s][A[A

  0%|          | 8/1999 [00:01<07:37,  4.36it/s][A[A

  0%|          | 9/1999 [00:02<07:15,  4.57it/s][A[A

  1%|          | 10/1999 [00:02<06:59,  4.74it/s][A[A

  1%|          | 11/1999 [00:02<06:50,  4.84it/s][A[A

  1%|          | 12/1999 [00:02<07:14,  4.57it/s][A[A

  1%|          | 13/1999 [00:02<07:07,  4.64it/s][A[A

  1%|          | 14/1999 [00:03<06:46,  4.88it/s][A[A

  1%|          | 15/1999 [00:03<06:38,  4.98it/s][A[A

  1%|          | 16/1999 [00:03<06:31,  5.06it/s][A[A

  1%|          | 17/1999 [00:03<06:27,  5.11it/

CPU times: user 5min 2s, sys: 1min 38s, total: 6min 40s
Wall time: 8min 7s


[A[A

In [169]:
%%time
result_v2 = transform_predict(predict_list_v2)

CPU times: user 6min 17s, sys: 2min 25s, total: 8min 42s
Wall time: 11min 14s


In [171]:
# Build one submission line per test question: "<question id>,<topic id>,...".
# NOTE(review): this rebinds `result_v2` from lists of topic ids to strings, so
# running the cell a second time would join single characters instead.
# Recompute `result_v2` with transform_predict before re-running.
result_v2 = [str(test_questions.id[i])+','+','.join(v) for i, v in enumerate(result_v2)]
result_v2[0]
# First line of the previous submission, kept for comparison:
# '6215603645409872328,6006627476560013656,-5872443091340192918,4610596224687453206,2339809570377332086,-7506384235581390893'

'6215603645409872328,6006627476560013656,-5872443091340192918,2339809570377332086,-8132909213241034354,7476760589625268543'

In [172]:
pd.Series(result_v2).to_csv('sgd_result_v2.csv', header=False, index=False, sep=' ')