In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
from tqdm import tqdm_notebook

In [2]:
%%time
questions = pd.read_csv('./question_train_word.csv')
questions_topics = questions.topics.apply(lambda s: s.split(','))
questions_titles = questions.title.astype('U').apply(lambda s: s.split(','))

CPU times: user 27.9 s, sys: 7.47 s, total: 35.4 s
Wall time: 37.4 s


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
binarizer = MultiLabelBinarizer()

In [5]:
%time y_train = binarizer.fit_transform(questions_topics)
y_train.shape

CPU times: user 10.5 s, sys: 12.5 s, total: 23 s
Wall time: 30.9 s


(2999967, 1999)

In [14]:
vect = TfidfVectorizer(min_df=6, max_df=0.45)
%time X_train = vect.fit_transform(questions.title.astype('U'))
X_train.shape

CPU times: user 51 s, sys: 1.98 s, total: 52.9 s
Wall time: 53.5 s


(2999967, 118393)

In [6]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

In [17]:
clf = OneVsRestClassifier(SGDClassifier(loss="modified_huber", penalty="l2", n_jobs=-1, n_iter=1, verbose=2), n_jobs=-1)

In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
%time clf.fit(X_train[:1000], y_train[:1000])

-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 44.56, NNZs: 211, Bias: -2.044153, T: 1000, Avg. loss: 0.037460
Norm: 30.06, NNZs: 360, Bias: -1.191472, T: 1000, Avg. loss: 0.010160
Total training time: 0.04 seconds.
-- Epoch 1
Total training time: 0.02 seconds.
Norm: 43.74, NNZs: 338, Bias: -1.846511, T: 1000, Avg. loss: 0.056493
Norm: 34.95, NNZs: 261, Bias: -1.324865, T: 1000, Avg. loss: 0.013221
Total training time: 0.05 seconds.
Total training time: 0.06 seconds.
-- Epoch 1
Norm: 30.33, NNZs: 283, Bias: -1.309556, T: 1000, Avg. loss: 0.009578
-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 32.42, NNZs: 365, Bias: -1.252711, T: 1000, Avg. loss: 0.012558
Total training time: 0.02 seconds.
Norm: 56.83, NNZs: 353, Bias: -2.233577, T: 1000, Avg. loss: 0.055633
Total training time: 0.02 seconds.
Norm: 26.31, NNZs: 403, Bias: -0.948487, T: 1000, Avg. loss: 0.008307
-- Epoch 1
-- Epoch 1
Norm: 37.24, NNZs: 334, Bias: -1.553156, T: 1000, Avg. loss: 0.021242
Total training time: 0.02 seconds.
Total training 

OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=1, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=2, warm_start=False),
          n_jobs=-1)

In [22]:
%time z_train = clf.predict(X_train[:1000])
z_train = binarizer.inverse_transform(z_train)
z_train[:10]

CPU times: user 282 ms, sys: 240 ms, total: 522 ms
Wall time: 715 ms


[('3738968195649774859', '7739004195693774975'),
 ('-3149765934180654494', '-982546890824575042'),
 ('-760432988437306018',),
 ('-548176348047239669',
  '-6758942141122113907',
  '3195914392210930723',
  '732225345006872595'),
 ('-1689319711084901730',
  '-2689200710357900655',
  '3804601920633030746',
  '4195795391451929480',
  '4351331710881888756',
  '4797226510592237555'),
 ('3804601920633030746', '4195795391451929480', '4351331710881888756'),
 ('-8963554618409314978', '-985918886318729368'),
 ('-2063727883907680094',
  '-3171150797683574404',
  '-578156641762363345',
  '-7489973769578614435',
  '3065185060409823831',
  '3587421951489801185',
  '42427966967759255',
  '4619805718368927775',
  '6022205592682717526',
  '7756633728210171144'),
 ('-7844932991499998016', '-9165709055872875620', '5127719909274807567'),
 ('-4115748438709160582',)]

In [24]:
from evaluate import evaluate
evaluate(zip(z_train, questions_topics[:1000]))

(0.6164972854578673, 2.0130110632766725, 0.888652783680408, 2091, 2353)

In [26]:
%time z_train = clf.predict_proba(X_train[:1000])
z_train.shape

CPU times: user 220 ms, sys: 11.5 ms, total: 232 ms
Wall time: 233 ms


(1000, 1999)

In [33]:
def top_five(predict_list):
    return np.array(predict_list).argsort()[-5:][::-1]

In [38]:
topic_ids = binarizer.classes_

In [42]:
def top_five_result(z_train):
    result = []
    for i in range(z_train.shape[0]):
        result.append([topic_ids[t] for t in top_five(z_train[i])])
    return result

In [44]:
z_train_result = top_five_result(z_train)
evaluate(zip(z_train_result, questions_topics[:1000]))

(0.6785290250356127, 2.3238664580482777, 0.9583510412239694, 2255, 2353)

## reduce X_train dimension

In [45]:
from collections import Counter

%time title_word_count = Counter([w for ws in questions_titles for w in ws])
title_word_most_common = title_word_count.most_common()

CPU times: user 9.61 s, sys: 4.54 s, total: 14.1 s
Wall time: 16.8 s


In [46]:
print(len(title_word_most_common))
word_count_serial = pd.Series([i for w, i in title_word_most_common])

324960


In [49]:
print(word_count_serial[word_count_serial>5].count())
print(word_count_serial[word_count_serial>20].count())
print(word_count_serial[word_count_serial>50].count())

119998
58855
33012


In [7]:
vect = TfidfVectorizer(min_df=21, max_df=0.45)
%time X_train = vect.fit_transform(questions.title.astype('U'))
X_train.shape

CPU times: user 48 s, sys: 2.76 s, total: 50.8 s
Wall time: 51.6 s


(2999967, 57933)

In [51]:
%time clf.fit(X_train[:1000], y_train[:1000])

-- Epoch 1
Norm: 44.19, NNZs: 197, Bias: -1.989077, T: 1000, Avg. loss: 0.039766
-- Epoch 1
Total training time: 0.06 seconds.
Norm: 29.85, NNZs: 358, Bias: -1.183106, T: 1000, Avg. loss: 0.010066
-- Epoch 1
-- Epoch 1
Norm: 43.49, NNZs: 334, Bias: -1.809099, T: 1000, Avg. loss: 0.056950
Norm: 35.17, NNZs: 231, Bias: -1.318667, T: 1000, Avg. loss: 0.013431
Total training time: 0.03 seconds.
Total training time: 0.03 seconds.
Total training time: 0.03 seconds.
-- Epoch 1
Norm: 30.47, NNZs: 264, Bias: -1.311522, T: 1000, Avg. loss: 0.009677
Total training time: 0.02 seconds.
-- Epoch 1
Norm: 32.37, NNZs: 330, Bias: -1.243895, T: 1000, Avg. loss: 0.012509
Total training time: 0.03 seconds.
-- Epoch 1
Norm: 34.13, NNZs: 390, Bias: -1.509268, T: 1000, Avg. loss: 0.016296
Total training time: 0.01 seconds.
-- Epoch 1
Norm: 37.75, NNZs: 307, Bias: -1.519353, T: 1000, Avg. loss: 0.021654
-- Epoch 1
-- Epoch 1
Norm: 51.17, NNZs: 371, Bias: -1.963298, T: 1000, Avg. loss: 0.071528
Total training 

OneVsRestClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=1, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=2, warm_start=False),
          n_jobs=-1)

## killed!

In [None]:
%time clf.fit(X_train, y_train)