In [1]:
import sys 
sys.path.insert(0, '../')

In [9]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB


from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

from params import SEED
from params import N_SPLITS

In [24]:
train = pd.read_json('../data/json/train.json', orient='records', lines=True)
#test = pd.read_json('../data/json/test.json', orient='records', lines=True)
valid = pd.read_json('../data/json/valid.json', orient='records', lines=True)

In [4]:
train['price'] = train['price'].astype('float')

In [5]:
n_cats = len(train['category_id'].unique())
n_cats

54

In [12]:
tfidf_jacs_price = ColumnTransformer(
    [
        ('tfidf', TfidfVectorizer(tokenizer=lambda x:x, lowercase=False, ngram_range=(1,3)), 'tokens'), 
        ('scale_jacs', MinMaxScaler(), [x for x in train.columns if 'cat_' in x]),
        ('scale_price', MinMaxScaler(), ['price']),
    ])

tfidf_price = ColumnTransformer(
    [
        ('tfidf', TfidfVectorizer(tokenizer=lambda x:x, lowercase=False, ngram_range=(1,3)), 'tokens'), 
        ('scale_price', MinMaxScaler(), ['price']),
    ])

tfidf = ColumnTransformer(
    [
        ('tfidf', TfidfVectorizer(tokenizer=lambda x:x, lowercase=False, ngram_range=(1,3)), 'tokens'), 
    ])

In [13]:
def test_clf(clf, trans):
    pipe = make_pipeline(trans, clf)

    scores = cross_val_score(pipe,
                             train,
                             train['category_id'],
                             cv=StratifiedKFold(n_splits=N_SPLITS, random_state=SEED)
                            )
    return scores.mean(), scores.std()

In [17]:
clf = SGDClassifier(loss='log', alpha=0.000002, tol = 0.0001, n_jobs=2)
print('logreg with jacs: ', test_clf(clf, tfidf_jacs_price))
print('logreg without jacs: ', test_clf(clf, tfidf_price))
print('logreg with only tf-idf: ', test_clf(clf, tfidf))

logreg with jacs:  (0.8753717167777904, 0.0016602482835245094)
logreg without jacs:  (0.8765131919977754, 0.0014912802974167566)
logreg with only tf-idf:  (0.8765157510008466, 0.0016146828060361212)


In [19]:
clf = SGDClassifier(loss='hinge', alpha=0.000005, tol = 0.001, n_jobs=6)
print('linearSVC with jacs: ', test_clf(clf, tfidf_jacs_price))
print('LinearSVC without jacs: ', test_clf(clf, tfidf_price))
print('LinearSVC with only tf-idf: ', test_clf(clf, tfidf))

linearSVC with jacs:  (0.8859918861895395, 0.0016410106302150124)
LinearSVC without jacs:  (0.8861604036460315, 0.0016092815068130654)
LinearSVC with only tf-idf:  (0.8860071772649654, 0.0017503243890651162)


In [20]:
clf = SGDClassifier(loss='modified_huber', alpha=0.00001, tol = 0.0001,
                     n_jobs=6, learning_rate='constant', eta0=0.01)
print('huber with jacs: ', test_clf(clf, tfidf_jacs_price))
print('huber without jacs: ', test_clf(clf, tfidf_price))
print('huber with only tf-idf: ', test_clf(clf, tfidf))

huber with jacs:  (0.8860608125105329, 0.0017626452894630791)
huber without jacs:  (0.886027616511454, 0.0017421863381085897)
huber with only tf-idf:  (0.8860276147844697, 0.001728609819319614)


Clearly jacs are useless

In [21]:
from sklearn.ensemble import VotingClassifier

In [25]:
clf1 = SGDClassifier(loss='log', alpha=0.000002, tol = 0.0001, n_jobs=6)
clf2 = SGDClassifier(loss='hinge', alpha=0.000005, tol = 0.001, n_jobs=6)
clf3 = SGDClassifier(loss='modified_huber', alpha=0.00001, tol = 0.0001,
                     n_jobs=6, learning_rate='constant', eta0=0.01)
eclf = make_pipeline(tfidf_price, VotingClassifier(estimators=[('lr', clf1),
                                     ('svc', clf2), 
                                     ('mhb', clf3)
                                    ],
                         voting='hard'
                        ))

In [26]:
eclf.fit(train, train['category_id'])

  return self.partial_fit(X, y)


Pipeline(memory=None,
     steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input...0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None))])

In [27]:
accuracy_score(eclf.predict(valid), valid['category_id'])

0.8856635071090048