В этой тетрадке я сравниваю на кросс-валидации простые классификаторы на tf-idf фичах, без какого-либо файнтюнинга параметров.

In [1]:
# Put the parent directory first on the module search path.
# NOTE(review): presumably this is what lets the `params` module imported
# further down resolve -- confirm against the repository layout.
import sys 
sys.path.insert(0, '../')

In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.kernel_approximation import AdditiveChi2Sampler, RBFSampler


from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from params import SEED
from params import N_SPLITS

In [3]:
# Load the training split from a line-delimited JSON file (one record per line).
train = pd.read_json(
    '../data/json/train.json',
    orient='records',
    lines=True,
)
# The test and validation splits are not loaded in this notebook:
#test = pd.read_json('../data/json/test.json', orient='records', lines=True)
#valid = pd.read_json('../data/json/valid.json', orient='records', lines=True)

In [4]:
# Cast the price column to float up front to avoid spurious warnings later on.
price_as_float = train['price'].astype(float)
train['price'] = price_as_float

In [5]:
# Count the distinct values of `category_id` (the column used as the target
# in the cross-validation cells) and display the count.
distinct_categories = train['category_id'].unique()
n_cats = len(distinct_categories)
n_cats

54

Здесь мы применяем стандартный TfidfVectorizer к лемматизированному тексту и шкалируем числовые фичи.

In [6]:
# Feature extraction with two branches:
#   * tf-idf over 1- to 3-grams of the `lemmatized` column; the tokenizer is
#     the identity function and lowercasing is off
#     (assumes `lemmatized` already holds token sequences -- TODO confirm);
#   * min-max scaling of `price` and of every column whose name contains 'cat_'.
numeric_columns = [col for col in train.columns if 'cat_' in col] + ['price']
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lambda tokens: tokens,
    lowercase=False,
    ngram_range=(1, 3),
)
column_trans = ColumnTransformer([
    ('tfidf', tfidf_vectorizer, 'lemmatized'),
    ('scale_jacs', MinMaxScaler(), numeric_columns),
])


In [7]:
# Baseline: linear SVM on top of the tf-idf + scaled numeric features,
# evaluated with stratified K-fold cross-validation on `category_id`.
#
# `random_state` is intentionally not passed to StratifiedKFold: without
# `shuffle=True` it has no effect on the splits, and scikit-learn >= 0.24
# raises a ValueError for that combination. The folds remain unshuffled and
# therefore deterministic, exactly as before.
pipe_lsvc = make_pipeline(column_trans, LinearSVC())
scores_lsvc = cross_val_score(
    pipe_lsvc,
    train,
    train['category_id'],
    cv=StratifiedKFold(n_splits=N_SPLITS),
)

In [8]:
# Average score across the cross-validation folds.
np.mean(scores_lsvc)

0.892235269447626

In [11]:
# Same tf-idf branch as before, but without the numeric features: the
# MinMaxScaler branch over the 'cat_' columns and `price` is left out.
text_only_vectorizer = TfidfVectorizer(
    tokenizer=lambda tokens: tokens,
    lowercase=False,
    ngram_range=(1, 3),
)
column_trans_2 = ColumnTransformer([
    ('tfidf', text_only_vectorizer, 'lemmatized'),
])

In [None]:
# Linear SVM on the text-only (tf-idf) features, stratified K-fold CV.
#
# `random_state` is intentionally not passed to StratifiedKFold: without
# `shuffle=True` it has no effect on the splits, and scikit-learn >= 0.24
# raises a ValueError for that combination. The folds remain unshuffled and
# therefore deterministic, exactly as before.
pipe_lsvc_2 = make_pipeline(column_trans_2, LinearSVC())
scores_lsvc_2 = cross_val_score(
    pipe_lsvc_2,
    train,
    train['category_id'],
    cv=StratifiedKFold(n_splits=N_SPLITS),
)

In [None]:
# Average score across the cross-validation folds (text-only features).
np.mean(scores_lsvc_2)

0.8923425190815888

In [None]:
# Попробуем добавить аппроксимацию отображения фичей, которое делают нелинейные svm с помощью kernel trick

In [7]:
# Linear SVM preceded by an explicit approximate feature map for the additive
# chi-squared kernel (AdditiveChi2Sampler with sample_steps=2), applied to the
# output of the full column transformer.
#
# `random_state` is intentionally not passed to StratifiedKFold: without
# `shuffle=True` it has no effect on the splits, and scikit-learn >= 0.24
# raises a ValueError for that combination. The folds remain unshuffled and
# therefore deterministic, exactly as before.
pipe_lsvc_kernel = make_pipeline(
    column_trans,
    AdditiveChi2Sampler(sample_steps=2),
    LinearSVC(),
)
scores_lsvc_kernel = cross_val_score(
    pipe_lsvc_kernel,
    train,
    train['category_id'],
    cv=StratifiedKFold(n_splits=N_SPLITS),
)

In [8]:
# Average score across the cross-validation folds (chi-squared feature map).
np.mean(scores_lsvc_kernel)

0.8892833816595356

In [12]:
# Linear SVM preceded by a random-feature approximation of the RBF kernel
# (RBFSampler with its default hyper-parameters) on the text-only features.
#
# RBFSampler draws random projections, so it is seeded with SEED to make the
# run reproducible, in line with the other seeded estimators in this notebook.
#
# `random_state` is intentionally not passed to StratifiedKFold: without
# `shuffle=True` it has no effect on the splits, and scikit-learn >= 0.24
# raises a ValueError for that combination. The folds remain unshuffled and
# therefore deterministic, exactly as before.
pipe_lsvc_rbf = make_pipeline(
    column_trans_2,
    RBFSampler(random_state=SEED),
    LinearSVC(),
)
scores_lsvc_rbf = cross_val_score(
    pipe_lsvc_rbf,
    train,
    train['category_id'],
    cv=StratifiedKFold(n_splits=N_SPLITS),
    n_jobs=2,
)

In [13]:
# Average score across the cross-validation folds (RBF feature map).
np.mean(scores_lsvc_rbf)

0.087371975119175

Также попробуем классическую логистическую регрессию и простую оптимизацию hinge-loss (как у линейного SVC) стохастическим градиентным спуском.

In [14]:
# Multinomial logistic regression fitted with the SAGA solver.
logreg = LogisticRegression(
    solver='saga',
    multi_class='multinomial',
    n_jobs=2,
    random_state=SEED,
)
# Linear model trained by SGD with an elastic-net penalty and early stopping;
# the loss is left at the estimator's default.
sgd = SGDClassifier(
    penalty='elasticnet',
    random_state=SEED,
    n_jobs=2,
    tol=1e-3,
    early_stopping=True,
)

In [15]:
# Logistic regression on the tf-idf + scaled numeric features, stratified
# K-fold CV with folds evaluated by two parallel workers.
#
# `random_state` is intentionally not passed to StratifiedKFold: without
# `shuffle=True` it has no effect on the splits, and scikit-learn >= 0.24
# raises a ValueError for that combination. The folds remain unshuffled and
# therefore deterministic, exactly as before.
pipe_logreg = make_pipeline(column_trans, logreg)
scores_logreg = cross_val_score(
    pipe_logreg,
    train,
    train['category_id'],
    cv=StratifiedKFold(n_splits=N_SPLITS),
    n_jobs=2,
)

In [16]:
# Average score across the cross-validation folds (logistic regression).
np.mean(scores_logreg)

0.8693606226786059

In [17]:
# SGD-trained linear classifier on the tf-idf + scaled numeric features,
# stratified K-fold CV with folds evaluated by two parallel workers.
#
# `random_state` is intentionally not passed to StratifiedKFold: without
# `shuffle=True` it has no effect on the splits, and scikit-learn >= 0.24
# raises a ValueError for that combination. The folds remain unshuffled and
# therefore deterministic, exactly as before.
pipe_sgd = make_pipeline(column_trans, sgd)
scores_sgd = cross_val_score(
    pipe_sgd,
    train,
    train['category_id'],
    cv=StratifiedKFold(n_splits=N_SPLITS),
    n_jobs=2,
)

In [18]:
# Average score across the cross-validation folds (SGD classifier).
np.mean(scores_sgd)

0.77721113288173