In [1]:

import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score ,StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import nltk
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD


### Загрузка данных

In [2]:
train = pd.read_csv("products_sentiment_train.tsv", sep = "\t", header = None, names = ["text", "label"])
test =  pd.read_csv("products_sentiment_test.tsv", sep = "\t", index_col = 0 )

In [3]:
train.head()

Unnamed: 0,text,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [7]:
negfeats =  list(train[train.label == 0]["text"].values)
posfeats =  list(train[train.label == 1]["text"].values)
texts = train.text.values
y = train.label.values

In [8]:
print(len(posfeats))
print(len(negfeats))

1274
726


### Базовое решение

In [4]:
def make_pipeline(vectorizer, transformer, classifier):
    return Pipeline([
            ('vectorizer', vectorizer),
            ('transformer', transformer),
            ('classifier', classifier)
        ])

NameError: name 'texts' is not defined

In [10]:
pipe_base = make_pipeline(CountVectorizer(), TfidfTransformer(), LogisticRegression(solver = "liblinear"))
result_base = cross_val_score(pipe_base, texts, y, cv =5, scoring="accuracy")
print(result_base.mean())

0.7655000000000001


### Выбор оптимального решения

### Линейные классификаторы

In [11]:
for name, clf in [('LogisticRegression', LogisticRegression), 
                  ('LinearSVC', LinearSVC), 
                  ('SGDClassifier', SGDClassifier)]:
    score = cross_val_score(make_pipeline(CountVectorizer(), TfidfTransformer(), clf(random_state=12)), texts, y, cv=5).mean()
    print(name,": ", score)

LogisticRegression :  0.766
LinearSVC :  0.7689999999999999
SGDClassifier :  0.7545


In [137]:
#конструктор ценочной функции
def make_estimator(classifier, params_grid, data, labels):
    pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), classifier)
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring='accuracy', cv=5, random_state=12, n_iter=100,n_jobs=-1)
    grid_cv.fit(data, labels)
    return grid_cv

In [3]:
# Параметры для каждой модели

params_grid_lr = {
    'classifier__C': np.arange(0.1, 2, 0.1),
    'classifier__max_iter': np.arange(50, 500, 50),
    'classifier__solver': ['lbfgs', 'liblinear', 'sag']
}
params_grid_lsvc = {
    'classifier__loss': ['hinge', 'squared_hinge'], 
    'classifier__max_iter': np.arange(100, 1000, 50),
    'classifier__tol': [1e-5, 1e-4, 1e-3],
    'classifier__C': np.arange(0.1, 2, 0.1)
}
params_grid_sgdc = {
    'classifier__loss': ['log', 'hinge', 'modified_huber'], 
    'classifier__penalty':  ['l1', 'l2', 'elasticnet'], 
    'classifier__max_iter': np.arange(100, 1000, 50),
    'classifier__tol': np.arange(1e-5, 1e-3, 1e-5),
}

# Параетры векторизации
params_grid_vectorizer = {
    'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0],
    'vectorizer__min_df' : [1, 10, 20], 
    'vectorizer__ngram_range' : [(1, x) for x in range(1,7)],
    'vectorizer__stop_words' : [ None, 'english']
}

In [14]:
%%time
grid_search_lr = make_estimator(LogisticRegression(), 
                                {**params_grid_vectorizer, **params_grid_lr}, texts, y)
print("LogisticRegression: %.4f" % grid_search_lr.best_score_ )


LogisticRegression: 0.7585
Wall time: 36.1 s


In [15]:

%%time
grid_search_lsvc = make_estimator(LinearSVC(random_state=12), 
                                  {**params_grid_vectorizer, **params_grid_lsvc}, texts, y)
print("LinearSVC: %.4f" % grid_search_lsvc.best_score_ )

LinearSVC: 0.7905
Wall time: 22.9 s


In [16]:

%%time
grid_search_sgdc = make_estimator(SGDClassifier(random_state=12), 
                                  {**params_grid_vectorizer, **params_grid_sgdc},  texts, y)
print("SGDClassifier: %.4f" % grid_search_sgdc.best_score_ )


SGDClassifier: 0.7920
Wall time: 24.7 s


In [17]:

print("Лучшая линейная модель: SGDClassifier, Accuracy = %.4f"% grid_search_sgdc.best_score_)
print("Параметры модели:",  grid_search_sgdc.best_estimator_[2])
print("Парамтры предобработки:", grid_search_sgdc.best_estimator_[0])

Лучшая линейная модель: SGDClassifier, Accuracy = 0.7920
Параметры модели: SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=200, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=12, shuffle=True, tol=0.00093,
              validation_fraction=0.1, verbose=0, warm_start=False)
Парамтры предобработки: CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.85, max_features=None, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


<p  style="color:red"> UPD </p>
Впоследствие окажется что это и есть лучшая модель, все более сложные модели, рассмотренные ниже, будут предсказывать с худшим качеством.

### Нелинейные алгоритмы

In [132]:
for name, clf in [('GradientBoostingClassifier', GradientBoostingClassifier), 
                  ('RandomForestClassifier', RandomForestClassifier)]:
    score = cross_val_score(make_pipeline(CountVectorizer(), TfidfTransformer(), clf(max_depth = 10,random_state=12)), texts, y, cv=5).mean()
    print(name,": ", score)

GradientBoostingClassifier :  0.7325000000000002
RandomForestClassifier :  0.6415


In [133]:

params_grid_gbt = {
    "classifier__learning_rate": [0.01, 0.05,0.1, 0.15, 0.2],
    "classifier__min_samples_split": np.linspace(0.05, 0.25, 5),
    "classifier__max_depth":[3,5,8, 12],
    "classifier__max_features":["log2","sqrt"],
    "classifier__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "classifier__n_estimators": np.arange(100, 1000,100)
}
params_grid_rf = {
    'classifier__n_estimators':np.arange(100, 1000,100),
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__max_depth': np.arange(2,16,2),
    'classifier__criterion': ['gini', 'entropy'],

}

In [138]:
%%time
grid_search_gbt = make_estimator(GradientBoostingClassifier(random_state=12), 
                                {**params_grid_vectorizer, **params_grid_gbt}, texts, y)
print("GradientBoostingClassifier: %.4f" % grid_search_gbt.best_score_ )


GradientBoostingClassifier: 0.7585
Wall time: 2min 7s


In [139]:
%%time
grid_search_rf = make_estimator(RandomForestClassifier(), 
                                {**params_grid_vectorizer, **params_grid_rf}, texts, y)
print("RandomForestClassifier: %.4f" % grid_search_rf.best_score_ )


RandomForestClassifier: 0.6640
Wall time: 1min 38s


#### Выод: линейные алгоритмы показали себя лучше

### Попробуем генерацию новых признаков с помощью метода главных компонент

In [18]:
TfidfTransformer().fit_transform(CountVectorizer(ngram_range=(1,4)).fit_transform(texts))

<2000x74561 sparse matrix of type '<class 'numpy.float64'>'
	with 114831 stored elements in Compressed Sparse Row format>

In [19]:
# Переопределим оценочную функцию
def make_estimator(classifier, params_grid, data, labels):
    pipeline =     Pipeline([
            ('vectorizer', CountVectorizer()),
            ('transformer', TfidfTransformer()),
            ("pca", TruncatedSVD()),
            ('classifier', classifier)
        ])
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring='accuracy', cv=5, random_state=12, n_iter=100,n_jobs=-1, verbose =10)
    grid_cv.fit(data, labels)
    return grid_cv

In [48]:
pca_grid = {
    "pca__n_components" : [10,100, 150, 200,250, 300, 350],
    "pca__n_iter":[7, 10,15]
}

In [35]:

params_grid_lsvc = {
    'classifier__loss': ['hinge', 'squared_hinge'], 
    'classifier__max_iter': np.arange(100, 600, 100),
    'classifier__tol': [1e-5, 1e-4, 1e-3],
    'classifier__C': np.arange(0.1, 2, 0.1)
}
params_grid_sgdc = {
    'classifier__loss': ['log', 'hinge', 'modified_huber'], 
    'classifier__penalty':  ['l1', 'l2', 'elasticnet'], 
    'classifier__max_iter': np.arange(100, 500, 100),
    'classifier__tol': np.arange(1e-5, 1e-3, 1e-5),
}


params_grid_vectorizer = {
    'vectorizer__max_df' : [0.85, 0.9, 0.95, 1.0],
    'vectorizer__min_df' : [1, 10, 20], 
    'vectorizer__ngram_range' : [(1, x) for x in range(1,6)],
    'vectorizer__stop_words' : [ None, 'english']
}



In [36]:
grid_search_lsvc.best_estimator_[2]

TruncatedSVD(algorithm='randomized', n_components=100, n_iter=5,
             random_state=None, tol=0.0)

In [41]:
grid_search_lsvc = make_estimator(LinearSVC(random_state=12), 
                                  {**params_grid_vectorizer, **params_grid_lsvc, **pca_grid}, texts, y)
print("LinearSVC: %.4f" % grid_search_lsvc.best_score_ )
print(grid_search_lsvc.best_estimator_[2])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   46.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  4

LinearSVC: 0.7665
TruncatedSVD(algorithm='randomized', n_components=200, n_iter=7,
             random_state=None, tol=0.0)


In [49]:
grid_search_lr = make_estimator(LogisticRegression(), 
                                  {**params_grid_vectorizer, **params_grid_lr, **pca_grid}, texts, y)
print("LinearRegression: %.4f" % grid_search_lr.best_score_ )
print("TruncatedSVD:", grid_search_lr.best_estimator_[2])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1106s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too slow (6.8692s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed:   20.4s
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  71 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  84 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 127 tasks      | elapsed:  2.5min
[Parallel(n_jo

LinearRegression: 0.7610
TruncatedSVD: TruncatedSVD(algorithm='randomized', n_components=150, n_iter=10,
             random_state=None, tol=0.0)


In [45]:

%%time
grid_search_sgdc = make_estimator(SGDClassifier(random_state=12), 
                                  {**params_grid_vectorizer, **params_grid_sgdc,**pca_grid},  texts, y)
print("SGDClassifier: %.4f" % grid_search_sgdc.best_score_ )


Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1812s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Batch computation too slow (6.7118s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  31 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  99 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:  2.8min
[Parallel(n_jo

SGDClassifier: 0.7730
Wall time: 9min 46s


#### Вывод: Отображение пространство преобразованных признаков с помощью метода главных компонент не приводит к улучшению результата, а ухудшает его.


### Итоговая модель

Лучше всего себя показала модель стохастическго градиентного спуска. Уточним параметры этой модели для получения наилучшего качества.

In [102]:
# определим функцию заново, так как вышеприведённый эксперимент не удался
def make_estimator(classifier, params_grid, data, labels):
    pipeline = make_pipeline(CountVectorizer(), TfidfTransformer(), classifier)
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring='accuracy', cv=5, random_state=12, n_iter=1000,n_jobs=-1,)
    grid_cv.fit(data, labels)
    return grid_cv

In [95]:
grid_search_sgdc.best_estimator_

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.985,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 5), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabu...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.05, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.5,
                               learning_rate='optimal', loss='hinge',
                               max_iter=300, 

In [124]:
params_grid_sgdc = {
    'classifier__loss': ['log', 'hinge', 'modified_huber'], 
    'classifier__penalty':  ['l1', 'l2', 'elasticnet'], 
    'classifier__max_iter': np.arange(100, 1000, 25),
    'classifier__tol': np.arange(1e-5, 1e-3, 5*1e-6),
    'classifier__l1_ratio': [0, 0.05, 0.1, 0.2, 0.5],
    "classifier__epsilon":[0.5,0.1,0.15,0.2]
}

params_grid_vectorizer = {
    'vectorizer__max_df' : np.arange(0.94,0.99,0.005),
    'vectorizer__min_df' : [1, 2], 
    'vectorizer__ngram_range' : [(1, x) for x in range(1,6)],
    'vectorizer__stop_words' : [ None, 'english']
}

In [125]:
%%time
grid_search_sgdc = make_estimator(SGDClassifier(random_state=None), 
                                  {**params_grid_vectorizer, **params_grid_sgdc},  texts, y)
print("SGDClassifier: %.4f" % grid_search_sgdc.best_score_ )


SGDClassifier: 0.7935
Wall time: 4min 21s


In [128]:
print("SGDClassifier: %.4f" % grid_search_sgdc.best_score_ )

SGDClassifier: 0.7935


In [129]:
grid_search_sgdc.best_estimator_

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.965,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabu...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.2, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.5,
                               learning_rate='optimal', loss='hinge',
                               max_iter=925, n

In [130]:
arr = grid_search_sgdc.best_estimator_.predict(test.text.values)
df = pd.Series(arr).to_frame()
df.columns = [ "y"]
df.index.name = "Id"
df.to_csv("submission.csv")

### Результат
![title](result.png)