In [46]:
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score ,StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import nltk
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectFromModel

### Загрузка данных

In [2]:
# The training file is read without a header row, so column names are supplied here;
# the first column of the test file is used as its index.
train = pd.read_csv(
    "products_sentiment_train.tsv",
    sep="\t",
    header=None,
    names=["text", "label"],
)
test = pd.read_csv("products_sentiment_test.tsv", sep="\t", index_col=0)

In [3]:
# Preview the first rows of the training frame (columns: text, label).
train.head()

Unnamed: 0,text,label
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [4]:
# Raw review strings as a NumPy array.
train["text"].values

array(['2 . take around 10,000 640x480 pictures .',
       'i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .',
       'the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .',
       ..., 'overall i like it . ',
       'i began taking pics as soon as i got this camera and am amazed at the quality of photos i have took simply by using the auto mode . ',
       "even after reading some of the instructions , it 's still hard to figure out . "],
      dtype=object)

In [5]:
# Review texts per sentiment class (label 0 vs. label 1), used below for the class counts.
negfeats = train.loc[train["label"] == 0, "text"].tolist()
posfeats = train.loc[train["label"] == 1, "text"].tolist()

# Full feature / target arrays passed to every cross-validation run.
texts = train["text"].values
y = train["label"].values

In [18]:
# Show the raw review strings that are passed to the pipelines.
texts

array(['2 . take around 10,000 640x480 pictures .',
       'i downloaded a trial version of computer associates ez firewall and antivirus and fell in love with a computer security system all over again .',
       'the wrt54g plus the hga7t is a perfect solution if you need wireless coverage in a wider area or for a hard-walled house as was my case .',
       ..., 'overall i like it . ',
       'i began taking pics as soon as i got this camera and am amazed at the quality of photos i have took simply by using the auto mode . ',
       "even after reading some of the instructions , it 's still hard to figure out . "],
      dtype=object)

In [6]:
# Class balance: number of label-1 reviews, then number of label-0 reviews.
print(len(posfeats), len(negfeats), sep="\n")

1274
726


### Базовое решение

In [7]:
def make_pipeline(vectorizer, transformer, classifier):
    """Chain a vectorizer, a transformer and a classifier into one Pipeline.

    The steps are named 'vectorizer', 'transformer' and 'classifier'; the
    hyper-parameter grids address them through these names
    (e.g. ``classifier__C``).
    """
    step_names = ('vectorizer', 'transformer', 'classifier')
    step_objects = (vectorizer, transformer, classifier)
    return Pipeline(list(zip(step_names, step_objects)))

In [8]:
# Baseline: token counts -> TF-IDF -> liblinear logistic regression, scored by 5-fold CV accuracy.
pipe_base = make_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    LogisticRegression(solver="liblinear"),
)
result_base = cross_val_score(pipe_base, texts, y, scoring="accuracy", cv=5)
print(result_base.mean())

0.7655000000000001


### Выбор оптимального решения

### Линейные классификаторы

In [9]:
# Compare the linear classifiers with their default settings (mean 5-fold CV score).
for clf in (LogisticRegression, LinearSVC, SGDClassifier):
    name = clf.__name__
    fold_scores = cross_val_score(
        make_pipeline(CountVectorizer(), TfidfTransformer(), clf(random_state=12)),
        texts,
        y,
        cv=5,
    )
    score = fold_scores.mean()
    print(name, ": ", score)

LogisticRegression :  0.766
LinearSVC :  0.7689999999999999
SGDClassifier :  0.7545


In [58]:
def make_estimator(classifier, params_grid, data, labels):
    """Tune a CountVectorizer -> TfidfTransformer -> classifier pipeline.

    Runs a randomized search of 100 sampled configurations from
    ``params_grid`` (5-fold CV, accuracy, all cores, random_state=12),
    fits it on ``data`` / ``labels`` and returns the fitted search object.
    """
    search = RandomizedSearchCV(
        make_pipeline(CountVectorizer(), TfidfTransformer(), classifier),
        params_grid,
        n_iter=100,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        random_state=12,
    )
    search.fit(data, labels)
    return search

In [59]:
# Search spaces for each model.
# Every key is `<pipeline step name>__<parameter name>`.

# Logistic regression: regularisation strength, iteration budget and solver.
params_grid_lr = dict(
    classifier__C=np.arange(0.1, 2, 0.1),
    classifier__max_iter=np.arange(50, 500, 50),
    classifier__solver=['lbfgs', 'liblinear', 'sag'],
)

# Linear SVM: loss, iteration budget, stopping tolerance and regularisation strength.
params_grid_lsvc = dict(
    classifier__loss=['hinge', 'squared_hinge'],
    classifier__max_iter=np.arange(100, 1000, 50),
    classifier__tol=[1e-5, 1e-4, 1e-3],
    classifier__C=np.arange(0.1, 2, 0.1),
)

# SGD classifier: loss, penalty, iteration budget and stopping tolerance.
# NOTE(review): newer scikit-learn releases spell the 'log' loss as 'log_loss' —
# confirm against the installed version.
params_grid_sgdc = dict(
    classifier__loss=['log', 'hinge', 'modified_huber'],
    classifier__penalty=['l1', 'l2', 'elasticnet'],
    classifier__max_iter=np.arange(100, 1000, 50),
    classifier__tol=np.arange(1e-5, 1e-3, 1e-5),
)


# Text vectorisation options shared by all models: document-frequency cut-offs,
# n-gram ranges (1, 1) .. (1, 6) and optional English stop-word removal.
params_grid_vectorizer = dict(
    vectorizer__max_df=[0.85, 0.9, 0.95, 1.0],
    vectorizer__min_df=[1, 10, 20],
    vectorizer__ngram_range=[(1, upper) for upper in range(1, 7)],
    vectorizer__stop_words=[None, 'english'],
)

In [60]:
%%time
grid_search_lr = make_estimator(LogisticRegression(), 
                                {**params_grid_vectorizer, **params_grid_lr}, texts, y)
print("LogisticRegression: %.4f" % grid_search_lr.best_score_ )


LogisticRegression: 0.7585
Wall time: 38.2 s


In [61]:

%%time
grid_search_lsvc = make_estimator(LinearSVC(random_state=12), 
                                  {**params_grid_vectorizer, **params_grid_lsvc}, texts, y)
print("LinearSVC: %.4f" % grid_search_lsvc.best_score_ )

LinearSVC: 0.7905
Wall time: 23.5 s


In [26]:

%%time
grid_search_sgdc = make_estimator(SGDClassifier(random_state=12), 
                                  {**params_grid_vectorizer, **params_grid_sgdc},  texts, y)
print("SGDClassifier: %.4f" % grid_search_sgdc.best_score_ )


SGDClassifier: 0.7920
Wall time: 25.5 s


In [27]:

# Report the tuned SGD pipeline: its CV accuracy, then the classifier and vectorizer steps
# (looked up by step name instead of by position).
print("Лучшая линейная модель: SGDClassifier, Accuracy = %.4f" % grid_search_sgdc.best_score_)
print("Параметры модели:", grid_search_sgdc.best_estimator_.named_steps["classifier"])
print("Парамтры предобработки:", grid_search_sgdc.best_estimator_.named_steps["vectorizer"])

Лучшая линейная модель: SGDClassifier, Accuracy = 0.7920
Параметры модели: SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=200, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=12, shuffle=True, tol=0.00093,
              validation_fraction=0.1, verbose=0, warm_start=False)
Парамтры предобработки: CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.85, max_features=None, min_df=1,
                ngram_range=(1, 4), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


### Нелинейные алгоритмы

In [None]:
# Compare the tree ensembles at a fixed depth of 10 (mean 5-fold CV score).
for clf in (GradientBoostingClassifier, RandomForestClassifier):
    name = clf.__name__
    fold_scores = cross_val_score(
        make_pipeline(CountVectorizer(), TfidfTransformer(), clf(max_depth = 10, random_state=12)),
        texts,
        y,
        cv=5,
    )
    score = fold_scores.mean()
    print(name, ": ", score)

In [None]:

# Search space for gradient boosting.
params_grid_gbt = {
    "classifier__learning_rate": [0.01, 0.05,0.1, 0.15, 0.2],
    # float values: read by scikit-learn as a fraction of the samples
    "classifier__min_samples_split": np.linspace(0.05, 0.25, 5),
    "classifier__max_depth":[3,5,8, 12],
    "classifier__max_features":["log2","sqrt"],
    "classifier__subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "classifier__n_estimators": np.arange(100, 1000,100)
}
# Search space for the random forest.
params_grid_rf = {
    'classifier__n_estimators':np.arange(100, 1000,100),
    # 'auto' was dropped: for RandomForestClassifier it is the same setting as 'sqrt'
    # (so it only double-weighted that candidate) and newer scikit-learn rejects it.
    'classifier__max_features': ['sqrt', 'log2'],
    'classifier__max_depth': np.arange(2,16,2),
    'classifier__criterion': ['gini', 'entropy'],

}

In [None]:
%%time
grid_search_gbt = make_estimator(GradientBoostingClassifier(random_state=12), 
                                {**params_grid_vectorizer, **params_grid_gbt}, texts, y)
print("GradientBoostingClassifier: %.4f" % grid_search_gbt.best_score_ )


In [None]:
%%time
grid_search_rf = make_estimator(RandomForestClassifier(), 
                                {**params_grid_vectorizer, **params_grid_rf}, texts, y)
print("RandomForestClassifier: %.4f" % grid_search_rf.best_score_ )


### Попробуем генерацию новых признаков с помощью усечённого SVD (аналог метода главных компонент для разреженных матриц)

In [45]:
# Size of the TF-IDF matrix for 1- to 4-gram features, as a reference for the SVD step below.
Pipeline([
    ("vectorizer", CountVectorizer(ngram_range=(1, 4))),
    ("transformer", TfidfTransformer()),
]).fit_transform(texts)

<2000x74561 sparse matrix of type '<class 'numpy.int64'>'
	with 114831 stored elements in Compressed Sparse Row format>

In [79]:
def make_estimator(classifier, params_grid, data, labels):
    """Tune a CountVectorizer -> TfidfTransformer -> TruncatedSVD -> classifier pipeline.

    NOTE: this definition replaces the earlier ``make_estimator`` of the same
    name; every call made after this cell goes through the SVD variant.

    Runs a randomized search of 100 sampled configurations from
    ``params_grid`` (5-fold CV, accuracy, all cores, random_state=12, verbose
    progress output), fits it on ``data`` / ``labels`` and returns the fitted
    search object.
    """
    pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('transformer', TfidfTransformer()),
            # The step keeps the name "pca" because the grid addresses it as pca__n_components.
            # The decomposition is seeded: TruncatedSVD's default randomized solver would
            # otherwise make the search results differ between runs.
            ("pca", TruncatedSVD(random_state=12)),
            ('classifier', classifier)
        ])
    grid_cv = RandomizedSearchCV(pipeline, params_grid, scoring='accuracy', cv=5, random_state=12, n_iter=100,n_jobs=-1, verbose =10)
    grid_cv.fit(data, labels)
    return grid_cv

In [None]:
# Candidate output dimensionalities for the "pca" (TruncatedSVD) step.
# Capped at 1000: with 2000 training texts each 5-fold split fits on 1600 rows, so the
# decomposition cannot produce more than 1600 informative components and the former
# 2000 / 5000 candidates only added run time.
# NOTE(review): vectorizer settings with min_df=20 may leave fewer features than some of
# these values — confirm how the installed scikit-learn handles that case.
pca_grid = {
    "pca__n_components" : [100, 250, 500, 1000]
}

In [None]:
# Show the best LinearSVC pipeline found so far. The estimator is passed as a separate
# argument because "%.4f" only accepts numbers and raised TypeError for a Pipeline object.
print("LinearSVC:", grid_search_lsvc.best_estimator_)

In [None]:

# Narrower search spaces for the SVD experiment. These assignments overwrite the
# grids of the same names defined earlier in the notebook.

# Linear SVM: iteration budget reduced to 100..500.
params_grid_lsvc = dict(
    classifier__loss=['hinge', 'squared_hinge'],
    classifier__max_iter=np.arange(100, 600, 100),
    classifier__tol=[1e-5, 1e-4, 1e-3],
    classifier__C=np.arange(0.1, 2, 0.1),
)

# SGD classifier: iteration budget reduced to 100..400.
# NOTE(review): newer scikit-learn releases spell the 'log' loss as 'log_loss' —
# confirm against the installed version.
params_grid_sgdc = dict(
    classifier__loss=['log', 'hinge', 'modified_huber'],
    classifier__penalty=['l1', 'l2', 'elasticnet'],
    classifier__max_iter=np.arange(100, 500, 100),
    classifier__tol=np.arange(1e-5, 1e-3, 1e-5),
)


# Vectorizer options: n-gram ranges now stop at (1, 5).
params_grid_vectorizer = dict(
    vectorizer__max_df=[0.85, 0.9, 0.95, 1.0],
    vectorizer__min_df=[1, 10, 20],
    vectorizer__ngram_range=[(1, upper) for upper in range(1, 6)],
    vectorizer__stop_words=[None, 'english'],
)

In [None]:
# Randomized search over vectorizer + SVD + LinearSVC settings; prints the best mean CV accuracy.
grid_search_lsvc = make_estimator(
    LinearSVC(random_state=12),
    dict(params_grid_vectorizer, **params_grid_lsvc, **pca_grid),
    texts,
    y,
)
print("LinearSVC: %.4f" % grid_search_lsvc.best_score_)

### Итоговая модель

In [None]:
# Preview the test reviews that the final model will score.
# (`df` is only created in the next cell, so displaying it here raised NameError on a fresh kernel.)
test.head()

In [None]:
# Score the test reviews with the tuned SGD pipeline and write the submission file.
arr = grid_search_sgdc.best_estimator_.predict(test.text.values)
# Label each prediction with the Id read from the test file (its index, see index_col=0)
# rather than a fresh 0..n-1 range, so rows stay aligned even if the Ids are not consecutive.
# `.values` builds a new index, so naming it below does not rename the index of `test`.
df = pd.Series(arr, index=test.index.values).to_frame()
df.columns = [ "y"]
df.index.name = "Id"
df.to_csv("submission.csv")

In [None]:
# Display the submission frame (predicted label `y` per `Id`) built in the previous cell.
df

### Результат