<a href="https://colab.research.google.com/github/Komsomolochka/sentiment_analysis/blob/main/sentiment_analysis_week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import pandas as pd

In [None]:
# Load the training data from a tab-separated file. The file is read with
# header=None, and its two columns are named 'text' and 'label' here.
train = pd.read_csv(
    '/content/products_sentiment_train.tsv',
    sep="\t",
    header=None,
    names=['text', 'label'],
)

In [None]:
# Load the test data from a tab-separated file, read with header=None and a
# single named column 'text'.
# NOTE(review): the prediction cell later slices `test.text.values[1:]`, which
# suggests the first line of this file may be a header row that ends up in the
# data — confirm against the actual file.
test = pd.read_csv(
    '/content/products_sentiment_test.tsv',
    sep="\t",
    header=None,
    names=['text'],
)

In [None]:
# Split the training frame into its 'text' column (model input) and its
# 'label' column (target).
X_train, y_train = train['text'], train['label']

In [None]:
# Keep the test texts under their own name, mirroring X_train.
X_test = test.text

In [None]:
# Count how many training rows carry each label value (class-balance check).
train['label'].value_counts()

1    1274
0     726
Name: label, dtype: int64

In [None]:
# Balance the classes by randomly duplicating minority-class rows.
# RandomOverSampler expects a 2-D feature array, so the texts are turned into
# a single-column array first. np.asarray(...) is used instead of `.values` so
# the cell stays re-runnable: on a second run X_train is already a NumPy array
# and has no `.values` attribute.
X_train = np.asarray(X_train).reshape(-1, 1)
# Fixed seed: without it the resampled set (and every score computed from it)
# changes on each kernel restart.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

In [None]:
# Collapse the single-column array returned by the resampler back into a flat
# array with one text per element, which is what the vectorizers consume.
n_docs = X_res.shape[0]
X_res = X_res.reshape((1, n_docs))[0]

In [None]:
# Choose a model: cross-validate every vectorizer/classifier combination on
# the resampled data (scikit-learn's default cross-validation settings) and
# collect the mean score of each combination in score_list, in loop order.
#
# random_state is fixed for every classifier so the scores are reproducible
# from run to run.
#
# NOTE(review): X_res was oversampled before cross-validation, so copies of
# the same minority-class row can land in both the training and validation
# folds; these scores are likely optimistic.
score_list = []
for vectorizer in [CountVectorizer, TfidfVectorizer]:
  for model in [LogisticRegression, LinearSVC, SGDClassifier]:
    pipeline = Pipeline([
      ('vectorizer', vectorizer()),
      ('classifier', model(max_iter=10000, random_state=42)),
    ])
    score = cross_val_score(pipeline, X_res, y_res).mean()
    score_list.append(score)
    print(f"{vectorizer} - {model}")
    print(score)
    print("\n")



<class 'sklearn.feature_extraction.text.CountVectorizer'> - <class 'sklearn.linear_model._logistic.LogisticRegression'>
0.8469694518278825


<class 'sklearn.feature_extraction.text.CountVectorizer'> - <class 'sklearn.svm._classes.LinearSVC'>
0.8477591586732925


<class 'sklearn.feature_extraction.text.CountVectorizer'> - <class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
0.8410832466581917


<class 'sklearn.feature_extraction.text.TfidfVectorizer'> - <class 'sklearn.linear_model._logistic.LogisticRegression'>
0.8234161562463885


<class 'sklearn.feature_extraction.text.TfidfVectorizer'> - <class 'sklearn.svm._classes.LinearSVC'>
0.8493200816672445


<class 'sklearn.feature_extraction.text.TfidfVectorizer'> - <class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
0.8454008243769021




In [None]:
# Highest mean cross-validation score collected in score_list.
max(score_list)

0.8493200816672445

Лучшая модель: TfidfVectorizer + LinearSVC

In [None]:
# Tune the TF-IDF + LinearSVC pipeline.
# Grid keys follow the "<step name>__<parameter>" convention so that
# GridSearchCV can route each value to the matching pipeline step.
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC(max_iter=10000)),
])
parameters = dict(
    vectorizer__max_features=[100, 2000],
    vectorizer__ngram_range=[(1, 1), (1, 2), (2, 2)],
    vectorizer__stop_words=[None, 'english'],
    # Ten linearly spaced values: 0.01, 10.01, ..., 90.01.
    classifier__C=np.arange(0.01, 100, 10),
)

In [None]:
# Exhaustive search over `parameters` for the pipeline, using GridSearchCV's
# default cross-validation and scoring.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters)

In [None]:
# Run the grid search on the resampled training data.
grid_search.fit(X_res, y=y_res)

GridSearchCV(estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('classifier',
                                        LinearSVC(max_iter=10000))]),
             param_grid={'classifier__C': array([1.000e-02, 1.001e+01, 2.001e+01, 3.001e+01, 4.001e+01, 5.001e+01,
       6.001e+01, 7.001e+01, 8.001e+01, 9.001e+01]),
                         'vectorizer__max_features': [100, 2000],
                         'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
                         'vectorizer__stop_words': [None, 'english']})

In [None]:
# Mean cross-validated score of the best parameter combination found by the
# grid search.
grid_search.best_score_

0.8442289764628838

In [None]:
# Train the final model on the full resampled training set.
# The pipeline is built with default TfidfVectorizer settings and
# LinearSVC(max_iter=10000); the grid-search result is not used here.
sub = pd.DataFrame()  # empty frame; not used by any cell visible in this notebook
pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC(max_iter=10000)),
])
# Pipeline.fit returns the fitted pipeline itself, so best_model and pipeline
# refer to the same object.
best_model = pipeline.fit(X_res, y_res)

In [None]:
# Predict labels for the test texts and write them to submission.csv with an
# "Id" index column and a "y" prediction column.
# NOTE(review): `[1:]` skips the first row of the test frame — presumably a
# header line that read_csv(header=None) loaded as data; confirm against the
# test file.
y_pred = best_model.predict(test.text.values[1:])
df = pd.DataFrame({"y": y_pred}).rename_axis("Id")
df.to_csv("submission.csv")