In [33]:
import pandas as pd
from nltk.corpus import movie_reviews
from nltk.stem import PorterStemmer
from sklearn.calibration import CalibratedClassifierCV
# NOTE: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# on newer versions replace the next line with `import joblib`.
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import  TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this presumably also hides estimator warnings (e.g. non-convergence) — confirm that is intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [32]:
def create_pipeline(vectorizer, transformer, classifier):
    """Assemble a three-stage scikit-learn ``Pipeline``.

    The stages are registered, in order, under the step names
    ``"vectorizer"``, ``"transformer"`` and ``"classifier"``.

    Parameters
    ----------
    vectorizer : first step of the pipeline.
    transformer : second step of the pipeline.
    classifier : final step of the pipeline.

    Returns
    -------
    Pipeline
        A new, unfitted pipeline wrapping the three given objects.
    """
    steps = [
        ("vectorizer", vectorizer),
        ("transformer", transformer),
        ("classifier", classifier),
    ]
    return Pipeline(steps)

def get_X_y(corpus=None):
    """Build a labelled text dataset from a categorized review corpus.

    Every file in the ``"neg"`` category and then every file in the ``"pos"``
    category is turned into one string by joining its tokens with a single
    space. Labels are generated per file, so ``X`` and ``y`` always have the
    same length regardless of how many files each category contains.

    Parameters
    ----------
    corpus : optional
        Object exposing ``fileids(category)`` and ``words(fileids=[...])``.
        When omitted, the module-level ``movie_reviews`` corpus is used.

    Returns
    -------
    tuple[list[str], list[int]]
        ``X`` — the review texts (negative first, then positive);
        ``y`` — the matching labels, ``0`` for negative and ``1`` for positive.
    """
    if corpus is None:
        corpus = movie_reviews

    X, y = [], []
    for label, category in enumerate(("neg", "pos")):
        file_ids = list(corpus.fileids(category))
        X.extend(" ".join(corpus.words(fileids=[file_id])) for file_id in file_ids)
        # Derive the label count from the files actually present instead of
        # assuming exactly 1000 reviews per category.
        y.extend([label] * len(file_ids))
    return X, y

In [15]:
# Labelled training set. header=None makes pandas treat the first line as data,
# so the column names are supplied explicitly.
data = pd.read_csv(
    "../../data/products_sentiment_train.tsv",
    sep='\t',
    header=None,
    names=["text", "y"],
)
# Texts to predict, read with pandas' default header handling.
pred_data = pd.read_csv(
    "../../data/products_sentiment_test.tsv",
    sep="\t",
)
data.head()

Unnamed: 0,text,y
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [31]:
# Preview the first rows of the prediction set.
pred_data.head()

Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [27]:
# Word n-gram counts (1- to 5-grams) -> TF-IDF weighting -> linear SVM.
# The SVM is wrapped in CalibratedClassifierCV; predict_proba is called on this
# pipeline further down in the notebook.
svc = create_pipeline(
    vectorizer=CountVectorizer(
        ngram_range=(1, 5),
        min_df=1,
        max_df=0.9,
        stop_words=None,
    ),
    transformer=TfidfTransformer(),
    classifier=CalibratedClassifierCV(
        LinearSVC(
            C=1.9,
            loss='hinge',
            tol=0.001,
            max_iter=500,
            random_state=777,
        )
    ),
)

In [28]:
# Fit the whole pipeline on the product-review training set (trailing ';' hides the repr).
svc.fit(data.text, data.y);

In [30]:
# Accuracy on the same rows the pipeline was fitted on, i.e. a training score —
# it does not measure performance on unseen data.
accuracy_score(data.y, svc.predict(data.text))

1.0

In [41]:
# Spot check on a single word: predicted label and the highest class probability.
test = ["bad"]
svc.predict(test)[0], svc.predict_proba(test).max()

(0, 0.6975234133091822)

In [77]:
# Persist the fitted product-review pipeline to disk.
joblib.dump(svc, '../pipeline_model.pkl')

['../pipeline_model.pkl']

### Плохая точность на негативном отзыве

In [34]:
# Movie-review texts and their labels as produced by get_X_y (neg -> 0, pos -> 1).
X, y = get_X_y()

In [62]:
# Extend the product-review training set with the movie reviews.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it the
# row labels of both inputs are kept, so the same label would appear twice.
all_data = pd.concat(
    [data, pd.DataFrame({"text": X, "y": y})],
    ignore_index=True,
)

In [64]:
# Same configuration as the `svc` pipeline, built as a fresh unfitted instance
# to be trained on the combined product + movie review data.
new_pipe = create_pipeline(
    vectorizer=CountVectorizer(
        ngram_range=(1, 5),
        min_df=1,
        max_df=0.9,
        stop_words=None,
    ),
    transformer=TfidfTransformer(),
    classifier=CalibratedClassifierCV(
        LinearSVC(
            C=1.9,
            loss='hinge',
            tol=0.001,
            max_iter=500,
            random_state=777,
        )
    ),
)

In [65]:
# Fit on the combined product + movie review data (trailing ';' hides the repr).
new_pipe.fit(all_data.text, all_data.y);

In [68]:
# Accuracy on the product reviews only; these rows are part of all_data, which the
# pipeline was fitted on, so this is still a training score.
accuracy_score(data.y, new_pipe.predict(data.text))

1.0

In [74]:
# Same single-word spot check, now with the pipeline trained on the combined data:
# predicted label and the per-class probabilities.
text = ["bad"]
new_pipe.predict(text), new_pipe.predict_proba(text)

(array([0], dtype=int64), array([[9.99438954e-01, 5.61045645e-04]]))

In [78]:
# Persist the pipeline trained on the combined data.
# NOTE(review): this is the same path the `svc` pipeline was dumped to earlier, so that
# file is overwritten — confirm which model is meant to live at this path.
joblib.dump(new_pipe, '../pipeline_model.pkl')

['../pipeline_model.pkl']

### Итоговый файл модели получился тяжёлым (300+ МБ). Обучение с добавлением отзывов о фильмах даёт лучший результат на негативном отзыве, но для ревью отправлю облегчённую модель с небольшим размером файла.