In [17]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import  CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from nltk.corpus import movie_reviews



In [3]:
def create_pipeline(vectorizer, transformer, classifier):
    return Pipeline(
        [("vectorizer", vectorizer),
         ("transformer", transformer),
         ("classifier", classifier)
         ]
    )

def get_X_y():
    X = []
    for field in ("neg", "pos"):
        tmp = movie_reviews.fileids(field)
        X.extend([" ". join(movie_reviews.words(fileids=[f])) for f in tmp])
    y = [0] * 1000 + [1] * 1000
    return X, y

In [51]:
data = pd.read_json('../data/train.json', orient='records', lines=True, encoding="utf-8")
data.text = data.text.str.replace("Достоинства:", "").replace("Недостатки:", "")
data["is_pos"] =((data.rating == 5)).astype(int)
data.head()

Unnamed: 0,text,rating,is_pos
0,"1) отличный дизайн, мой синий корпус смотритя...",4,0
1,"Качество связи,сборки.Удобство qwerty клавиат...",5,1
2,Могу сказать сразу. Телефон шикарен. 1)Его ди...,5,1
3,-качественная сборка -QWERTY-клавиатура -гром...,5,1
4,Купил вчера. во-первых это полная клавиатура....,5,1


In [52]:
pipe = create_pipeline(
    CountVectorizer(),
    TfidfTransformer(),
    CalibratedClassifierCV()
)

In [56]:
%%time
pipe.fit(data.text, data.is_pos);

Wall time: 1.17 s


Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('transformer',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('classifier',
                 CalibratedClassifierCV(base_estimator=None, cv='warn',
                                        method='sigmoid'))],
 

In [57]:
accuracy_score(data.is_pos, pipe.predict(data.text))

0.9842157842157843

In [58]:
test = ["плохо"]
pipe.predict(test), pipe.predict_proba(test)

(array([0]), array([[0.9572328, 0.0427672]]))

In [59]:
test = ["хороший продукт"]
pipe.predict(test)[0], pipe.predict_proba(test)

(1, array([[0.39120193, 0.60879807]]))

In [60]:
test = ["хороший продукт, но есть недостатки"]
pipe.predict(test)[0], pipe.predict_proba(test)

(1, array([[0.24729265, 0.75270735]]))

In [61]:
joblib.dump(pipe, '../data/pipeline_model.pkl')

['../data/pipeline_model.pkl']