In [2]:
# Import necessary libraries and functions
import numpy as np
import pandas as pd
import pickle
import bs4
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import warnings

In [3]:
# Read the reviews parsed from Yandex.Market in the categories "Samsung", "Nokia" and "qwerty".
# Each file is split on newlines only, so every line presumably becomes one row of a
# single column (accessed below as `review`).
# NOTE(review): recent pandas releases may reject '\n' as a separator -- confirm against
# the installed version before re-running.
df_n = pd.read_csv('train.negative.csv', sep='\n')
df_p = pd.read_csv('train.positive.csv', sep='\n')
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat stacks the two
# frames in the same order: negatives first, then positives.
df = pd.concat([df_n, df_p])
# The sample is small but balanced (78 + 76)
print("The number of negative reviews in the sample: {neg}".format(neg=len(df_n)))
print("The number of positive reviews in the sample {pos}".format(pos=len(df_p)))

# The labels rely on the concatenation order above: 0 = negative, 1 = positive.
traindata = list(df.review)
trainlabels = [0]*len(df_n) + [1]*len(df_p)

The number of negative reviews in the sample: 78
The number of positive reviews in the sample 76


In [9]:
# Baseline model: TF-IDF over character n-grams (1 to 3 characters) fed into a LinearSVC classifier.
# Gives good results for Russian text!
def create_simple_model(params=None):
    """Build the (unfitted) vectorizer + classifier pipeline.

    params -- optional dict passed to ``Pipeline.set_params``; skipped when
              it is None or empty.

    Returns a ``Pipeline`` with the steps 'vect' (TfidfVectorizer) and
    'clf' (LinearSVC).
    """
    vectorizer_options = dict(
        ngram_range=(1, 3),
        analyzer='char_wb',
        min_df=2,
        stop_words=None,
        smooth_idf=True,
        sublinear_tf=False,
        binary=True,
        norm="l2",
    )
    char_tfidf = TfidfVectorizer(**vectorizer_options)
    svm = LinearSVC()

    model = Pipeline([('vect', char_tfidf), ('clf', svm)])
    if not params:
        return model
    # Caller-supplied overrides are applied on top of the defaults above.
    model.set_params(**params)
    return model

In [10]:
# Build the pipeline with its default settings and fit it on the whole training sample.
# The fit call stays as the last expression so the cell displays the fitted pipeline.
model = create_simple_model()
model.fit(X=traindata, y=trainlabels)

Pipeline(steps=[('vect', TfidfVectorizer(analyzer='char_wb', binary=True, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [11]:
# Save the fitted model to a .pkl file.
# sklearn.externals.joblib was deprecated and later removed from scikit-learn, so prefer
# the standalone joblib package and fall back to the vendored copy only where the
# standalone package is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(model, 'ClasModel.pkl')

['ClasModel.pkl']