In [54]:
import pyprind
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.stem.porter import PorterStemmer
import nltk
import os
import re
warnings.filterwarnings('ignore')

In [None]:
basepath = "/Users/nikita/Documents/Study/Internship/Datasets/aclImdb"
labels ={'pos': 1, 'neg' : 0}
pbar = pyprind.ProgBar(50000)
data = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path,file),'r',encoding='utf-8') as infile:
                txt = infile.read()
            data = data.append([[txt, labels[l]]],ignore_index=True)
            pbar.update()
data.columns = ['review', 'sentiment']

In [None]:
np.random.seed(0)
data = data.reindex(np.random.permutation(data.index))
data.to_csv('movie.csv', encoding='utf-8')

In [None]:
data = pd.read_csv('movie.csv', encoding='utf-8')
data.head(3)

In [None]:
count = CountVectorizer()
docs = np.array(['The sun is shining', 
                'The weather is sweet',
                'The sun is shuning, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)

In [None]:
print(count.vocabulary_)

In [None]:
print(bag.toarray())

In [None]:
tfidf = TfidfTransformer(use_idf=True, norm='l2',smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray())

In [None]:
data.loc[10,'review'][-50:]

In [None]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '',text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)
    text = (re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-',''))
    return text

In [None]:
preprocessor(data.loc[10,'review'][-50:])

In [None]:
data['review'] = data['review'].apply(preprocessor)

In [None]:
def tokenizer(text):
    return text.split()
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
import nltk
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [None]:
X_train = data.loc[:25000,'review'].values
y_train = data.loc[:25000, 'sentiment'].values
X_test = data.loc[25000:, 'review'].values
y_test = data.loc[25000:, 'sentiment'].values

In [61]:
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range':[(1,1)],
              'vect__stop_words':[stop,None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'clf__penalty': ['l1','l2'],
              'clf__C': [1.0,10.0,100.0]},
             {'vect__ngram_range':[(1,1)],
              'vect__stop_words':[stop,None],
              'vect__tokenizer': [tokenizer, tokenizer_porter],
              'vect__use_idf': [False],
              'vect__norm': [None],
              'clf__penalty': ['l1','l2'],
              'clf__C': [1.0,10.0,100.0]}]
lr_tfidf = Pipeline([('vect',tfidf),('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy',cv=5,verbose=1,n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 163.5min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=False,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [60]:
gs_lr_tfidf.estimator.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__norm', 'vect__preprocessor', 'vect__smooth_idf', 'vect__stop_words', 'vect__strip_accents', 'vect__sublinear_tf', 'vect__token_pattern', 'vect__tokenizer', 'vect__use_idf', 'vect__vocabulary', 'clf__C', 'clf__class_weight', 'clf__dual', 'clf__fit_intercept', 'clf__intercept_scaling', 'clf__l1_ratio', 'clf__max_iter', 'clf__multi_class', 'clf__n_jobs', 'clf__penalty', 'clf__random_state', 'clf__solver', 'clf__tol', 'clf__verbose', 'clf__warm_start'])

In [62]:
print('Наилучший набор параметров: %s ' % gs_lr_tfidf.best_params_)

Наилучший набор параметров: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x7f903508ab90>} 


In [63]:
print('Правильность при перекрестной проверке: %.3f' % gs_lr_tfidf.best_score_)

Правильность при перекрестной проверке: 0.893


In [64]:
clf = gs_lr_tfidf.best_estimator_
print('Правильность при испытании: %.3f' % clf.score(X_test, y_test))

Правильность при испытании: 0.900
