In [1]:
import numpy as np
import pandas as pd
import re

In [77]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [2]:
from nltk.corpus import movie_reviews

In [5]:
def clean(text):
    text = text.lower()
    text = re.sub('(https?://[^\s]+)', '', text)
    text = re.sub('@[^\s]+', '', text)
    text = re.sub('rt', '', text)
    text = re.sub('[^a-zA-Zа-яА-Я]+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return text

In [5]:
def prep(text):
    text = (re.sub('\n', '', text)).strip()
    return text

In [3]:
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')

In [4]:
negfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in negids]
posfeats = [' '.join(movie_reviews.words(fileids=[f])) for f in posids]

In [5]:
texts = negfeats + posfeats
labels = [0] * (len(texts)//2) + [1] * (len(texts)//2)

In [12]:
cvb_data = [clean(t) for t in texts]

In [38]:
%%time
with open('2000english.txt', 'w', encoding='utf-8') as f:
    for text in cvb_data:
        print(text, file=f)

Wall time: 28 ms


In [3]:
with open('2000english.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

In [6]:
text = [prep(i) for i in text]

In [7]:
df = pd.DataFrame()
df['text'] = text
labels = np.array([0] * (len(text)//2) + [1] * (len(text)//2))
df['labels'] = labels
df = shuffle(df) 

In [8]:
vectorizer = TfidfVectorizer(max_features=100000)
X = vectorizer.fit_transform(df["text"])
y = df['labels']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### LogisticRegression

In [10]:
log = LogisticRegression()
log.fit(X_train, y_train)
y_train_pred = log.predict(X_train)
y_test_pred = log.predict(X_test)



In [11]:
precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred), f1_score(y_test, y_test_pred)

(0.8, 0.7875647668393783, 0.7937336814621411)

In [12]:
accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)

0.8025

precision_score, recall_score, f1_score

In [70]:
(cross_val_score(log, X, y, cv=10, scoring='precision').mean(), cross_val_score(log, X, y, cv=10, scoring='recall').mean(), cross_val_score(log, X, y, cv=10, scoring='f1').mean())



(0.8348538967406547, 0.8290000000000001, 0.8313753974278866)

accuracy_score

In [75]:
cross_val_score(log, X, y, cv=10).mean()



0.8314999999999999

#### KNN

In [69]:
knn = KNeighborsClassifier()

precision_score, recall_score, f1_score

In [60]:
(cross_val_score(knn, X, y, cv=10, scoring='precision').mean(), cross_val_score(knn, X, y, cv=10, scoring='recall').mean(), cross_val_score(knn, X, y, cv=10, scoring='f1').mean())

(0.5518913696488149, 0.9349999999999999, 0.6938818950326627)

accuracy_score

In [67]:
cross_val_score(knn, X, y, cv=10).mean()

0.5875000000000001

#### MultinomialNB

In [76]:
clf = MultinomialNB()

precision_score, recall_score, f1_score

In [66]:
(cross_val_score(clf, X, y, cv=10, scoring='precision').mean(), cross_val_score(clf, X, y, cv=10, scoring='recall').mean(), cross_val_score(clf, X, y, cv=10, scoring='f1').mean())

(0.8651148244111996, 0.7509999999999999, 0.8038052879577903)

accuracy_score

In [68]:
cross_val_score(clf, X, y, cv=10).mean()

0.8164999999999999