In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import os
import re

In [6]:
# Load the raw review corpora: one review per line, with leading/trailing
# whitespace stripped.  The `with` blocks close each file handle as soon as
# it has been read instead of leaving it open for the life of the kernel.
with open('full_train.txt', encoding="utf8") as train_file:
    reviews_train = [line.strip() for line in train_file]

with open('full_test.txt', encoding="utf8") as test_file:
    reviews_test = [line.strip() for line in test_file]

In [8]:
import re

# Raw string literals hand the regex escapes (\d, \s, \[ ...) to the regex
# engine untouched; writing them in ordinary string literals triggers
# "invalid escape sequence" warnings on current Python versions.

# Punctuation characters and digit runs that are deleted outright.
repl_with_no_space = re.compile(r"""[.;:!'?,"()\[\]]|\d+""")
# A doubled HTML line break, a hyphen, or a slash: each becomes one space.
repl_with_space = re.compile(r"<br\s*/><br\s*/>|[-/]")

def preprocess_reviews(reviews):
    """Normalise raw review strings before vectorisation.

    Each review is lower-cased, every match of ``repl_with_no_space`` is
    removed, and then every match of ``repl_with_space`` is replaced by a
    single space.

    Args:
        reviews: iterable of review strings.

    Returns:
        A new list containing the cleaned strings in the same order.
    """
    return [
        repl_with_space.sub(" ", repl_with_no_space.sub("", line.lower()))
        for line in reviews
    ]

# Run the training and test corpora through the same cleaning routine.
reviews_train_clean, reviews_test_clean = (
    preprocess_reviews(corpus) for corpus in (reviews_train, reviews_test)
)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the document-term matrices.  The vocabulary is learned from the
# cleaned training reviews only; the test reviews are then encoded with that
# same fitted vectorizer.
cv = CountVectorizer(binary=True)
X = cv.fit_transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Labels: rows 0..12499 are marked 1 and rows 12500..24999 are marked 0.
# NOTE(review): presumably the input file lists all positive reviews first,
# then all negative ones - confirm against the data source.
target = [1 if i < 12500 else 0 for i in range(25000)]

# Hold out 25% of the training rows for validation.  A fixed random_state
# makes the split (and therefore the accuracies printed below) identical on
# every Restart & Run All; without it each run shuffles differently.
X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, random_state = 42
)

# Sweep the regularisation parameter C and report validation accuracy
# for each candidate value.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    

Accuracy for C=0.01: 0.8752
Accuracy for C=0.05: 0.88432
Accuracy for C=0.25: 0.88176
Accuracy for C=0.5: 0.87904
Accuracy for C=1: 0.87776


In [13]:
# Refit on the full training matrix with C=0.05, then score the test matrix.
# NOTE(review): `target` was built for the training rows and is reused here
# as the test labels; this presumes the test file has the same size and
# label ordering - confirm.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
test_predictions = final_model.predict(X_test)
final_accuracy = accuracy_score(target, test_predictions)
print ("Final Accuracy: %s" % final_accuracy)

Final Accuracy: 0.88128


In [15]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; its
# replacement is get_feature_names_out().  Fall back to the old method so
# the cell still runs on installs that predate the new name.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Map each vocabulary word to its learned coefficient.  float() turns the
# NumPy scalar into a plain Python float so the printed tuples show a bare
# number regardless of how the installed NumPy formats scalar reprs.
feature_to_coef = {
    word: float(coef)
    for word, coef in zip(feature_names, final_model.coef_[0])
}

# Five words with the largest (most positive) coefficients.
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)

# Five words with the smallest (most negative) coefficients.
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)
    

('excellent', 0.9288811358313244)
('perfect', 0.7934640037109209)
('great', 0.6750409129828066)
('amazing', 0.6160397025780329)
('superb', 0.6063967735019598)
('worst', -1.3679782040343622)
('waste', -1.1684450859169844)
('awful', -1.027700168599744)
('poorly', -0.8748317155865719)
('boring', -0.8587249241455075)
