In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
import numpy as np
import pandas as pd
import os , re
warnings.filterwarnings('ignore')

In [2]:
reviews_train = []
for line in open(r'full_train.txt', 'r',encoding = 'utf-8'):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open(r'full_test.txt', 'r',encoding = 'utf-8'):
    
    reviews_test.append(line.strip())

In [3]:
reviews_train[5]

"This isn't the comedic Robin Williams, nor is it the quirky/insane Robin Williams of recent thriller fame. This is a hybrid of the classic drama without over-dramatization, mixed with Robin's new love of the thriller. But this isn't a thriller, per se. This is more a mystery/suspense vehicle through which Williams attempts to locate a sick boy and his keeper.<br /><br />Also starring Sandra Oh and Rory Culkin, this Suspense Drama plays pretty much like a news report, until William's character gets close to achieving his goal.<br /><br />I must say that I was highly entertained, though this movie fails to teach, guide, inspect, or amuse. It felt more like I was watching a guy (Williams), as he was actually performing the actions, from a third person perspective. In other words, it felt real, and I was able to subscribe to the premise of the story.<br /><br />All in all, it's worth a watch, though it's definitely not Friday/Saturday night fare.<br /><br />It rates a 7.7/10 from...<br />

In [4]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

In [5]:
reviews_train_clean[5]

'this isnt the comedic robin williams nor is it the quirky insane robin williams of recent thriller fame this is a hybrid of the classic drama without over dramatization mixed with robins new love of the thriller but this isnt a thriller per se this is more a mystery suspense vehicle through which williams attempts to locate a sick boy and his keeper also starring sandra oh and rory culkin this suspense drama plays pretty much like a news report until williams character gets close to achieving his goal i must say that i was highly entertained though this movie fails to teach guide inspect or amuse it felt more like i was watching a guy williams as he was actually performing the actions from a third person perspective in other words it felt real and i was able to subscribe to the premise of the story all in all its worth a watch though its definitely not friday saturday night fare it rates a   from the fiend '

In [6]:
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)

In [7]:
target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s"% (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.87568
Accuracy for C=0.05: 0.88144
Accuracy for C=0.25: 0.88272
Accuracy for C=0.5: 0.8792
Accuracy for C=1: 0.87648


In [8]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
print(f'Final Accuracy : {accuracy_score(target,final_model.predict(X_test))}')

Final Accuracy : 0.88144


In [9]:
feature_to_coef = {word: coef for word, coef in zip(cv.get_feature_names(), final_model.coef_[0])}

In [10]:
for best_positive in sorted(feature_to_coef.items(),key=lambda x: x[1],reverse=True)[:5]:
    print (best_positive)

('excellent', 0.9283544365774301)
('perfect', 0.794427771773093)
('great', 0.6745552923708332)
('amazing', 0.6164834542986902)
('superb', 0.605591972022723)


In [11]:
for best_negative in sorted(feature_to_coef.items(),key=lambda x: x[1])[:5]:
    print (best_negative)   

('worst', -1.3679897665464298)
('waste', -1.1688808944586007)
('awful', -1.0273337491280006)
('poorly', -0.874802240897175)
('boring', -0.8591221154582859)


In [12]:
print(f'Number of words values are {len(feature_to_coef)}')

Number of words values are 90860


In [13]:
newfile = open('SA Dictionary.txt', 'w',encoding = 'utf-8')

for w,v in feature_to_coef.items() : 
    newfile.write(f'{w},{v}\n')
    
newfile.close()    