In [1]:
import numpy
import pandas as pd
import json
import random
import numpy as np

## Summary Statistics

In [None]:
# Load the review dump, which is read as JSON Lines: one JSON object per line.
reviews = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open('reviews_Movies_and_TV_5.json', encoding='utf-8') as json_file:
    for rec in json_file:
        # A blank (e.g. trailing) line is not valid JSON; skip it instead of crashing.
        if not rec.strip():
            continue
        reviews.append(json.loads(rec))

In [None]:
# Keep a reproducible random sample of 500,000 records for faster computation:
# shuffle in place with a fixed seed, then keep the leading slice.
SAMPLE_SEED = 123
SAMPLE_SIZE = 500000
random.seed(SAMPLE_SEED)
random.shuffle(reviews)
reviews = reviews[0:SAMPLE_SIZE]

In [None]:
# Parallel accumulators: review text and its star rating, filled by the next cell.
review, label = [], []

In [None]:
# Pull the text and the rating out of every record, keeping both lists aligned
# by position.
review.extend(rec['reviewText'] for rec in reviews)
label.extend(rec['overall'] for rec in reviews)

In [None]:
# One row per review: raw text plus its rating.
review_df = pd.DataFrame(dict(review=review, label=label))

In [None]:
# Corpus size = number of rows in the frame.
n_documents = len(review_df)
print("Number of documents is %s" % n_documents)

In [None]:
# Number of distinct rating values present in the sample.
distinct_labels = review_df['label'].unique()
print("Number of labels is %s" % len(distinct_labels))

In [None]:
print("Label distribution is as follows: ")
# Count every review per label. `nunique()` would count only *distinct* texts,
# so duplicated reviews would be under-counted and the proportions (which are
# divided by the total row count) would not sum to 1.
distribution = review_df.groupby('label')['review'].size().reset_index()
# Share of the whole sample that carries each label.
distribution['proportion'] = distribution['review'] / len(review_df)
print(distribution)

In [None]:
# Mean number of words per review. `str.split()` with no argument splits on any
# run of whitespace, so repeated spaces, tabs and newlines do not create empty
# or merged "words", and an empty review counts as 0 words rather than 1.
words_per_review = [len(text.split()) for text in review_df['review']]
print("Average word length in a review is %s"%(np.mean(words_per_review)))

## Logistic Regression

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import multiprocessing

In [None]:
# English stop-word list as a set for fast membership tests during cleaning.
stop_words = {*stopwords.words('english')}

In [None]:
def process_review(text):
    """Normalise one review into a space-separated string of content words.

    The text is tokenised with ``word_tokenize``; purely alphabetic tokens are
    lower-cased, and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw review text.

    Returns:
        The surviving lower-cased tokens joined by single spaces (an empty
        string when nothing survives).
    """
    # Lower-case *before* the stop-word test: checking the original casing lets
    # capitalised stop words such as "The" or "And" slip through the filter.
    lowered = (w.lower() for w in word_tokenize(text) if w.isalpha())
    return ' '.join(w for w in lowered if w not in stop_words)

In [None]:
# Clean every raw review, preserving order so results line up with `review`.
clean_rev = list(map(process_review, review))

In [None]:
# Disabled experiment: a worker pool (one process per CPU core) for parallel cleaning.
# pool = multiprocessing.Pool(multiprocessing.cpu_count())

In [None]:
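# Disabled experiment: parallel variant of the cleaning step. Note that
# `rev_sample` is not defined in this notebook; the list cleaned above is `review`.
# result = pool.map(process_review, rev_sample)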

In [None]:
# Attach the cleaned text as a new column next to the raw review.
review_df = review_df.assign(cleaned_review=clean_rev)

In [None]:
# Preview the first five rows.
review_df.head(n=5)

In [None]:
# The raw text is not needed past this point; keep only label + cleaned text.
review_df = review_df.drop('review', axis=1)

In [None]:
# Cache the cleaned data. `index=False` keeps the positional index out of the
# file; otherwise it is written as an unnamed first column and comes back as a
# spurious "Unnamed: 0" column when the CSV is read again.
review_df.to_csv('cleaned.csv', index=False)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [4]:
# Reload the cached cleaned reviews so modelling can start from this cell.
CLEANED_CSV = 'cleaned.csv'
review_df = pd.read_csv(CLEANED_CSV)

In [None]:
# Keep only the modelling columns and drop rows where either is missing,
# then split with a fixed seed for reproducibility.
model_df = review_df[['cleaned_review', 'label']].dropna()
train, test = train_test_split(model_df, random_state=123)

In [None]:
# Unpack features (cleaned text) and targets (rating) as NumPy arrays.
train_x, train_y = train['cleaned_review'].values, train['label'].values
test_x, test_y = test['cleaned_review'].values, test['label'].values

In [None]:
def tf_idf(train, test, max_df=0.95, ngram=(1,1)):
    """Vectorise train and test documents with a TF-IDF model fitted on train only.

    Args:
        train: Iterable of training documents (strings).
        test: Iterable of test documents (strings).
        max_df: Passed to ``TfidfVectorizer(max_df=...)``.
        ngram: Passed to ``TfidfVectorizer(ngram_range=...)``.

    Returns:
        A ``(train_feature, test_feature)`` tuple of transformed documents.
    """
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=max_df, ngram_range=ngram)
    # Fit and transform the training set in a single pass; previously the
    # fit_transform result was thrown away and the training documents were
    # vectorised a second time.
    train_feature = tfidf_vectorizer.fit_transform(train)
    # The test set only reuses the vocabulary/IDF learned from train.
    test_feature = tfidf_vectorizer.transform(test)
    return train_feature, test_feature

In [None]:
# Unigram TF-IDF features with the helper's default settings.
train_x_tfidf, test_x_tfidf = tf_idf(train=train_x, test=test_x)

In [None]:
# L1-regularised logistic regression on the unigram TF-IDF features.
lr_params = dict(
    solver='liblinear',
    penalty='l1',
    C=5,
    max_iter=100,
    random_state=123,
)
lr_1gram = LogisticRegression(**lr_params)
# `model` is whatever fit() returns and is what the prediction cell uses.
model = lr_1gram.fit(train_x_tfidf, train_y)

In [None]:
# Predicted rating for every held-out review.
pred = model.predict(X=test_x_tfidf)

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Rows of the confusion matrix are true labels, columns are predicted labels.
cm = confusion_matrix(test_y, pred)
correct = np.diag(cm)
true_totals = np.sum(cm, axis = 1)   # number of test samples per true class
pred_totals = np.sum(cm, axis = 0)   # number of predictions per class

# Per-class scores. A class that is never predicted (or has no true samples)
# would divide by zero and turn every downstream average into NaN; such
# classes score 0 instead.
recall = np.divide(correct, true_totals,
                   out=np.zeros(correct.shape, dtype=float), where=true_totals > 0)
precision = np.divide(correct, pred_totals,
                      out=np.zeros(correct.shape, dtype=float), where=pred_totals > 0)

pr_sum = recall + precision
f1 = np.divide(2*recall*precision, pr_sum,
               out=np.zeros(correct.shape, dtype=float), where=pr_sum > 0)

# Labels present in the test set.
unique = np.unique(test_y)
# Per-class support taken from the confusion-matrix rows so it always has one
# entry per row of `cm`, even when a label occurs only in the predictions.
counts = true_totals

In [None]:
# Per-class recall averaged with class support as weights.
support_weighted_recall = sum(counts*recall)/sum(counts)
print("Micro-recall is: %s" % support_weighted_recall)

In [None]:
# Per-class precision averaged with class support as weights. This is the
# *weighted* average, not the micro average (micro precision pools all
# decisions and equals accuracy for single-label data), so label it as such.
weighted_precision = sum(counts*precision)/sum(counts)
print("Weighted-precision is %s"%(weighted_precision))

In [None]:
# Per-class F1 averaged with class support as weights. This is the *weighted*
# F1, not micro F1 (which equals accuracy for single-label data), so label it
# as such.
weighted_f1 = sum(counts*f1)/sum(counts)
print("Weighted-F1 score is %s"%(weighted_f1))

In [None]:
# Fraction of held-out reviews whose predicted rating matches the true one.
n_correct = sum(pred==test_y)
print("Overall Accuracy is %s" % (n_correct/len(pred)))