In [1]:
import numpy
import pandas as pd
import json
import random
import numpy as np

## Summary Statistics

In [2]:
reviews = []
with open('reviews_Movies_and_TV_5.json') as json_file: 
    for rec in json_file:
        dic = json.loads(rec)
        reviews.append(dic)

In [3]:
# Use only the first 500,000 records for faster computation
random.seed(123)
random.shuffle(reviews)
reviews = reviews[:500000]

In [4]:
review = []
label = []

In [5]:
for rev in reviews:
    review.append(rev['reviewText'])
    label.append(rev['overall'])

In [6]:
review_df = pd.DataFrame({"review":review, "label":label})

In [7]:
print("Number of documents is %s"%len(review_df))

Number of documents is 500000


In [8]:
print("Number of labels is %s"%len(review_df.label.unique()))

Number of labels is 5


In [9]:
print("Label distribution is as follows: ")
distribution = review_df.groupby('label')['review'].nunique().reset_index()
distribution['proportion'] = distribution['review'] / len(review_df)
print(distribution)

Label distribution is as follows: 
   label  review  proportion
0    1.0   30757    0.061514
1    2.0   30068    0.060136
2    3.0   59488    0.118976
3    4.0  113145    0.226290
4    5.0  266381    0.532762


In [10]:
print("Average word length in a review is %s"%(np.mean([len(text.split(' ')) for text in review_df['review']])))

Average word length in a review is 164.213994


## Logistic Regression

In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import multiprocessing

In [12]:
stop_words = set(stopwords.words('english')) 

In [19]:
def process_review(text):
    clean_rev = [w.lower() for w in word_tokenize(text) if w not in stop_words and w.isalpha()]
    return ' '.join(clean_rev)

In [26]:
clean_rev = [process_review(i) for i in review]

In [22]:
# pool = multiprocessing.Pool(multiprocessing.cpu_count())

In [25]:
# result = pool.map(process_review, rev_sample)

In [31]:
review_df['cleaned_review'] = clean_rev

In [32]:
review_df.head()

Unnamed: 0,review,label,cleaned_review
0,Not the best Hutton/Prentiss movie by far but ...,3.0,not best movie far okay i prefer where the boy...
1,This is a good series for toddlers-- we have m...,5.0,this good series toddlers entire dvd overcomin...
2,I bought this for the wife. I did like this se...,2.0,i bought wife i like series watched wife the s...
3,"In the 1950's, Tv was live. It was theater in...",5.0,in tv live it theater video format the idea vi...
4,Let me begin by saying that the movie itself d...,3.0,let begin saying movie deserves stars having s...


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [35]:
train, test = train_test_split(review_df[['cleaned_review', 'label']].dropna(), random_state = 123)

In [40]:
train_x = train['cleaned_review'].values
train_y = train['label'].values
test_x = test['cleaned_review'].values
test_y = test['label'].values

In [55]:
def tf_idf(train, test, max_df=0.95, ngram=(1,1)):
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=max_df, ngram_range=ngram)
    tfidf_vectorizer.fit_transform(train)
    train_feature = tfidf_vectorizer.transform(train)
    test_feature = tfidf_vectorizer.transform(test)
    return train_feature, test_feature

In [56]:
train_x_tfidf, test_x_tfidf = tf_idf(train_x, test_x)

In [57]:
lr_1gram = LogisticRegression(solver='liblinear', random_state=123, C=5, penalty='l1', max_iter=100)
model = lr_1gram.fit(train_x_tfidf,train_y)

[LibLinear]

In [58]:
pred = model.predict(test_x_tfidf)

In [104]:
from sklearn.metrics import confusion_matrix
import numpy as np

cm = confusion_matrix(test_y, pred)
recall = np.diag(cm) / np.sum(cm, axis = 1)
precision = np.diag(cm) / np.sum(cm, axis = 0)

f1 = 2*recall*precision/(recall+precision)

(unique, counts) = np.unique(test_y, return_counts=True)

In [116]:
print("Micro-recall is: %s"%(sum(counts*recall)/sum(counts)))

Micro-recall is: 0.613504


In [117]:
print("Micro-precision is %s"%(sum(counts*precision)/sum(counts)))

Micro-precision is 0.5763339536759488


In [118]:
print("Micro-F1 score is %s"%(sum(counts*f1)/sum(counts)))

Micro-F1 score is 0.5861154517932219


In [119]:
print("Overall Accuracy is %s"%(sum(pred==test_y)/len(pred)))

Overall Accuracy is 0.613504
