In [1]:
import pandas as pd
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import string
from nltk.stem.porter import PorterStemmer
import re

# Explore Data

In [2]:
reviews = pd.read_json('Health_and_Personal_Care_5.json', lines=True)

In [3]:
reviews['Date'] = pd.to_datetime(reviews['unixReviewTime'],unit='s')

In [4]:
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Date
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5,Handy little gadget,1294185600,"01 5, 2011",2011-01-05
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4,Small & may need to encourage battery,1329523200,"02 18, 2012",2012-02-18
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4,Very good but not great,1275955200,"06 8, 2010",2010-06-08
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4,great addition to your purse,1202428800,"02 8, 2008",2008-02-08
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5,Very nice and convenient.,1313452800,"08 16, 2011",2011-08-16


In [5]:
reviews.shape

(346355, 10)

In [6]:
print("unique products: ", reviews['asin'].nunique())
print("unique users: ", reviews['reviewerID'].nunique())
print("unique reviews: ", reviews['reviewerID'].nunique())
print("timeframe: ", reviews['Date'].min(), " - ", reviews['Date'].max())

unique products:  18534
unique users:  38609
unique reviews:  38609
timeframe:  2000-12-09 00:00:00  -  2014-07-23 00:00:00


In [7]:
# Produts with most reviews
reviews.groupby(['asin']).size().reset_index(name='counts').sort_values('counts',ascending=False).head()

Unnamed: 0,asin,counts
10706,B0037KMI0U,1089
5308,B0010JLMO8,767
8105,B001KXZ808,699
12260,B0049LUI9O,528
3114,B000GIPJY8,475


In [8]:
# Check NA
reviews.isna().sum()

reviewerID           0
asin                 0
reviewerName      3051
helpful              0
reviewText           0
overall              0
summary              0
unixReviewTime       0
reviewTime           0
Date                 0
dtype: int64

# Data Pre-processing 

In [9]:
# deal with n't
def n_apostrophe_t_handler(document):

    i = 0
    while i < len(document):
        if "n't" in document[i]:        
            # Checks to see if there is a following word after word ending in "n't"
            if (i+1) < len(document):
                document[i+1] = 'not_' + document[i+1]
                document.pop(i)
            else:
                document.pop(i)
        i+=1

    return(document)

In [10]:
# remove numbers
#input_str = ’Box A contains 3 red and 5 white balls, while Box B contains 4 red and 2 blue balls.’
def remove_number(document):
    result = re.sub(r'\d+', '', document)
    return result

In [11]:
# convert text to lower case 
review_text = reviews["reviewText"].str.lower()
print("original: ",review_text[7],"\n")

# remove numbers
review_text = review_text.apply(remove_number)
print("numbers: ",review_text[7],"\n")

# words Tokenization
review_text = review_text.apply(word_tokenize)
print("tokenization: ",review_text[7],"\n")

# deal with negation
review_text = review_text.apply(n_apostrophe_t_handler)
print("negation: ",review_text[7],"\n")

# remove punctuation
punctuations = list(string.punctuation)
review_text = review_text.apply(lambda x: 
           [i.strip("".join(punctuations)) for i in x if i not in punctuations])
print("punctuation: ", review_text[7],"\n")

# remove stop words 
stop_words=set(stopwords.words("english"))
review_text = review_text.apply(lambda x: 
                             [item for item in x if item not in stop_words])
print("stop words: ",review_text[7],"\n")

# word stemming
stemmer = PorterStemmer()
review_text = review_text.apply(lambda x: [stemmer.stem(y) for y in x])
print("stemming:  ",review_text[7],"\n")

# lemmatizer
lemmatizer = WordNetLemmatizer()
review_text = review_text.apply(lambda x: [lemmatizer.lemmatize(y) for y in x])
print("lemmatizer:  ",review_text[7],"\n")

# remove empty string
for i in range(len(review_text)):
    review_text[i] = [x for x in review_text[i] if x]
print("remove empty:  ",review_text[7],"\n")


original:  we bought one for road trips and trying to interpret maps without having to strain our eyes. really nice design, good tactile feel. i couldn't figure out where the batteries were, sent lightwedge customer service an email and received a response within 24 hours. if you need one i'd recommend this one. 

numbers:  we bought one for road trips and trying to interpret maps without having to strain our eyes. really nice design, good tactile feel. i couldn't figure out where the batteries were, sent lightwedge customer service an email and received a response within  hours. if you need one i'd recommend this one. 

tokenization:  ['we', 'bought', 'one', 'for', 'road', 'trips', 'and', 'trying', 'to', 'interpret', 'maps', 'without', 'having', 'to', 'strain', 'our', 'eyes', '.', 'really', 'nice', 'design', ',', 'good', 'tactile', 'feel', '.', 'i', 'could', "n't", 'figure', 'out', 'where', 'the', 'batteries', 'were', ',', 'sent', 'lightwedge', 'customer', 'service', 'an', 'email', 'a

# Text vectorization

## Most frequent words

In [12]:
review_clean = review_text

In [13]:
from nltk import FreqDist
vec=[]
for i in range(len(review_clean)):
    for j in range(len(review_clean[i])):
        vec.append(review_clean[i][j]) 
freq = FreqDist(vec)

In [14]:
most_freq = pd.DataFrame(freq.most_common(500),columns=['Word','Frequency'])
most_freq.head()

Unnamed: 0,Word,Frequency
0,use,276456
1,product,173446
2,one,144143
3,like,143951
4,work,135202


In [15]:
vocabulary = most_freq['Word'].tolist()

## Clean String

In [16]:
def listToString(s):  
    # initialize an empty string 
    str1 = ""  
    # traverse in the string   
    for ele in s:  
        str1 += ele + " " 
    # return string   
    return str1  


In [17]:
review_clean_str = review_clean.apply(listToString)

## Bag of Words

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(binary=True, vocabulary = vocabulary)
cv.fit(review_clean_str)
X_bow = cv.transform(review_clean_str)

In [19]:
X_bow.toarray().shape

(346355, 500)

In [20]:
bag_of_words_df = pd.DataFrame(X_bow.toarray(), index = reviews['reviewerID'], columns = vocabulary)
bag_of_words_df.head(10)

Unnamed: 0_level_0,use,product,one,like,work,get,take,good,would,great,...,stand,claim,measur,simpli,gone,difficult,absorb,odor,famili,idea
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ALC5GH8CAMAI7,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
AHKSURW85PJUE,0,0,1,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
A38RMU1Y5TDP9,1,0,0,1,1,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
A1XZUG7DFXXOS4,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
A1MS3M7M7AM13X,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
AXO4PQU0XG3TG,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A28X0LT2100RL1,1,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
A1VUSWRVN8SJA8,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
A1JQDCX4LDKBZ3,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
A3RNRXOM5J2C93,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tfidf_model = TfidfVectorizer(ngram_range = (1, 1), vocabulary = vocabulary).fit(review_clean_str)

In [23]:
result = pd.DataFrame({'word': tfidf_model.get_feature_names(), 'tfidf': list(tfidf_model.idf_)})

In [24]:
result.head()

Unnamed: 0,word,tfidf
0,use,1.821326
1,product,2.179951
2,one,2.303672
3,like,2.24732
4,work,2.236394


## n-Grams

In [25]:
tfidf_ngram = TfidfVectorizer(ngram_range = (1, 2), max_features = 1000).fit(review_clean_str)

In [26]:
result2 = pd.DataFrame({'word': tfidf_ngram.get_feature_names(), 'tfidf': list(tfidf_ngram.idf_)})

In [27]:
result2.head()

Unnamed: 0,word,tfidf
0,abl,4.296929
1,absolut,5.054542
2,absorb,5.095142
3,accur,5.230855
4,acid,5.198424


# Sentiment Analysis

In [28]:
reviews['review_clean_list'] = review_clean
reviews['review_clean_str'] = review_clean_str

In [29]:
# classify sentiment into positive and negative ones
reviews['sentiment'] = ''
reviews['sentiment'][reviews['overall'] >= 4] = 'positive'
#test['sentiment'][test['polarity'] == 0] = 'neutral'
reviews['sentiment'][reviews['overall'] < 4] = 'negative'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [30]:
reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,Date,review_clean_list,review_clean_str,sentiment
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5,Handy little gadget,1294185600,"01 5, 2011",2011-01-05,"[great, littl, gadget, around, alreadi, use, l...",great littl gadget around alreadi use look spl...,positive
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4,Small & may need to encourage battery,1329523200,"02 18, 2012",2012-02-18,"[would, recommend, travel, magnifi, occasion, ...",would recommend travel magnifi occasion readin...,positive
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4,Very good but not great,1275955200,"06 8, 2010",2010-06-08,"[like, qualiti, len, built, light, len, discer...",like qualiti len built light len discern disto...,positive
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4,great addition to your purse,1202428800,"02 8, 2008",2008-02-08,"[love, great, point, light, pocket, magnifi, w...",love great point light pocket magnifi work gre...,positive
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5,Very nice and convenient.,1313452800,"08 16, 2011",2011-08-16,"[nice, pull, magnifi, want, light, come, slide...",nice pull magnifi want light come slide back w...,positive


In [31]:
# Sentiment 
reviews.groupby('sentiment').reviewText.count()

sentiment
negative     66554
positive    279801
Name: reviewText, dtype: int64

# Modeling

Develop Sentiment Analysis predictive models (binary classification) in Python, using Jupyter Notebook or any other
tool of your choice. Apply LogisticRegression, SVM, RandomForest classification algorithms (you can also choose any
three classification algorithms of your choice). Apply Data Science Process Model as a guide. (5 points)

In [32]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import os
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model, datasets
from sklearn.metrics import confusion_matrix

In [33]:
dummy_y = pd.get_dummies(reviews['sentiment'],drop_first=True)   #reduce one degree of freedom
reviews['dummy_y'] = dummy_y

In [34]:
x = bag_of_words_df
y = reviews['dummy_y']
X_train, X_test, y_train, y_test = train_test_split(x, y ,test_size=0.3, random_state=42)

## Logistics Regression

In [35]:
# The default logistic regression
logistic = linear_model.LogisticRegression()
logistic.fit(X_train,y_train)

y_pred = logistic.predict(X_test)
y_score = logistic.predict_proba(X_test)[:,1]

print('Accuracy of logistic regression classifier on test set: {:.5f}'.format(logistic.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.83157


In [36]:
logistic.get_params

<bound method BaseEstimator.get_params of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)>

In [None]:
# Tune parameters
import warnings
warnings.filterwarnings("ignore")

params = {'C':[0.0001, 1, 100, 1000],
          'max_iter':[1, 10, 100, 500],
          'class_weight':['balanced', None],
          'solver':['liblinear','sag','lbfgs','newton-cg']
         }
lr = LogisticRegression()
grid_search = GridSearchCV(lr, param_grid=params, cv=10)
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
# Best logistic model
best_model_log = LogisticRegression(**grid_search.best_params_)
best_model_log.fit(X_train, y_train)

y_pred = best_model_log.predict(X_test)
y_score = best_model_log.predict_proba(X_test)[:,1]

print('Accuracy of best_model_log classifier on test set: {:.5f}'.format(best_model_log.score(X_test, y_test)))

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
#Compute precision, recall, F-measure and support
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
#roc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
def roc(y_test, y_pred, y_score):
    logit_roc_auc = roc_auc_score(y_test, y_pred)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    plt.figure()
    plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

In [None]:
roc(y_test, y_pred, y_score)

## RandomForest

In [None]:
# The default Random Forest
rf_class = RandomForestClassifier()
rf_class.fit(X_train,y_train)

y_pred=rf_class.predict(X_test)
y_score = rf_class.predict_proba(X_test)[:,1]

print('Accuracy of RF classifier on test set: {:.5f}'.format(rf_class.score(X_test, y_test)))

In [None]:
#Tune parameters
params = {"max_depth": range(10,50,3),
           "n_estimators": range(10,50,3),
           'criterion' :['gini', 'entropy'],
           'max_features': ['auto', 'sqrt', 'log2']
          }
rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid=params, cv=10)
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
# Best RF 
best_model_rf = RandomForestClassifier(**grid_search.best_params_,)
best_model_rf.fit(X_train, y_train)

y_pred = best_model_rf.predict(X_test)
y_score = best_model_rf.predict_proba(X_test)[:,1]

print('Accuracy of best_model_rf classifier on test set: {:.5f}'.format(best_model_rf.score(X_test, y_test)))

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
#Compute precision, recall, F-measure and support
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
roc(y_test, y_pred, y_score)

## SVM

In [None]:
# The default SVM model
from sklearn import svm

svmclf = svm.SVC()
svmclf.fit(X_train, y_train)

y_pred = svmclf.predict(X_test)
y_score = logistic.predict_proba(X_test)[:,1]

print('Accuracy of SVM classifier on test set: {:.5f}'.format(svmclf.score(X_test, y_test)))

In [None]:
#Tune parameters
params = [{'kernel': ['rbf'], 'gamma': [0.001, 0.0001],'C': [1, 10, 100, 1000]},
          {'kernel': ['linear'], 'gamma': [0.001, 0.0001], 'C': [1, 10, 100, 1000]}]
svm = svm.SVC()
grid_search = GridSearchCV(svm, param_grid=params, cv=10)
grid_search.fit(X_train,y_train)

In [None]:
grid_search.best_params_

In [None]:
# Best svm model
from sklearn import svm
best_model_svm = svm.SVC(**grid_search.best_params_,probability=True)
best_model_svm.fit(X_train, y_train)

y_pred = best_model_svm.predict(X_test)
y_score = best_model_svm.predict_proba(X_test)[:,1]

print('Accuracy of best_model_svm classifier on test set: {:.5f}'.format(best_model_svm.score(X_test, y_test)))

In [None]:
print(confusion_matrix(y_test, y_pred))

In [None]:
#Compute precision, recall, F-measure and support
print(classification_report(y_test, y_pred))

In [None]:
roc(y_test, y_pred, y_score)