# Amazon Fine Food Reviews Analysis

In [36]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn import metrics
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm_notebook as tqdm
import os
from pathlib import Path

from bs4 import BeautifulSoup
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split

# Reading Data

In [2]:
path = Path('data')
list(iter(path.iterdir()))

[PosixPath('data/database.sqlite'),
 PosixPath('data/GoogleNews-vectors-negative300.bin.gz'),
 PosixPath('data/hashes.txt'),
 PosixPath('data/Reviews.csv'),
 PosixPath('data/w2v-tfidf'),
 PosixPath('data/w2vtfidf')]

In [4]:
con = sqlite3.connect(path/'database.sqlite')
df = pd.read_sql_query(""" SELECT * FROM reviews """, con)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


## Exploratory analysis

In [5]:
df.shape

(568454, 10)

In [6]:
sns.set_style('whitegrid')

In [7]:
def sentiment_score(x):
    if x > 3:
        return 'positive'
    elif x < 3:
        return 'negetive'
    else:
        return 'neutral'

df['Sentiment'] = df.Score.map(sentiment_score)
print("Number of data points in our data", df.shape)
df.head()

Number of data points in our data (568454, 11)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Sentiment
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,positive
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,negetive
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,positive
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,negetive
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,positive


In [13]:
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con) 

def partition(x):
    if x < 3:
        return 0
    return 1

actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative
print("Number of data points in our data", filtered_data.shape)
filtered_data.head(3)

Number of data points in our data (525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


## Text Preprocessing.

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative reviews

### Data Cleaning: Deduplication

In [14]:
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')


In [15]:
final = sorted_data.drop_duplicates(subset={'UserId', 'ProfileName', 'Time', 'Text'}, keep='first', inplace=False)
final.shape

(364173, 10)

In [16]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [17]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(364171, 10)


1    307061
0     57110
Name: Score, dtype: int64

In [21]:
final = final.sort_values(['Time'], ascending=False, axis=0)

In [23]:
final = final.head(100000)
final.sample(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
68355,74347,B004CFJ140,A22VBA3G1L6I53,Dinah Lee,0,0,1,1342483200,Happy cats!,All three cats and the dog LOVE these! It's w...
496267,536527,B000LKUVXU,A20LEW28TNSY0K,"Justin Harvey ""Justus""",0,0,1,1333152000,"Great drink, horrible price amazon.",Blue Sky Juiced Energy is the only energy drin...
46496,50580,B0000DGG22,A33GIBE391Q4B8,Bruno,0,0,1,1346198400,Great Customer Service!,Initially purchased this plant about a month a...
339879,367719,B003B3OOPA,A1N9FBBZWZ2ADQ,Iasonis,1,1,1,1341360000,Great product,I use this as a body lotion as well as in lip ...
404993,437962,B001RJ1FAW,A22H0CDHVYMA7B,CottonBallGirl,0,0,1,1343001600,Emerald Harmony Deluxe Fruit,Emerald Harmony Deluxe Fruit I shared with fam...


In [24]:
final.shape

(100000, 10)

In [25]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# instead of <br /> if we have <br/> these tags would have revmoved in the 1st step

stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [27]:
stemmer = SnowballStemmer('english')
le=WordNetLemmatizer()

In [28]:
# Combining all the above
preprocessed_reviews = []
preprocessed_reviews_stem = []
preprocessed_reviews_lema = []
# tqdm is for printing the status bar
for sentence in tqdm(final['Text'].values):
    sentence = re.sub(r"http\S+", "", sentence)
    sentence = BeautifulSoup(sentence, 'lxml').get_text()
    sentence = decontracted(sentence)
    sentence = re.sub("\S*\d\S*", "", sentence).strip()
    sentence = re.sub('[^A-Za-z]+', ' ', sentence)
    
    sentence = re.sub('<.*?>', ' ', sentence)
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]',r' ', sentence)
    # https://gist.github.com/sebleier/554280
    sentence= sentence.split()
    sentence_norm = ' '.join(e.lower() for e in sentence if e.lower() not in stopwords)
    sentence_stem = ' '.join(stemmer.stem(e.lower()) for e in sentence if e.lower() not in stopwords)
    sentence_lema = ' '.join(le.lemmatize(e.lower()) for e in sentence if e.lower() not in stopwords)
    preprocessed_reviews.append(sentence_norm.strip())
    preprocessed_reviews_stem.append(sentence_stem.strip())
    preprocessed_reviews_lema.append(sentence_lema.strip())
    

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




In [30]:
preprocessed_reviews[200]

'got treats last week already gone week wish bigger bags not case side expensive not matter boys love anyway hope find work soon able give cats better food deserve'

In [31]:
preprocessed_reviews_stem[200]

'got treat last week alreadi gone week wish bigger bag not case side expens not matter boy love anyway hope find work soon abl give cat better food deserv'

In [32]:
preprocessed_reviews_lema[200]

'got treat last week already gone week wish bigger bag not case side expensive not matter boy love anyway hope find work soon able give cat better food deserve'

In [33]:
len(preprocessed_reviews_lema)

100000

In [35]:
final['CleanedReview'] = preprocessed_reviews_lema
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedReview
355171,384161,B000EVWQZW,A2PCNXBSKCABG5,Whit,0,0,1,1351209600,Versatile Mix,This mix makes a good bread or can also be use...,mix make good bread also used make pop over ne...
494746,534876,B003ZT25P6,A34ZMGNQ61MPWC,Tjmaxx,0,0,1,1351209600,Yummy!!!,This product is the closest I've come to findi...,product closest come finding pumpkin spice lat...
456497,493568,B0028GWGY2,A1U2QG2PO7TY9X,"Linda Beck ""Booker""",0,0,1,1351209600,wonderful tea,I love this tea. It is so refreshing and easil...,love tea refreshing easily brewed shared frien...
457048,494183,B007IEGR9O,A8M34GMW5GF6D,urlovejess,0,0,1,1351209600,girls bettlejuice costume,Fit perfectly. Size is true. Expected exactly ...,fit perfectly size true expected exactly descr...
319733,346131,B004TJF3BE,A2TZKSY1ZWPOU9,wejani,0,0,1,1351209600,Great Hot Cider!!!,It is hard to find much of anything sugarfree ...,hard find much anything sugarfree really taste...


In [37]:
X = final['CleanedReview']
Y = final['Score']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=0)
X_tr, X_cv, Y_tr, Y_cv = train_test_split(X_train, Y_train, test_size=.2, random_state=0)

In [39]:
print('X_train, Y_train', X_train.shape, Y_train.shape)
print('X_test, Y_test', X_test.shape, Y_test.shape)
print('X_tr, Y_tr', X_tr.shape, Y_tr.shape)
print('X_cv, Y_cv', X_cv.shape, Y_cv.shape)

X_train, Y_train (80000,) (80000,)
X_test, Y_test (20000,) (20000,)
X_tr, Y_tr (64000,) (64000,)
X_cv, Y_cv (16000,) (16000,)


In [43]:
vectorizer = CountVectorizer(min_df=10,ngram_range=(1,4))
X_train_bow = vectorizer.fit_transform(X_tr)
X_cv_bow = vectorizer.transform(X_cv)
X_test_bow = vectorizer.transform(X_test)
X_train_bow.shape, X_cv_bow.shape, X_test_bow.shape

((64000, 42908), (16000, 42908), (20000, 42908))

In [46]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,4), min_df=10)
X_train_tfidf = tf_idf_vect.fit(X_tr)
print("some sample features(unique words in the corpus)",X_train_tfidf.get_feature_names()[0:10])
X_train_tfidf = tf_idf_vect.transform(X_tr)
X_cv_tfidf = tf_idf_vect.transform(X_cv)
X_test_tfidf = tf_idf_vect.transform(X_test)

print(X_train_tfidf.shape, X_cv_tfidf.shape, X_test_tfidf.shape)
print("the number of unique words including both unigrams and bigrams ", X_train_tfidf.get_shape())

(64000, 42908) (16000, 42908) (20000, 42908)
the number of unique words including both unigrams and bigrams  (64000, 42908)


In [40]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn import model_selection
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [47]:
def RF_Classifier(X_train,X_cv,Y_train,Y_cv):
    pred_cv = []
    pred_train = []
    depths = [2, 3, 5, 8, 10, 20]
    estimators = [100, 200, 300, 400, 500]
    for d in depths:
        for e in estimators:
            clf = RandomForestClassifier(n_estimators=e, max_depth=d, n_jobs = -1, class_weight='balanced')
            clf.fit(X_train,Y_train)
            probs = clf.predict_proba(X_cv)
            prob = clf.predict_proba(X_train)
            probs = probs[:,1]
            prob = prob[:,1]
            auc_score_cv = roc_auc_score(Y_cv,probs)
            auc_score_train = roc_auc_score(Y_train,prob)
            pred_cv.append(auc_score_cv)
            pred_train.append(auc_score_train)
    pred_train = np.array(pred_train)
    pred_train = pred_train.reshape(len(depths), len(estimators))
    plt.figure(figsize=(10, 5))
    sns.heatmap(pred_train,annot=True, cmap=cmap, fmt=".3f", xticklabels=estimators,yticklabels=depths)
    plt.xlabel('Estimators')
    plt.ylabel('Depths')
    plt.show()
    print("-"*30, "AUC Score for CV data", "-"*30)
    pred_cv = np.array(pred_cv)
    pred_cv = pred_cv.reshape(len(depths),len(estimators))
    plt.figure(figsize=(10,5))
    sns.heatmap(pred_cv, annot=True, cmap=cmap, fmt=".3f", xticklabels=estimators, yticklabels=depths)
    plt.xlabel('Estimators')
    plt.ylabel('Depths')
    plt.show()

In [48]:
RF_Classifier(X_train_bow, X_cv_bow, Y_tr, Y_cv)

NameError: name 'cmap' is not defined

<Figure size 720x360 with 0 Axes>