## Amazon Food reviews Analysis

## Import Necessary Modules 

In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import string
import seaborn as sn
import os
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer  # Bag of words
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer
import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [2]:
os.listdir()

['database.sqlite',
 'Reviews.csv',
 '.DS_Store',
 'Amazon Fine Food Reviews Analysis.ipynb',
 'Amazon Food Reviews Analysis.ipynb',
 'GoogleNews-vectors-negative300.bin',
 '\u200eciteseerx.ist.psu.edu:viewdoc:download?doi=10.1.1.176.6780&rep=rep1&type=pdf.pdf',
 'PID & KF.pdf',
 'SOLS',
 'Amazon Review.ipynb',
 'hashes.txt',
 '.ipynb_checkpoints']

## Get data ready

In [3]:
#Data present in database is extracted and read using pandas and sqlite3 connection

In [4]:
con=sqlite3.connect('database.sqlite')

In [5]:
filtered_data=pd.read_sql_query(""" 
SELECT * FROM Reviews 
WHERE Score!=3
""",con)

In [6]:
filtered_data

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
525809,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
525810,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
525811,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
525812,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [7]:
filtered_data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [8]:
# the data is filtered such that all the rows containing score = 3 is eliminated
filtered_data.shape

(525814, 10)

In [9]:
data=pd.read_sql_query("""
SELECT * FROM Reviews
""",con)

In [10]:
data.shape

(568454, 10)

In [11]:
data.shape[0]-filtered_data.shape[0]

42640

In [12]:
#Therefore there is loss of 42640 rows

In [13]:
# we should replace score with positive or negative 

In [14]:
def partition(x):
    if x>3:
        return 'positive'
    return 'negative'

In [15]:
# we created a function which returns positive if score>3 else negative i.e score<3

In [16]:
score=filtered_data['Score']
positiveNegative=score.map(partition)
filtered_data['Score']=positiveNegative

In [17]:
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [18]:
## Therefore the score column now consists of text 'positive' or 'negative'

In [19]:
filtered_data.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

## Data Deduplication : Cleaning Data

In [20]:
# there are many duplicates in the database for ex 

In [21]:
dup=pd.read_sql_query("""
SELECT * FROM Reviews WHERE
Score!=3 AND UserId="AR5J8UI46CURR"
""",con)
dup

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [22]:
# tha above data contains multiple duplicates 

In [23]:
#first lets sort the data according to product id
sorted_data=filtered_data.sort_values('ProductId',axis=0,ascending=True)

In [24]:
sorted_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc..."
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...


In [25]:
#Now lets drop duplicates in the dataframe
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first',inplace=False)
print("Shape of the dataframe after dropping duplicates :" , final.shape)

Shape of the dataframe after dropping duplicates : (364173, 10)


In [26]:
# % of data remaining in the datset after cleaning
(final["Id"].size*1)/(filtered_data["Id"].size*1)*100

69.25890143662969

In [27]:
# another common sense aspect of data cleaning is helpfullness numerator should be less than helpullness denominator
dup=pd.read_sql_query("""
SELECT * FROM Reviews 
WHERE HelpfulnessNumerator>HelpfulnessDenominator AND Score!=3
""",con)

In [28]:
dup

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...
1,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...


In [29]:
#Therefore lets clean the final
final=final[final['HelpfulnessNumerator']<=final['HelpfulnessDenominator']]

In [30]:
final.shape

(364171, 10)

In [31]:
# Lets analyse our final dataset 
print(final.shape)

#Lets also count the total number of positive and negative present 
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [32]:
final.columns

Index(['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator',
       'HelpfulnessDenominator', 'Score', 'Time', 'Summary', 'Text'],
      dtype='object')

In [33]:
final['Text'].shape

(364171,)

# Text Preprocessing

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative reviews

## To remove urls from the text
#### Syntax:
text = re.sub(r"http\S+","",text)

### To remove html tags
#### Syntax

from bs4 import BeautifulSoup ;

text=" <br> Hello Worlds </br>" ;

soup = BeautifulSoup(text, 'lxml') ;

text = soup.get_text() ;

print(text) ;

### To decontracte text
#### Function :
import re
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

### Remove words with numbers 

text=" Hello 123n JNK100 Worls " ;

text = re.sub("\S*\d\S*", "", text).strip() ;

print(text)

### Remove Special characters 

text = 'hello !@#$%^&6543 Hwolsgc' ;

text = re.sub('[^A-Za-z0-9]+', ' ', text) ;

print(text)

In [34]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# instead of <br /> if we have <br/> these tags would have revmoved in the 1st step

stopword=set(stopwords.words('english'))
stopword.add('br')

In [35]:
def decontracted(phrase):
    #Specific case 
    phrase=re.sub(r"won't","will not",phrase)
    phrase=re.sub(r"can't","can not",phrase)
    
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [36]:
# Combining all the steps to perform Text preprocessing 
from tqdm import tqdm       # tqdm is for viewing status bar 
preprocessed_reviews=[]
for sentance in tqdm(final['Text'].values):
    sentance=re.sub(r"http\S+","",sentance)
    sentance=BeautifulSoup(sentance,"lxml").get_text()
    sentance=decontracted(sentance)
    #sentance=re.sub("\S\d\S","",sentance).strip()
    sentance=' '.join(e.lower() for e in sentance.split() if e.isalpha())
    sentance=re.sub('[^A-Za-z0-9]+',' ',sentance)
    sentance=' '.join(e.lower() for e in sentance.split() if e.lower() not in stopword)
    preprocessed_reviews.append(sentance.strip())

100%|██████████| 364171/364171 [01:52<00:00, 3231.54it/s]


In [37]:
final['Cleaned Text']=preprocessed_reviews

In [38]:
final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned Text
138706,150524,0006641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witty little book makes son laugh recite car d...
138688,150506,0006641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew reading sendak watching really rosie movi...
138689,150507,0006641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn months learn poems thro...
138690,150508,0006641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great little book read nice rhythm well good r...
138691,150509,0006641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetry months goes month cute little poem...
...,...,...,...,...,...,...,...,...,...,...,...
178145,193174,B009RSR8HO,A4P6AN2L435PV,romarc,0,0,positive,1350432000,LOVE!! LOVE!!,"LOVE, LOVE this sweetener!! I use it in all m...",love use unsweetened flavored dd unsweetened f...
173675,188389,B009SF0TN6,A1L0GWGRK4BYPT,Bety Robinson,0,0,positive,1350518400,Amazing!! Great sauce for everything!,You have to try this sauce to believe it! It s...,try sauce believe starts little sweet honey ta...
204727,221795,B009SR4OQ2,A32A6X5KCP7ARG,sicamar,1,1,positive,1350604800,Awesome Taste,I bought this Hazelnut Paste (Nocciola Spread)...,bought hazelnut paste local shop palm taste ex...
5259,5703,B009WSNWC4,AMP7K1O84DH1T,ESTY,0,0,positive,1351209600,DELICIOUS,Purchased this product at a local store in NY ...,purchased product local store ny kids love qui...


## Cleaning summary

In [39]:
preprocessed_summary=[]
for sentance in tqdm(final['Summary'].values):
    sentance=re.sub(r"http\S+","",sentance)
    sentance=BeautifulSoup(sentance,"lxml").get_text()
    sentance=decontracted(sentance)
    #sentance=re.sub("\S\d\S","",sentance).strip()
    
    sentance=re.sub('[^A-Za-z0-9]+',' ',sentance)
    sentance=' '.join(e.lower() for e in sentance.split() if e.isalpha())
    sentance=' '.join(e.lower() for e in sentance.split() if e.lower() not in stopword)
    preprocessed_summary.append(sentance.strip())

  ' Beautiful Soup.' % self._decode_markup(markup)
100%|██████████| 364171/364171 [01:14<00:00, 4885.22it/s]


In [40]:
final["Cleaned Summary"]=preprocessed_summary
final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned Text,Cleaned Summary
138706,150524,0006641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witty little book makes son laugh recite car d...,every book educational
138688,150506,0006641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew reading sendak watching really rosie movi...,love book miss hard cover version
138689,150507,0006641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn months learn poems thro...,chicken soup rice months
138690,150508,0006641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great little book read nice rhythm well good r...,good swingy rhythm reading aloud
138691,150509,0006641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetry months goes month cute little poem...,great way learn months
...,...,...,...,...,...,...,...,...,...,...,...,...
178145,193174,B009RSR8HO,A4P6AN2L435PV,romarc,0,0,positive,1350432000,LOVE!! LOVE!!,"LOVE, LOVE this sweetener!! I use it in all m...",love use unsweetened flavored dd unsweetened f...,love love
173675,188389,B009SF0TN6,A1L0GWGRK4BYPT,Bety Robinson,0,0,positive,1350518400,Amazing!! Great sauce for everything!,You have to try this sauce to believe it! It s...,try sauce believe starts little sweet honey ta...,amazing great sauce everything
204727,221795,B009SR4OQ2,A32A6X5KCP7ARG,sicamar,1,1,positive,1350604800,Awesome Taste,I bought this Hazelnut Paste (Nocciola Spread)...,bought hazelnut paste local shop palm taste ex...,awesome taste
5259,5703,B009WSNWC4,AMP7K1O84DH1T,ESTY,0,0,positive,1351209600,DELICIOUS,Purchased this product at a local store in NY ...,purchased product local store ny kids love qui...,delicious


In [41]:
# # find sentance containing html tags
# i=0
# for sent in final['Text'].values:
#     if(len(re.findall('<.*?>',sent))):     
#         print(i)
#         print(sent)
#         break;
#     i=i+1
    
# # Therefore by the below text we can find there are html tags which need to be removed.

In [42]:
# remove speacial charaters
# import re
# import string
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer
# from nltk.stem.wordnet import WordNetLemmatizer
# #nltk.download('stopwords')

# #get the set of stopwords
# stop=set(stopwords.words('english'))
# sno=nltk.stem.SnowballStemmer('english') #initialse the snowball stemmer

# def cleanhtml(sentance):
#     cleanr=re.compile('<.#?>')
#     cleantext=re.sub(cleanr,'',sentance)
#     return cleantext

# def cleanpunc(sentance):
#     cleaned=re.sub(r'[!|?|\'|"|#]',r'',sentance)
#     cleaned=re.sub(r'[.|,|(|)|\|/|]',r' ',sentance)
#     return cleaned

# #function to remove the html tags from the sentance

# # def cleanhtml(sentance):
# #     cleanr=re.compile('<.#?>')
# #     cleantext=re.sub(cleanr,' ',sentance)
# #     return cleantext

# # def cleanpunc(sentance):
# #     cleaned=re.sub(r'[!|?|\'|"|#]',r'',sentance)
# #     cleaned=re.sub(r'[.|,|(|)|\|/]',r' ',sentance)
# #     return cleaned

# print(stop)
# print('='*116)
# print(sno.stem('tasty'))

# #Here is the list of stopwords

In [43]:
# len(final['Score'].values)+1

In [44]:
# # Code implementing text processing
# from tqdm.notebook import tqdm
# strin=''
# final_string=[]
# all_positive_words=[]
# all_negative_words=[]
# s=''
# for i in tqdm(range(len(final['Score'].values))):
#     for sent in final['Text'].values:
#         filtered_sentance=[]
#         sent=cleanhtml(sent)
#         for w in sent.split():
#             for cleanedwords in cleanpunc(w).split():
#                 if(cleanedwords.isalpha() and len(cleanedwords)>2): 
#                     if(cleanedwords.lower() not in stop):
#                         s=sno.stem(cleanedwords.lower())
#                         filtered_sentance.append(s)
#                         if(final['Score'][i])=='positive':
#                             all_positive_words.append(s)
#                         if(final['Score'][i])=='negative':
#                             all_negative_words.append(s)
#                     else:
#                         continue
#                 else:
#                     continue               
#         strin=" ".join(filtered_sentance)
#         final_string.append(strin)
        

In [45]:
# len(final_string)

In [46]:
# #final["Cleaned Text"]=final_string
# print(final_string[2])
# len(all_negative_words)
# len(all_positive_words)

In [47]:
# len(final_string)

In [48]:
# len(final['Score'])

In [49]:
#Code implementing text preprocessing
# i=0
# str_1=''
# final_string=[]
# all_positive_words=[]
# all_negative_words=[]
# s=''
# for sent in final['Text'].values:
#     filtered_sentance=[]
#     sent=cleanhtml(sent)
#     for w in sent.split():
#         for cleanedwords in cleanpunc(w).split():
#             if((cleanedwords.isalpha()) & len(cleanedwords)>2):
#                 if(cleanedwords.lower() not in stop):
#                     s=(sno.stem(cleanedwords.lower())).encode("utf8")
#                     filtered_sentance.append(s)
#                     if(final['Score'].values)[i]=='positive':
#                         all_positive_words.append(s)
#                     if(final['Score'].values)[i]=='negative':
#                         all_negative_words.append(s)
#                 else:
#                     continue
#             else:
#                 continue
                
#     #the filtered sentance
#     str_1=" ".join(filtered_sentance)
#     #print('-'*116)
#     final_string.append(str_1)
#     i=i+1  

In [50]:
# final['Cleaned Text']=final_string

In [51]:
# final

In [52]:
# conn=sqlite3.connect('final.sqlite')
# c=conn.cursor()

In [53]:
final.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Cleaned Text,Cleaned Summary
138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witty little book makes son laugh recite car d...,every book educational
138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew reading sendak watching really rosie movi...,love book miss hard cover version
138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn months learn poems thro...,chicken soup rice months
138690,150508,6641040,AZGXZ2UUK6X,"Catherine Hallberg ""(Kate)""",1,1,positive,1076025600,a good swingy rhythm for reading aloud,This is a great little book to read aloud- it ...,great little book read nice rhythm well good r...,good swingy rhythm reading aloud
138691,150509,6641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,book poetry months goes month cute little poem...,great way learn months


# Bag of Words

In [54]:
#BOW 
count_vect=CountVectorizer()
count_vect.fit(preprocessed_reviews);
print("Some features are ",count_vect.get_feature_names()[:10])
print("="*100)
final_counts=count_vect.transform(preprocessed_reviews)
print("Type of count vectorizer ",type(final_counts))
print("the shape of vectorizer ",final_counts.shape)
print("The number of unique words ",final_counts.get_shape()[1])

Some features are  ['aa', 'aaa', 'aaaa', 'aaaaaaarrrrrggghhh', 'aaaaaah', 'aaaaah', 'aaaaallll', 'aaaah', 'aaaahhhhhh', 'aaaallll']
Type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of vectorizer  (364171, 97590)
The number of unique words  97590


# Bi-Grams and n-Grams 

In [55]:
# removing stopwords like not should be avoided before doing n-grams
count_vect=CountVectorizer(ngram_range=(1,2))
final_bigram_counts=count_vect.fit_transform(preprocessed_reviews)
print("The type of count vectorizer ",type(final_bigram_counts))
print("The shape of text BOW vectorizer ",final_bigram_counts.shape)
print("No of unique words including uni-grams and bi-grams ",final_bigram_counts.shape[1])

The type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
The shape of text BOW vectorizer  (364171, 3384567)
No of unique words including uni-grams and bi-grams  3384567


# TF-IDF

In [56]:
tfidf_vect=TfidfVectorizer(ngram_range=(1,2))
tfidf_vect.fit(preprocessed_reviews)
print("Some of the features ",tfidf_vect.get_feature_names()[:10])
print("="*50)
final_tfidf_vect=tfidf_vect.transform(preprocessed_reviews)
print("The type of count vectorizer ",type(final_tfidf_vect))
print("The shape of TF-IDF text vectorizer ",final_tfidf_vect.shape)
print("No of unique words in tfidf vector ",final_tfidf_vect.shape[1])

Some of the features  ['aa', 'aa absolute', 'aa amazon', 'aa batteries', 'aa beans', 'aa big', 'aa brand', 'aa cell', 'aa coffee', 'aa columbian']
The type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
The shape of TF-IDF text vectorizer  (364171, 3384567)
No of unique words in tfidf vector  3384567


# Word-2-Vec

In [57]:
# Using Google News Word2Vectors

# in this project we are using a pretrained model by google
# its 3.3G file, once you load this into your memory 
# it occupies ~9Gb, so please do this step only if you have >12G of ram
# we will provide a pickle file wich contains a dict , 
# and it contains all our courpus words as keys and  model[word] as values
# To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# it's 1.9GB in size.


model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)

In [58]:
model.wv['computer']

  """Entry point for launching an IPython kernel.


array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [59]:
model.wv.similarity('man','woman')
# to find similarity between man and woman

  """Entry point for launching an IPython kernel.


0.76640123

In [60]:
model.wv.most_similar('man')

  """Entry point for launching an IPython kernel.


[('woman', 0.7664012312889099),
 ('boy', 0.6824870109558105),
 ('teenager', 0.6586930751800537),
 ('teenage_girl', 0.6147903203964233),
 ('girl', 0.5921714305877686),
 ('suspected_purse_snatcher', 0.571636438369751),
 ('robber', 0.5585119128227234),
 ('Robbery_suspect', 0.5584409236907959),
 ('teen_ager', 0.5549196004867554),
 ('men', 0.5489763021469116)]

## Creating our own word2vec model


In [61]:
# let's create a list of all the sentance 

In [62]:
list_of_sentances=[]
for sentance in tqdm(preprocessed_reviews):
    list_of_sentances.append(sentance.split())

100%|██████████| 364171/364171 [00:03<00:00, 113092.82it/s]


In [63]:
# create a w2v on our own dataset
w2v_model=Word2Vec(list_of_sentances,min_count=5,size=50,workers=4)

In [64]:
w2v_model.wv.most_similar('man')

[('hell', 0.7304127216339111),
 ('woman', 0.7032070755958557),
 ('joke', 0.6971577405929565),
 ('guy', 0.6842848062515259),
 ('slap', 0.663162350654602),
 ('grandfather', 0.6412335634231567),
 ('creature', 0.6375335454940796),
 ('angels', 0.6372243165969849),
 ('roommate', 0.6348912715911865),
 ('buddies', 0.6339658498764038)]

In [65]:
words=list(w2v_model.wv.vocab)
len(words)

28762

In [66]:
w2v_model.wv.most_similar('tasty')

[('satisfying', 0.8155401945114136),
 ('delicious', 0.7978771924972534),
 ('tastey', 0.7425488233566284),
 ('yummy', 0.7197772264480591),
 ('surprisingly', 0.7011789083480835),
 ('hearty', 0.7003331780433655),
 ('nutritious', 0.6986072659492493),
 ('filling', 0.6903156042098999),
 ('flavorful', 0.6809139251708984),
 ('healthy', 0.6458529829978943)]

In [67]:
w2v_model.wv.most_similar('like')

[('weird', 0.7702783942222595),
 ('okay', 0.7691383957862854),
 ('kind', 0.7370659708976746),
 ('gross', 0.7236054539680481),
 ('pretty', 0.7001179456710815),
 ('ok', 0.6839250326156616),
 ('odd', 0.6772271394729614),
 ('sort', 0.676693320274353),
 ('real', 0.6710270643234253),
 ('funny', 0.6649947166442871)]

## Average word2Vec

In [68]:
# average Word2Vec
# compute average word2vec for each review.
sent_vectors=[]
for sent in tqdm(list_of_sentances):
    sent_vec=np.zeros(50) 
    cnt_words=0
    for word in sent:
        if word in words:
            vec=w2v_model.wv[word]
            sent_vec+=1
            cnt_words+=1
        if cnt_words!=0:
            sent_vec/=cnt_words
        sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

100%|██████████| 364171/364171 [11:59<00:00, 506.01it/s]

10755186
50





## Avg TF-IDF

In [69]:
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
model = TfidfVectorizer()
model.fit(preprocessed_reviews)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [70]:
# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sentances): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in words and word in tfidf_feat:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1

100%|██████████| 364171/364171 [7:14:39<00:00, 13.96it/s]       


In [74]:
len(tfidf_sent_vectors)

364171