# Milestone 2

(...)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import gzip
%matplotlib inline

In [2]:
REVIEWS_PATH = "reviews_Grocery_and_Gourmet_Food.json.gz"
META_PATH = "meta_Grocery_and_Gourmet_Food.json.gz"

## Data Sanitizing

In [3]:
def sanitize(path, outpath):
    """Converts a given compressed json to strict json and writes it in a new file

    Parameters
    ----------
    path : str
    The file location of the gzip-compressed json file
    outpath : str
    The path to the desired output file location 

    """
    g = gzip.open(path, 'r')

    out = open(outpath, 'w')

    for l in g:
        out.write(json.dumps(eval(l)) + '\n')
    out.close()

In [4]:
META_OUTPATH = "cleaned_meta.json"
REVIEWS_OUTPATH = "cleaned_reviews.json"

sanitize(META_PATH, META_OUTPATH)
sanitize(REVIEWS_PATH, REVIEWS_OUTPATH)

## Data Import

In [5]:
REVIEWS_PATH = "cleaned_reviews.json"
META_PATH = "cleaned_meta.json"

In [6]:
# Read-in the reviews
reviews = pd.read_json(REVIEWS_PATH, lines=True)

reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1ZQZ8RJS1XVTX,0657745316,gsxrgirl,"[0, 0]","No sugar, no GMO garbage, no fillers that come...",5,Best vanilla I've ever had,1381449600,"10 11, 2013"
1,A31W38VGZAUUM4,0700026444,FIFA Lvr,"[1, 1]","This is my absolute, undisputed favorite tea r...",5,Terrific Tea!,1354752000,"12 6, 2012"
2,A3I0AV0UJX5OH0,1403796890,Alicia b,"[0, 0]",I ordered spongbob slippers and I got John Cen...,1,grrrrrrr,1385942400,"12 2, 2013"
3,A3QAAOLIXKV383,1403796890,"Danny K. Tilley ""Dan Tilley""","[0, 0]",The cart is fine and works for the purpose for...,3,Storage on Wheels Cart,1307836800,"06 12, 2011"
4,AB1A5EGHHVA9M,141278509X,CHelmic,"[1, 1]",This product by Archer Farms is the best drink...,5,The best drink mix,1332547200,"03 24, 2012"


In [7]:
#Drop some of the columns
reviews = reviews.drop(columns=['reviewerName', 'helpful', 'reviewTime', 'summary'])

In [8]:
#Convert the utc timestamp to readable dates
reviews['unixReviewTime'] = pd.to_datetime(reviews['unixReviewTime'],unit='s')

reviews.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",5,2013-10-11
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",5,2012-12-06
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,1,2013-12-02
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,3,2011-06-12
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,5,2012-03-24


## Data exploration

In [9]:
#TODO

## Labelling data

Using the Amazon dataset, our goal is to detect any potential harmful products by analyzing the user reviews and classifying them as presenting or not a possible health threat. Unfortunately, we lack the annotated data that would be needed to build a classifier.

We will thus create our own annotated data by using **topic modelling** with the **Latent Dirichlet Allocation (LDA)** model. Our hope is that reviews of potential health-threatening products will be assigned to their own topic, topic that we would then be able to find by analyzing the words weights associated to that topic. Our labelling data pipeline is thus as follows :
* We first start by sampling our reviews dataframe to work on a smaller set of reviews (for efficiency reasons).
* We then preprocess the reviews : we remove any stopwords and stem the words with the help of **nltk** library to standardize them.
* Using the **gensim** library, we create a corpus representing all stemmed reviews in a **bag of words** representation
* Once we have our corpus, we create and train our LDA model to create our topics.
* Now that we have our topics, we explore them to find health-related ones.
* Finally, we assign each reviews to a topic by querying our model.

In [10]:
import nltk
import gensim

In [11]:
#Download stopwords and wordnet for stemming (only need to be executed once)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/fares/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fares/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
reviews_label = reviews.sample(frac=0.15, replace=True, random_state=1000)

reviews_label.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime
1156531,A3IME1W084WPA0,B009ZLYWSG,I love this stuff. The smell is incredible wh...,5,2014-02-07
1120327,AS4WXC7O3BCKJ,B008VHEJS8,I heard about this little treat while listenin...,4,2012-11-30
134848,A2JNO2GU86EU6X,B000EUF9CK,"If you like dark chocolate, this is a good one...",5,2012-11-25
376059,A1JXD31MPJIUB6,B0017SRABQ,My Mom seemed to enjoy this alfredo sauce but ...,2,2010-10-04
18782,A2ARRIR0WCLDSN,B0001590IC,This is one of the smoothest tasting teas I've...,5,2013-11-20


In [45]:
r = reviews_label[reviews_label['overall'] < 3]
r[r['reviewText'].str.contains('health')]

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime
1010426,A2PAC9IAV4RLWV,B006BXV1H6,I was intrigued by this product because I love...,2,2012-02-24
73743,A3E6YXRC5FHZE5,B0009XQTA8,This is the 4th can that has spilled over whil...,2,2013-01-22
737638,A3FBW08ZIUYXQX,B003TO9SJS,This is not pure cold pressed extra virgin oli...,1,2014-03-15
393987,A1H7Y5XKPGT0OS,B001BM62T4,This is a premium Rooibos herbal tea (with no ...,1,2009-06-02
120718,A2WA0OX8P6NX4P,B000EGX2EG,I'm a little shocked that this so-called healt...,1,2010-01-31
...,...,...,...,...,...
240875,A1DB1MK7KADW4Z,B000LKXRNQ,The shipping was quick and the packaging great...,2,2011-07-07
105622,A3OH4OZFZGEH75,B000E46GFA,Not only does this product have too much sugar...,1,2013-03-05
884818,A2XKJ1KX6XUHYP,B004TDU0SG,Mustard sauce sells for 50% more and it is muc...,2,2013-10-01
1051235,A33PVCHCQ2BTN0,B007JFMIWW,I had high hopes for the Quaker Stila bars. I...,2,2012-08-02


Create a function to process the reviews using the nltk library :
* We tokenize the sentence,
* remove any potential stop words,
* remove tokens containing only punctuations (such as '!!!', '...', etc.. which where quite common),
* remove words below a given length,
* stem the words to have them all represented in a standardized way. 

In [46]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

stop_words = set(stopwords.words('english')) 

stemmer = PorterStemmer()

def process_text(sentence):
    token_words = nltk.word_tokenize(sentence)
    no_stopwords = [word.lower() for word in token_words if word not in stop_words and not \
                    all(c in string.punctuation for c in word) and not len(word) < 2]
    return [stemmer.stem(word) for word in no_stopwords]

print(process_text('I ordered spongbob slippers and I got John'))

['order', 'spongbob', 'slipper', 'got', 'john']


We add a new column to our dataframe containing the processed reviewText (notice that we only consider reviews with a low score, under the fair assumption that reviews exposing health issues would have a low rating).

In [47]:
reviews_label = reviews_label[reviews_label['overall'] < 3]
reviews_label['reviewStemmed'] = reviews_label['reviewText'].apply(lambda x : process_text(x))

reviews_label.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime,reviewStemmed
376059,A1JXD31MPJIUB6,B0017SRABQ,My Mom seemed to enjoy this alfredo sauce but ...,2,2010-10-04,"[my, mom, seem, enjoy, alfredo, sauc, pleas, t..."
219434,A18B0W508EEQZ7,B000JIN1H2,This product has all the texture of eating Sty...,1,2014-03-18,"[thi, product, textur, eat, styrofoam, ca, n't..."
1057182,A2I1MLM3X98M1E,B007N8SCBG,I ordered these for my son's birthday party. T...,2,2013-04-14,"[order, son, 's, birthday, parti, they, hard, ..."
720671,AXNLOYKTK4XAS,B003NGT6F8,Wish I could give this 0 stars!! I bought this...,1,2012-01-27,"[wish, could, give, star, bought, coupl, month..."
282123,A32FXPKNOHE3K,B000Q6KY86,"Often in the grocery store, you see these 3 in...",2,2014-02-25,"[often, groceri, store, see, inch, diamet, rol..."


Now, we create a dictionnary containing all the words found in our processed reviews, and our corpus consisting of all reviews in a bag of words representation.

In [55]:
from gensim import corpora
dictionary = corpora.Dictionary(reviews_label['reviewStemmed'])

corpus = [dictionary.doc2bow(text) for text in reviews_label['reviewStemmed'].values]

We now instantiante and train our LDA model.

In [56]:
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = 35, passes=15, id2word=dictionary, minimum_probability=0)

Let's now have a look at our found topics 
Interestings topics : 20 + 33 (organ keyword) and 24 (health keyword)

In [57]:
ldamodel.print_topics(num_topics=-1)

[(0,
  '0.036*"milk" + 0.014*"thi" + 0.011*"product" + 0.009*"kitchen" + 0.008*"\'s" + 0.008*"water" + 0.007*"gallon" + 0.007*"tuscan" + 0.007*"live" + 0.006*"put"'),
 (1,
  '0.053*"soup" + 0.034*"noodl" + 0.017*"chicken" + 0.016*"bowl" + 0.015*"bag" + 0.011*"the" + 0.009*"microwav" + 0.009*"water" + 0.009*"cook" + 0.008*"get"'),
 (2,
  '0.039*"jar" + 0.031*"honey" + 0.016*"product" + 0.015*"\'s" + 0.014*"babi" + 0.012*"mushroom" + 0.011*"thi" + 0.009*"truffl" + 0.009*"the" + 0.008*"n\'t"'),
 (3,
  '0.030*"flavor" + 0.029*"sauc" + 0.024*"like" + 0.023*"tast" + 0.021*"n\'t" + 0.017*"chees" + 0.016*"vanilla" + 0.015*"use" + 0.014*"\'s" + 0.013*"the"'),
 (4,
  '0.061*"bean" + 0.021*"coffe" + 0.016*"candi" + 0.014*"buy" + 0.014*"year" + 0.013*"bag" + 0.012*"tast" + 0.010*"old" + 0.010*"the" + 0.010*"\'s"'),
 (5,
  '0.017*"product" + 0.015*"\'s" + 0.013*"use" + 0.012*"n\'t" + 0.012*"tri" + 0.008*"time" + 0.008*"make" + 0.008*"eat" + 0.007*"one" + 0.007*"food"'),
 (6,
  '0.120*"coffe" + 0.02

Now that we have our topics, we create a new column in our dataframe that tells us in which topic would that particular review be :

In [58]:
reviews_label['topic'] = reviews_label['reviewStemmed'].apply( \
                        lambda x: sorted(ldamodel.get_document_topics(dictionary.doc2bow(x)), \
                                key=lambda x: (x[1]), reverse=True)[0][0])
reviews_label.head() 

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime,reviewStemmed,topic
376059,A1JXD31MPJIUB6,B0017SRABQ,My Mom seemed to enjoy this alfredo sauce but ...,2,2010-10-04,"[my, mom, seem, enjoy, alfredo, sauc, pleas, t...",25
219434,A18B0W508EEQZ7,B000JIN1H2,This product has all the texture of eating Sty...,1,2014-03-18,"[thi, product, textur, eat, styrofoam, ca, n't...",25
1057182,A2I1MLM3X98M1E,B007N8SCBG,I ordered these for my son's birthday party. T...,2,2013-04-14,"[order, son, 's, birthday, parti, they, hard, ...",12
720671,AXNLOYKTK4XAS,B003NGT6F8,Wish I could give this 0 stars!! I bought this...,1,2012-01-27,"[wish, could, give, star, bought, coupl, month...",5
282123,A32FXPKNOHE3K,B000Q6KY86,"Often in the grocery store, you see these 3 in...",2,2014-02-25,"[often, groceri, store, see, inch, diamet, rol...",15


Topic 33 seems more related to misinformative product description than health concerns.

In [66]:
reviews_label[reviews_label['topic'] == 33].reviewText.values[:5]

array(['When we bought this item, it was because we did a search for "Vermicelli". On the package, in fine print, it says "enriched macaroni product". This is not appropriate when it comes to selling a product. If I want a macaroni product, I will go to the macaroni section...',
       "This is not pure cold pressed extra virgin olive oil.  It's probably been mixed with other unhealthy oils. The true test to see if you have the real thing, place some in the fridge for a few hours.  If it turns solid, it's authentic cold pressed.  If it's still in liquid form, it's fake. This &#34;olive&#34; oil may have a decent taste, but it's certainly not the healthy, good for you oil that you may think you're getting.",
       "I found this item when I searched for Coconut Aminos.  I didn't look at the label close enough and was surprised, when they arrived, to find that this is a SOY PROTEIN product.  The whole reason for ordering this was for a soy sauce alternative.  My son is severely allergic 

Topic 20 seems more related to taste than health concerns.

In [65]:
reviews_label[reviews_label['topic'] == 20].reviewText.values[:5]

array(["Be warned:  the bananas in this variety pack are DISGUSTING!  The apples and pears taste good, like the fresh fruit tastes.  The bananas however are quite nasty and do not taste anything like fresh bananas.  It might be because of the citric acid in them, but it's got a funky smell and a tangy bite to it.  Even the color is off.  I bought this variety pack thinking I was getting a deal this way, but now that I'm throwing away all four jars of the banana it's ended up being quite expensive.  I won't be buying the variety pack again.  I'll buy jars of apple and pear again, but I'll be sticking with fresh bananas from here on.",
       "I was recently given the opportunity to try this product for free because I am a BzzAgent. I was surprised when I tasted this shake because it wasn't at all what I expected. I expected it to be a more true vanilla cappuccino flavor. Instead it tasted artificial and nothing at all like a cappuccino should taste. I couldn't even drink it all. If you 

Let us now have a quick look at the reviews associated with the topic 24 :

In [63]:
healthTopic = reviews_label[reviews_label['topic'] == 24].reviewText.values

print(healthTopic[1])
print("---")
print(healthTopic[3])
print("---")
print(healthTopic[7])

Don't be fooled by the statement &#34;No MSG Added&#34;.  I used purchased this from the grocery store back before I educated myself about reading food labels.  It tastes great but contains MSG in the form of Hydrolyzed Soy Protein and Autolyzed Yeast Extract.  Please research MSG before you consume this or any product containing it.
---
I'm day by day amazed to realized how many people in this country still doesn't know what is MSG (Monosodium Glutamate) and the dangerous effects it poses to anyone's health.MSG INDUCES OBESITY:MSG is actually injected into laboratory rats to induce obesity.  There are many many scientific studies done using these MSG treated rats since rats are not naturally overweight.  It also has been shown to increase appetite in male rats and to induce obesity in female rats and chickens.  Scientists in Spain have recently concluded that MSG when given to mice increase appetite by as much as 40%.CANCER: MSG AND CANCERThere is evidence that suggests that MSG produ

It seems like the topic 24 is the one we were looking for!

Now we can filter out our dataframe to retain only the reviews that fall into that topic, to get our labelled data :

In [67]:
labels = reviews_label[reviews_label['topic'] == 24].copy()

labels.to_pickle('labels')

labels.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime,reviewStemmed,topic
581440,A2D2ONT8HXDGP3,B0029JDVFQ,xylitol mints by epic dental were better. xyli...,2,2013-10-03,"[xylitol, mint, epic, dental, better, xylitol,...",24
260554,A1JLX1VBP5J8VP,B000N7YKQK,Don't be fooled by the statement &#34;No MSG A...,1,2014-03-28,"[do, n't, fool, statement, 34, no, msg, ad, 34...",24
1154819,A3QQ7CL7HLQOTQ,B009XG6W40,Although my family enjoyed the flavor it added...,2,2013-06-03,"[although, famili, enjoy, flavor, ad, soup, re...",24
9828,A2L73KNIMIF3TY,B0000EWQIA,I'm day by day amazed to realized how many peo...,1,2013-06-29,"['m, day, day, amaz, realiz, mani, peopl, coun...",24
246750,A3NL396F9QYDV,B000LQL9SK,I have a pantry and spice box full of Ziyad (a...,2,2014-06-30,"[pantri, spice, box, full, ziyad, shan, produc...",24


In [68]:
ldamodel.save('ldamodel.model')

## What's up next