# Milestone 2

(...)

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import gzip
%matplotlib inline

In [13]:
REVIEWS_PATH = "reviews_Grocery_and_Gourmet_Food.json.gz"
META_PATH = "meta_Grocery_and_Gourmet_Food.json.gz"

## Data Sanitizing

In [14]:
def sanitize(path, outpath):
    """Converts a given compressed json to strict json and writes it in a new file

    Parameters
    ----------
    path : str
    The file location of the gzip-compressed json file
    outpath : str
    The path to the desired output file location 

    """
    g = gzip.open(path, 'r')

    out = open(outpath, 'w')

    for l in g:
        out.write(json.dumps(eval(l)) + '\n')
    out.close()

In [15]:
META_OUTPATH = "cleaned_meta.json"
REVIEWS_OUTPATH = "cleaned_reviews.json"

sanitize(META_PATH, META_OUTPATH)
sanitize(REVIEWS_PATH, REVIEWS_OUTPATH)

## Data Import

In [16]:
REVIEWS_PATH = "cleaned_reviews.json"
META_PATH = "cleaned_meta.json"

In [17]:
# Read-in the reviews
reviews = pd.read_json(REVIEWS_PATH, lines=True)

reviews.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1ZQZ8RJS1XVTX,0657745316,gsxrgirl,"[0, 0]","No sugar, no GMO garbage, no fillers that come...",5,Best vanilla I've ever had,1381449600,"10 11, 2013"
1,A31W38VGZAUUM4,0700026444,FIFA Lvr,"[1, 1]","This is my absolute, undisputed favorite tea r...",5,Terrific Tea!,1354752000,"12 6, 2012"
2,A3I0AV0UJX5OH0,1403796890,Alicia b,"[0, 0]",I ordered spongbob slippers and I got John Cen...,1,grrrrrrr,1385942400,"12 2, 2013"
3,A3QAAOLIXKV383,1403796890,"Danny K. Tilley ""Dan Tilley""","[0, 0]",The cart is fine and works for the purpose for...,3,Storage on Wheels Cart,1307836800,"06 12, 2011"
4,AB1A5EGHHVA9M,141278509X,CHelmic,"[1, 1]",This product by Archer Farms is the best drink...,5,The best drink mix,1332547200,"03 24, 2012"


In [19]:
#Drop some of the columns
reviews = reviews.drop(columns=['reviewerName', 'helpful', 'reviewTime', 'summary'])

In [20]:
#Convert the utc timestamp to readable dates
reviews['unixReviewTime'] = pd.to_datetime(reviews['unixReviewTime'],unit='s')

reviews.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",5,2013-10-11
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",5,2012-12-06
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,1,2013-12-02
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,3,2011-06-12
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,5,2012-03-24


## Data exploration

In [None]:
#TODO

## Labelling data

Using the Amazon dataset, our goal is to detect any potential harmful products by analyzing the user reviews and classifying them as presenting or not a possible health threat. Unfortunately, we lack the annotated data that would be needed to build a classifier.

We will thus create our own annotated data by using **topic modelling** with the **Latent Dirichlet Allocation (LDA)** model. Our hope is that reviews of potential health-threatening products will be assigned to their own topic, topic that we would then be able to find by analyzing the words weights associated to that topic. Our labelling data pipeline is thus as follows :
* We first start by sampling our reviews dataframe to work on a smaller set of reviews (for efficiency reasons).
* We then preprocess the reviews : we remove any stopwords and stem the words with the help of **nltk** library to standardize them.
* Using the **gensim** library, we create a corpus representing all stemmed reviews in a **bag of words** representation
* Once we have our corpus, we create and train our LDA model to create our topics.
* Now that we have our topics, we explore them to find health-related ones.
* Finally, we assign each reviews to a topic by querying our model.

In [21]:
import nltk
import gensim

In [22]:
#Download stopwords and wordnet for stemming (only need to be executed once)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/fares/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/fares/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
reviews_label = reviews.sample(frac=0.1, replace=True, random_state=1)

reviews_label.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime
128037,AF8OGMNOEAQ0K,B000EMQFBM,This product was extremely hard to find locall...,5,2009-11-30
491755,A33BQJ9FBUONOI,B001H0FI22,I have the Nescafe Dolce Gusto machine in my o...,5,2011-01-25
470924,A26H17W5A8SALY,B001EQ5ERI,"This makes a delicious, full-bodied, creamy cu...",5,2013-05-03
491263,A1V9OL87W94ZS1,B001H0FHXW,I really like my coffee maker and do not reall...,5,2012-12-29
836489,A3CIN9F0B9LFFQ,B004H1SORE,I handed these to my husband and found empty c...,4,2014-01-12


Create a function to process the reviews using the nltk library :
* We tokenize the sentence,
* remove any potential stop words,
* remove tokens containing only punctuations (such as '!!!', '...', etc.. which where quite common),
* remove words below a given length,
* stem the words to have them all represented in a standardized way. 

In [25]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

stop_words = set(stopwords.words('english')) 

stemmer = PorterStemmer()

def process_text(sentence):
    token_words = nltk.word_tokenize(sentence)
    no_stopwords = [word.lower() for word in token_words if word not in stop_words and not \
                    all(c in string.punctuation for c in word) and not len(word) < 2]
    return [stemmer.stem(word) for word in no_stopwords]

print(process_text('I ordered spongbob slippers and I got John'))

['order', 'spongbob', 'slipper', 'got', 'john']


We add a new column to our dataframe containing the processed reviewText (notice that we only consider reviews with a low score, under the fair assumption that reviews exposing health issues would have a low rating).

In [27]:
reviews_label = reviews_label[reviews_label['overall'] < 3]
reviews_label['reviewStemmed'] = reviews_label['reviewText'].apply(lambda x : process_text(x))

reviews_label.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,unixReviewTime,reviewStemmed
925255,A1R9YASZP0VPXJ,B0052HX7Y2,Pho is one of my all-time favorite foods. When...,2,2012-03-01,"[pho, one, all-tim, favorit, food, when, saw, ..."
575956,A2XKJ1KX6XUHYP,B002863BIW,"I love Bragg's apple cider vinegar, the best t...",1,2014-07-12,"[love, bragg, 's, appl, cider, vinegar, best, ..."
914041,AN9N29UZ7R6J2,B0050OJ9WQ,"surprised , love Bacon , love chocalate , but ...",2,2013-12-13,"[surpris, love, bacon, love, chocal, done, lik..."
130091,A19AR53KOQSWFT,B000EONEU0,"My yogurt smelled like yogurt, but it was runn...",1,2013-02-03,"[my, yogurt, smell, like, yogurt, runni, like,..."
1046,A17Z1ZN1I68IVF,B00005C2M3,Don't waste your time with this seller. didn'...,1,2014-04-04,"[do, n't, wast, time, seller, n't, read, revie..."


Now, we create a dictionnary containing all the words found in our processed reviews, and our corpus consisting of all reviews in a bag of words representation.

In [29]:
from gensim import corpora
dictionary = corpora.Dictionary(reviews_label['reviewStemmed'])

corpus = [dictionary.doc2bow(text) for text in reviews_label['reviewStemmed'].values]

We now instantiante and train our LDA model.

In [30]:
ldamodel = gensim.models.ldamulticore.LdaMulticore(corpus, num_topics = 15, passes=15, id2word=dictionary, minimum_probability=0)

Let's now have a look at our found topics 
(...)

In [31]:
ldamodel.print_topics(num_topics=-1)

[(0,
  '0.033*"price" + 0.017*"product" + 0.014*"\'s" + 0.011*"get" + 0.011*"store" + 0.010*"per" + 0.010*"buy" + 0.009*"amazon" + 0.009*"the" + 0.008*"pay"'),
 (1,
  '0.052*"tast" + 0.033*"like" + 0.025*"n\'t" + 0.014*"flavor" + 0.013*"good" + 0.013*"one" + 0.013*"tri" + 0.012*"chocol" + 0.012*"would" + 0.011*"the"'),
 (2,
  '0.012*"gummi" + 0.012*"bear" + 0.009*"salmon" + 0.009*"34" + 0.008*"candi" + 0.008*"eat" + 0.007*"bag" + 0.007*"the" + 0.007*"licoric" + 0.006*"product"'),
 (3,
  '0.024*"order" + 0.021*"box" + 0.017*"the" + 0.014*"packag" + 0.014*"product" + 0.013*"receiv" + 0.012*"bag" + 0.012*"one" + 0.011*"n\'t" + 0.011*"item"'),
 (4,
  '0.057*"oil" + 0.049*"organ" + 0.030*"can" + 0.017*"coconut" + 0.014*"oliv" + 0.014*"dent" + 0.013*"honey" + 0.012*"soy" + 0.012*"use" + 0.011*"milk"'),
 (5,
  '0.072*"coffe" + 0.026*"cup" + 0.013*"use" + 0.013*"tast" + 0.013*"n\'t" + 0.011*"like" + 0.010*"k-cup" + 0.010*"the" + 0.010*"tri" + 0.009*"flavor"'),
 (6,
  '0.031*"flavor" + 0.017*"l

Now that we have our topics, we create a new column in our dataframe that tells us in which topic would that particular review be :

In [None]:
reviews_label['topic'] = reviews_label['reviewStemmed'].apply( \
                        lambda x: sorted(ldamodel.get_document_topics(dictionary.doc2bow(x)), \
                                key=lambda x: (x[1]), reverse=True)[0][0])
reviews_label.head() 

## What's up next