# Labeling the amazon reviews with Afinn sentiment analysis.

(...)

- Sentiment score from all of the words in Amazon reviews.
- Sentiment score from only the most frequent words in Amazon reviews.

### Sentiment score from all of the words in Amazon reviews.

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import gensim
import collections
from afinn import Afinn
%matplotlib inline

In [2]:
#Download stopwords and wordnet for lemmatization (only need to be executed once)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tajimakeijiro/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tajimakeijiro/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Use a json file made in the notebook "Milestone2.ipynb".
REVIEWS_PATH = "cleaned_reviews.json"

Start by importing the data :

In [4]:
reviews = pd.read_json(REVIEWS_PATH, lines=True)

#TBD: Which columns to keep/remove
reviews = reviews.drop(columns=['reviewerName', 'helpful', 'reviewTime'])

#Convert the utc timestamp to readable dates
reviews['unixReviewTime'] = pd.to_datetime(reviews['unixReviewTime'],unit='s')

reviews.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime
0,A1ZQZ8RJS1XVTX,0657745316,"No sugar, no GMO garbage, no fillers that come...",5,Best vanilla I've ever had,2013-10-11
1,A31W38VGZAUUM4,0700026444,"This is my absolute, undisputed favorite tea r...",5,Terrific Tea!,2012-12-06
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,1,grrrrrrr,2013-12-02
3,A3QAAOLIXKV383,1403796890,The cart is fine and works for the purpose for...,3,Storage on Wheels Cart,2011-06-12
4,AB1A5EGHHVA9M,141278509X,This product by Archer Farms is the best drink...,5,The best drink mix,2012-03-24


Create a function to process the reviews using the nltk library :
* We tokenize the sentence,
* remove any potential stop words,
* remove tokens containing only punctuations (such as '!!!', '...', etc.. which where quite common),
* remove words below a given length,
* stem the words to have them all represented in a standardized way. 

In [5]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

stop_words = set(stopwords.words('english')) 

# Here we don't need stemming, because Afinn has same score for the before stemmed ones.
# stemmer = PorterStemmer()

def process_text(sentence):
    token_words = nltk.word_tokenize(sentence)
    no_stopwords = [word.lower() for word in token_words if word not in stop_words and not \
                    all(c in string.punctuation for c in word) and not len(word) < 2]
    # return [stemmer.stem(word) for word in no_stopwords]
    return [(word) for word in no_stopwords]

print(process_text('I ordered spongbob slippers and I got John'))

['ordered', 'spongbob', 'slippers', 'got', 'john']


We add a new column to our dataframe containing the processed reviewText (notice that we only keep reviews with a low score, under the fair assumption that reviews exposing health issues would have a low rating).

In [6]:
stemmed = reviews.copy()
stemmed = stemmed[stemmed['overall'] < 3]
stemmed['reviewStemmed'] = stemmed['reviewText'].apply(lambda x : process_text(x))

stemmed.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,reviewStemmed
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,1,grrrrrrr,2013-12-02,"[ordered, spongbob, slippers, got, john, cena,..."
5,A3DTB6RVENLQ9Q,1453060375,Don't buy this item - rip off at this price. ...,1,Oops. Made a mistake and ordered this. I mis...,2013-03-03,"[do, n't, buy, item, rip, price, my, bad, mist..."
46,A3KJ9TZ2HLL7SA,5901002482,I wrote an earlier scathing review of this pro...,1,Packaging problem,2012-11-28,"[wrote, earlier, scathing, review, product, wh..."
48,ACEL2LY99MAB0,6162362183,I read the reviews before I bought it. It got ...,2,Very disappointed.,2014-04-21,"[read, reviews, bought, it, got, excited, revi..."
61,A2F3CK8F9VIFPL,616719923X,I bought it because i like green tea but the t...,1,Yuck,2013-07-29,"[bought, like, green, tea, taste, bad, came, m..."


Here, we simply store the dataframe in a pickle for later usage.

In [7]:
# stemmed.to_pickle("reviews_tokened_tenth")

In [8]:
afinn = Afinn()

# This cell takes long
stemmed['afinnWords'] = stemmed['reviewText'].apply(lambda x : afinn.find_all(x))
stemmed['afinnScores'] = stemmed['reviewText'].apply(lambda x : afinn.scores_with_pattern(x))
stemmed['afinnTotalScore'] = stemmed['reviewText'].apply(lambda x : afinn.score_with_pattern(x))
stemmed['afinnWordsLen'] = stemmed['afinnWords'].apply(lambda x : len(x))

In [9]:
stemmed.head()

Unnamed: 0,reviewerID,asin,reviewText,overall,summary,unixReviewTime,reviewStemmed,afinnWords,afinnScores,afinnTotalScore,afinnWordsLen
2,A3I0AV0UJX5OH0,1403796890,I ordered spongbob slippers and I got John Cen...,1,grrrrrrr,2013-12-02,"[ordered, spongbob, slippers, got, john, cena,...",[happy],[3],3.0,1
5,A3DTB6RVENLQ9Q,1453060375,Don't buy this item - rip off at this price. ...,1,Oops. Made a mistake and ordered this. I mis...,2013-03-03,"[do, n't, buy, item, rip, price, my, bad, mist...","[bad, mistake, pay, pay]","[-3, -2, -1, -1]",-7.0,4
46,A3KJ9TZ2HLL7SA,5901002482,I wrote an earlier scathing review of this pro...,1,Packaging problem,2012-11-28,"[wrote, earlier, scathing, review, product, wh...","[harsh, apologize, disappointed, protect, hope...","[-2, -1, -2, 1, 2, -2]",-4.0,6
48,ACEL2LY99MAB0,6162362183,I read the reviews before I bought it. It got ...,2,Very disappointed.,2014-04-21,"[read, reviews, bought, it, got, excited, revi...","[excited, good, destroyed, mad, disappointed, ...","[3, 3, -3, -3, -2, 3]",1.0,6
61,A2F3CK8F9VIFPL,616719923X,I bought it because i like green tea but the t...,1,Yuck,2013-07-29,"[bought, like, green, tea, taste, bad, came, m...","[like, bad]","[2, -3]",-1.0,2


In [10]:
stemmed.groupby(by=['afinnWordsLen']).describe()

Unnamed: 0_level_0,overall,overall,overall,overall,overall,overall,overall,overall,afinnTotalScore,afinnTotalScore,afinnTotalScore,afinnTotalScore,afinnTotalScore,afinnTotalScore,afinnTotalScore,afinnTotalScore
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
afinnWordsLen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,7287.0,1.346370,0.475846,1.0,1.0,1.0,2.0,2.0,7287.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
1,19320.0,1.381832,0.485848,1.0,1.0,1.0,2.0,2.0,19320.0,0.389337,2.176076,-4.0,-2.0,1.0,2.0,5.0
2,26116.0,1.384515,0.486490,1.0,1.0,1.0,2.0,2.0,26116.0,0.798399,3.138037,-8.0,-1.0,1.0,4.0,9.0
3,25653.0,1.381008,0.485644,1.0,1.0,1.0,2.0,2.0,25653.0,1.143219,3.932941,-10.0,-2.0,1.0,4.0,12.0
4,20578.0,1.375498,0.484263,1.0,1.0,1.0,2.0,2.0,20578.0,1.550296,4.658668,-13.0,-2.0,2.0,5.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,1.0,2.000000,,2.0,2.0,2.0,2.0,2.0,1.0,-1.000000,,-1.0,-1.0,-1.0,-1.0,-1.0
142,2.0,1.000000,0.000000,1.0,1.0,1.0,1.0,1.0,2.0,129.000000,89.095454,66.0,97.5,129.0,160.5,192.0
180,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0,1.0,-7.000000,,-7.0,-7.0,-7.0,-7.0,-7.0
202,1.0,1.000000,,1.0,1.0,1.0,1.0,1.0,1.0,52.000000,,52.0,52.0,52.0,52.0,52.0


### Sentiment score from only the most frequent words in Amazon reviews.

In [11]:
def flatten(nested_list):
    return [e for inner_list in nested_list for e in inner_list]

In [13]:
l_stemmed = stemmed['reviewStemmed'].values.tolist()
l_stemmed = flatten(l_stemmed)

In [16]:
frequentWords = collections.Counter(l_stemmed).most_common()[0:50000]
frequentWords = [word for word, count in frequentWords]

In [None]:
stemmed_frequent = stemmed.copy()
stemmed_frequent['afinnWords'] = stemmed['afinnWords'].apply(lambda x : [i for i in x if i in frequentWords])
stemmed_frequent['afinnScores'] = stemmed_frequent['reviewText'].apply(lambda x : afinn.scores_with_pattern(x))
stemmed_frequent['afinnTotalScore'] = stemmed_frequent['reviewText'].apply(lambda x : afinn.score_with_pattern(x))
stemmed_frequent['afinnWordsLen'] = stemmed_frequent['afinnWords'].apply(lambda x : len(x))

In [None]:
stemmed.head()

In [None]:
stemmed.groupby(by=['afinnWordsLen']).describe()