# Use VADER to analyze Reviews
https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to C:\Users\A Girl's
[nltk_data]     Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
df_vader = pd.read_csv("Resources/helpful_clean_reviews_combined.csv")
df_vader.head()

Unnamed: 0,key,stars,helpful_yes,helpful_no,text,rating
0,0_breyers,1,11,0,I am interested in the flavoring components us...,4.1
1,0_breyers,1,7,0,"Boy, was I surprised when I got my Bryers home...",4.1
2,0_breyers,1,8,0,I havent purchased this product in awhile and ...,4.1
3,0_breyers,1,4,0,The Natural Vanilla recipe change to include T...,4.1
4,0_breyers,5,21,2,I had the same issue with breyers. I finally f...,4.1


In [3]:
df_vader = df_vader.drop(["helpful_yes","helpful_no","rating"], axis=1)
df_vader = df_vader.reindex(["key","text","stars"], axis=1)
df_vader.head()

Unnamed: 0,key,text,stars
0,0_breyers,I am interested in the flavoring components us...,1
1,0_breyers,"Boy, was I surprised when I got my Bryers home...",1
2,0_breyers,I havent purchased this product in awhile and ...,1
3,0_breyers,The Natural Vanilla recipe change to include T...,1
4,0_breyers,I had the same issue with breyers. I finally f...,5


In [4]:
# assign 'pos' for positive sentiment, 'neg' for negative
# if stars 4 or higher, sentiment is positive

def applyFunc(s):
    if s >= 4:
        return 'pos'
    else:
        return 'neg'

# populate column        
df_vader['sentiment'] = df_vader['stars'].apply(applyFunc)
df_vader.head()

Unnamed: 0,key,text,stars,sentiment
0,0_breyers,I am interested in the flavoring components us...,1,neg
1,0_breyers,"Boy, was I surprised when I got my Bryers home...",1,neg
2,0_breyers,I havent purchased this product in awhile and ...,1,neg
3,0_breyers,The Natural Vanilla recipe change to include T...,1,neg
4,0_breyers,I had the same issue with breyers. I finally f...,5,pos


In [5]:
# create vader_sentiment_score column to hold polarity_score dictionaries
df_vader['vader_sentiment_score'] = df_vader['text'].apply(lambda review: sid.polarity_scores(review))

df_vader.head()

Unnamed: 0,key,text,stars,sentiment,vader_sentiment_score
0,0_breyers,I am interested in the flavoring components us...,1,neg,"{'neg': 0.042, 'neu': 0.823, 'pos': 0.135, 'co..."
1,0_breyers,"Boy, was I surprised when I got my Bryers home...",1,neg,"{'neg': 0.0, 'neu': 0.957, 'pos': 0.043, 'comp..."
2,0_breyers,I havent purchased this product in awhile and ...,1,neg,"{'neg': 0.128, 'neu': 0.757, 'pos': 0.115, 'co..."
3,0_breyers,The Natural Vanilla recipe change to include T...,1,neg,"{'neg': 0.027, 'neu': 0.844, 'pos': 0.129, 'co..."
4,0_breyers,I had the same issue with breyers. I finally f...,5,pos,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp..."


In [6]:
# create compound column to hold total score
df_vader['compound']  = df_vader['vader_sentiment_score'].apply(lambda score_dict: score_dict['compound'])
df_vader.head()

Unnamed: 0,key,text,stars,sentiment,vader_sentiment_score,compound
0,0_breyers,I am interested in the flavoring components us...,1,neg,"{'neg': 0.042, 'neu': 0.823, 'pos': 0.135, 'co...",0.9804
1,0_breyers,"Boy, was I surprised when I got my Bryers home...",1,neg,"{'neg': 0.0, 'neu': 0.957, 'pos': 0.043, 'comp...",0.2263
2,0_breyers,I havent purchased this product in awhile and ...,1,neg,"{'neg': 0.128, 'neu': 0.757, 'pos': 0.115, 'co...",-0.1553
3,0_breyers,The Natural Vanilla recipe change to include T...,1,neg,"{'neg': 0.027, 'neu': 0.844, 'pos': 0.129, 'co...",0.7776
4,0_breyers,I had the same issue with breyers. I finally f...,5,pos,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp...",0.6124


In [7]:
# Determine if overall sentiment is positive or negative
# if compound value >0, then 'pos', otherwise its 'neg'
df_vader['comp_score'] = df_vader['compound'].apply(lambda c: 'pos' if c >0 else 'neg')
df_vader.head()

Unnamed: 0,key,text,stars,sentiment,vader_sentiment_score,compound,comp_score
0,0_breyers,I am interested in the flavoring components us...,1,neg,"{'neg': 0.042, 'neu': 0.823, 'pos': 0.135, 'co...",0.9804,pos
1,0_breyers,"Boy, was I surprised when I got my Bryers home...",1,neg,"{'neg': 0.0, 'neu': 0.957, 'pos': 0.043, 'comp...",0.2263,pos
2,0_breyers,I havent purchased this product in awhile and ...,1,neg,"{'neg': 0.128, 'neu': 0.757, 'pos': 0.115, 'co...",-0.1553,neg
3,0_breyers,The Natural Vanilla recipe change to include T...,1,neg,"{'neg': 0.027, 'neu': 0.844, 'pos': 0.129, 'co...",0.7776,pos
4,0_breyers,I had the same issue with breyers. I finally f...,5,pos,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp...",0.6124,pos


In [8]:
# View VADER sentiment ranking

vader_ranking = pd.DataFrame(df_vader, columns=['key','stars','sentiment','compound','comp_score'])
vader_review_rank = vader_ranking.sort_values('comp_score', ascending=False)

print(str('Top 20 Vader Sentiment Scores'))
print(str('Ranked by Compound Value'))
display(vader_review_rank[:20])

Top 20 Vader Sentiment Scores
Ranked by Compound Value


Unnamed: 0,key,stars,sentiment,compound,comp_score
0,0_breyers,1,neg,0.9804,pos
2205,4_bj,5,pos,0.5562,pos
2194,39_talenti,2,neg,0.6884,pos
2195,39_talenti,5,pos,0.9811,pos
2197,39_talenti,1,neg,0.6885,pos
2198,39_talenti,5,pos,0.8934,pos
2199,39_talenti,5,pos,0.9865,pos
2200,39_talenti,5,pos,0.964,pos
2201,39_talenti,5,pos,0.5093,pos
2202,39_talenti,5,pos,0.6369,pos


In [9]:
print(str('Bottom 20 Vader Sentiment Scores'))
print(str('Ranked by Compound Value'))
display(vader_review_rank[-20:])

Bottom 20 Vader Sentiment Scores
Ranked by Compound Value


Unnamed: 0,key,stars,sentiment,compound,comp_score
3056,6_talenti,5,pos,-0.2732,neg
1373,29_bj,5,pos,0.0,neg
2220,4_bj,1,neg,-0.7261,neg
2591,46_bj,5,pos,-0.1422,neg
2592,46_bj,5,pos,-0.6578,neg
720,18_breyers,1,neg,0.0,neg
3049,6_talenti,1,neg,-0.5267,neg
3048,6_talenti,1,neg,-0.7101,neg
735,18_talenti,5,pos,0.0,neg
3045,6_talenti,1,neg,-0.128,neg


In [10]:
# Evaluate how well VADER performed with predicting the same sentiment as our original formula
# this is a way to test how reliable our formula is

# Create predicted column to hold 1 if predicted correctly, and 0 if otherwise
df_vader['predicted'] =""


for s in range(len(df_vader)):
    if (df_vader.sentiment[s] == 'pos') & (df_vader.comp_score[s] == 'pos'):
        df_vader['predicted'][s] = 1

    elif (df_vader.sentiment[s] == 'neg') & (df_vader.comp_score[s] == 'neg'):
        df_vader['predicted'][s] = 1

    else:
        df_vader['predicted'][s] = 0
         
df_vader.head()

Unnamed: 0,key,text,stars,sentiment,vader_sentiment_score,compound,comp_score,predicted
0,0_breyers,I am interested in the flavoring components us...,1,neg,"{'neg': 0.042, 'neu': 0.823, 'pos': 0.135, 'co...",0.9804,pos,0
1,0_breyers,"Boy, was I surprised when I got my Bryers home...",1,neg,"{'neg': 0.0, 'neu': 0.957, 'pos': 0.043, 'comp...",0.2263,pos,0
2,0_breyers,I havent purchased this product in awhile and ...,1,neg,"{'neg': 0.128, 'neu': 0.757, 'pos': 0.115, 'co...",-0.1553,neg,1
3,0_breyers,The Natural Vanilla recipe change to include T...,1,neg,"{'neg': 0.027, 'neu': 0.844, 'pos': 0.129, 'co...",0.7776,pos,0
4,0_breyers,I had the same issue with breyers. I finally f...,5,pos,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp...",0.6124,pos,1


In [11]:
# create VADER_sentiment_score
df_vader.to_csv("Resources/VADER_sentiment_score.csv", index=False)

In [12]:
# calculate overall prediction accuracy
pred_rate_all = df_vader['predicted'].sum() / df_vader['predicted'].count()
pred_rate_all

0.8221378504672897

In [13]:
# Create negative sentiment dataframe
df_neg_sent = df_vader[df_vader['sentiment'] =='neg']

# calculate negative rate prediction accuracy
pred_rate_neg = df_neg_sent['predicted'].sum() / df_neg_sent['predicted'].count()
pred_rate_neg

0.37372262773722625

In [14]:
# Create positive sentiment dataframe

df_pos_sent = df_vader[df_vader['sentiment'] =='pos']

# calculate positive rate prediction accuracy
pred_rate_pos = df_pos_sent['predicted'].sum() / df_pos_sent['predicted'].count()
pred_rate_pos

0.9342825848849945

In [15]:
print ('VADER Sentiment Intensity Analyzer Summary')
print ('------------------------------------------')
print ('VADER Predicted the sentiment for ',round(pred_rate_all*100),'% of all reviews combined correctly.')
print ('VADER Predicted the sentiment for ',round(pred_rate_neg*100),'% of negavive reviews correctly.')
print ('VADER Predicted the sentiment for ',round(pred_rate_pos*100),'% of positive reviews correctly.')

VADER Sentiment Intensity Analyzer Summary
------------------------------------------
VADER Predicted the sentiment for  82 % of all reviews combined correctly.
VADER Predicted the sentiment for  37 % of negavive reviews correctly.
VADER Predicted the sentiment for  93 % of positive reviews correctly.


Positive reviews are what we want as we hope the reviews will aide in identifying a new specialty flavor.  The VADER Sentiment Analyzer agrees with our formula for positive sentiment 93% of the time. That is GREAT!

In theory, we should be able to calculate the average "compound value" for each key, followed by ranking the top 10 to see which products achieved the highest sentiment based on text alone. 

So lets see....

In [16]:
# create df df_pos_sent_clean to focus only on relevant columns
df_pos_sent_clean = pd.DataFrame(df_pos_sent, columns=['key','compound','predicted'])

# filter df_pos_sent_clean down to the 93% positive prediction agreement
df_pos_sent_clean = df_pos_sent_clean[df_pos_sent_clean['predicted'] ==1]
df_pos_sent_clean.head(3)

Unnamed: 0,key,compound,predicted
4,0_breyers,0.6124,1
56,0_breyers,0.9502,1
68,0_breyers,0.4497,1


In [17]:
# Calculate the average compound rating and reveal top 10 
df_key_avg = df_pos_sent_clean.groupby('key').mean()

df_key_avg

Unnamed: 0_level_0,compound
key,Unnamed: 1_level_1
0_breyers,0.670767
0_hd,0.760012
0_talenti,0.862295
10_bj,0.754062
10_breyers,0.896914
...,...
8_bj,0.852022
8_hd,0.766850
8_talenti,0.801650
9_bj,0.822423


In [18]:
# create products table with key and ice cream name
products = pd.read_csv("Resources/products.csv")
products = products.drop(["brand","subhead","description","rating","rating_count","ingredients"], axis=1)
products.head()

Unnamed: 0,key,name
0,0_bj,Salted Caramel Core
1,1_bj,Netflix & Chilll'd™
2,2_bj,Chip Happens
3,3_bj,Cannoli
4,4_bj,Gimme S’more!™


In [19]:
# Merge products and key_avg dataframes to get ice cream name for each key
df_key_avg = pd.merge(products, 
                      df_key_avg, 
                      on ='key', 
                      how ='inner')
df_key_avg

Unnamed: 0,key,name,compound
0,1_bj,Netflix & Chilll'd™,0.821515
1,2_bj,Chip Happens,0.884748
2,4_bj,Gimme S’more!™,0.830148
3,5_bj,Peanut Butter Half Baked®,0.860443
4,6_bj,Berry Sweet Mascarpone,0.792571
...,...,...,...
178,60_breyers,Chocolate Peanut Butter,0.851583
179,61_breyers,Natural Vanilla Snack Cups 10ct,0.887400
180,63_breyers,Chocolate Snack Cups 10ct,0.855750
181,64_breyers,CINNABON®,0.693760


In [20]:
# View Top 20 Ice Cream Flavors

df_key_avg = df_key_avg.sort_values('compound', ascending=False)

print(str('        Top 20 Ice Cream Flavors'))
print(str('        Ranked by Average Compound Value'))
display(df_key_avg[:20])

        Top 20 Ice Cream Flavors
        Ranked by Average Compound Value


Unnamed: 0,key,name,compound
146,5_breyers,Mint Chocolate Chip,0.9505
154,17_breyers,Vanilla Caramel,0.9428
150,11_breyers,French Vanilla,0.94168
92,56_hd,Strawberry Waffle Cone HEAVEN Light Ice Cream,0.940275
160,25_breyers,No Sugar Added Vanilla Chocolate Strawberry,0.9365
172,50_breyers,Non-Dairy OREO® Cookies & Cream,0.935789
164,31_breyers,SNICKERS®,0.9346
62,18_hd,Coconut Caramel Non-Dairy,0.930483
67,24_hd,Coffee Vanilla Chocolate TRIO CRISPY LAYERS,0.92912
116,15_talenti,COLD BREW COFFEE SORBETTO,0.928032


In [21]:
# View Bottom 20 Ice Cream Flavors
print(str('        Bottom 20 Ice Cream Flavors'))
print(str('        Ranked by Average Compound Value'))
display(df_key_avg[-20:])

        Bottom 20 Ice Cream Flavors
        Ranked by Average Compound Value


Unnamed: 0,key,name,compound
139,41_talenti,STRAWBERRY HIBISCUS SORBETTO,0.71881
69,26_hd,Cookies and Cream Ice Cream,0.71775
101,68_hd,Whiskey Hazelnut Latte Ice Cream,0.71638
98,64_hd,Vanilla Chocolate Chip Ice Cream,0.713725
165,33_breyers,CarbSmart™ Almond Bar,0.7086
142,1_breyers,Homemade Vanilla,0.706433
40,48_bj,Vanilla Caramel Fudge,0.7051
181,64_breyers,CINNABON®,0.69376
141,0_breyers,Natural Vanilla,0.670767
157,21_breyers,Vanilla Fudge Twirl,0.661625
