#### Import Dependencies

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
%matplotlib inline

In [3]:
# Load the review data from a CSV file in the working directory and preview the first rows.
df = pd.read_csv('./reviews.csv')
df.head()

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,amy m. teague,3,"March 18, 2004",False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,tristazbimmer,4,"August 28, 2005",False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0


In [6]:
# Column labels of the frame as a plain Python list.
df.columns.tolist()

['asin', 'name', 'rating', 'date', 'verified', 'title', 'body', 'helpfulVotes']

In [7]:
# (rows, columns) of the frame before any cleaning is applied.
df.shape

(82815, 8)

In [8]:
# Count the missing values in each column.
df.isna().sum()

asin                0
name                5
rating              0
date                0
verified            0
title              10
body               20
helpfulVotes    49681
dtype: int64

In [9]:
# helpfulVotes is missing for 49,681 of the 82,815 rows, so the column is removed entirely.
df = df.drop(columns=['helpfulVotes'])

In [10]:
# Discard every row that still has a missing value in any remaining column.
# Index labels are kept as-is, so they are no longer contiguous afterwards.
df = df.dropna()

In [11]:
# Re-check the per-column missing-value counts; every column should now report 0.
df.isna().sum()

asin        0
name        0
rating      0
date        0
verified    0
title       0
body        0
dtype: int64

In [15]:
# Display a copy of the frame without the metadata columns (only rating and body remain).
# NOTE(review): drop() returns a new frame and the result is not assigned, so `df`
# itself keeps all of its columns. Assign the result back (df = df.drop(...)) if the
# columns are really meant to be removed for the rest of the notebook.
df.drop(columns=['asin', 'name', 'date', 'verified', 'title'])

Unnamed: 0,rating,body
0,3,I had the Samsung A600 for awhile which is abs...
1,1,Due to a software issue between Nokia and Spri...
2,5,"This is a great, reliable phone. I also purcha..."
3,3,"I love the phone and all, because I really did..."
4,4,The phone has been great for every purpose it ...
...,...,...
82810,5,Best phone at this price.
82811,3,If you intend to use this phone on T Mobile be...
82812,1,Here is my Moto G7 Play complaint: It freezes ...
82813,1,As far as function works great camera no go wo...


In [16]:
# List the columns currently in df. NOTE(review): the preceding drop() call was not
# assigned back, so all of the original columns are expected to still be listed here.
df.columns

Index(['asin', 'name', 'rating', 'date', 'verified', 'title', 'body'], dtype='object')

In [17]:
# Distinct values present in the rating column.
df["rating"].unique()

array([3, 1, 5, 4, 2], dtype=int64)

In [18]:
# Number of reviews per star rating; used to judge how balanced the ratings are.
df["rating"].value_counts()

5    44991
1    16422
4    10708
3     5754
2     4905
Name: rating, dtype: int64

* The counts above show that the ratings are imbalanced: 5-star reviews far outnumber every other rating.
* To reduce this imbalance, we group the five rating values into two classes.

In [19]:
# Binarize the star rating: 4-5 stars -> 1 (positive), 1-3 stars -> 0 (negative).
# A vectorized comparison replaces the per-row Python lambda; the result is cast to
# int64 so the column dtype stays the same as before.
# NOTE: `rating` is overwritten in place, so this cell is not idempotent: running it
# again on the already-binarized column would map every row to 0.
df["rating"] = (df["rating"] >= 4).astype("int64")

In [20]:
# Class counts of the rating column after it has been collapsed to 0/1.
df["rating"].value_counts()

1    55699
0    27081
Name: rating, dtype: int64

* rating = 1 --> Positive rating (original rating of 4 or 5)
* rating = 0 --> Negative rating (original rating of 1, 2 or 3)

In [21]:
# Normalize the review text to lowercase.
# NOTE(review): the VADER analyzer used further down reportedly treats capitalization
# as an intensity cue - confirm that lowercasing before scoring is intended.
df['body'] = df['body'].str.lower()

In [22]:
import re
import string
import nltk

In [24]:
# Show the characters in string.punctuation, i.e. the set that remove_punctuation strips.
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [23]:
# Translation table that deletes every character in string.punctuation. Built once
# so it is not recreated for each review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Characters listed in ``string.punctuation`` are deleted outright (not replaced
    by a space), so words that were separated only by punctuation are joined.
    All other characters, including whitespace and non-ASCII symbols, are kept.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A copy of ``text`` without punctuation characters.
    """
    # str.translate does the deletion in a single pass instead of testing each
    # character against the punctuation string in a Python-level loop.
    return text.translate(_PUNCTUATION_TABLE)

In [25]:
# Strip punctuation from every review; the function is passed directly, no lambda wrapper needed.
df['body'] = df['body'].apply(remove_punctuation)

In [26]:
# Inspect the review text after lowercasing and punctuation removal.
df['body']

0        i had the samsung a600 for awhile which is abs...
1        due to a software issue between nokia and spri...
2        this is a great reliable phone i also purchase...
3        i love the phone and all because i really did ...
4        the phone has been great for every purpose it ...
                               ...                        
82810                             best phone at this price
82811    if you intend to use this phone on t mobile be...
82812    here is my moto g7 play complaint it freezes e...
82813    as far as function works great camera no go wo...
82814    what a great phone sleek fast great sounding b...
Name: body, Length: 82780, dtype: object

In [27]:
# Download the VADER lexicon through NLTK (presumably needed by the
# SentimentIntensityAnalyzer created further down).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\RAYA\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [28]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [29]:
# Create the VADER analyzer instance used to score the review text.
sid = SentimentIntensityAnalyzer()

In [34]:
# Sanity check: print the VADER polarity scores of the first ten reviews, in order.
reviews = df['body']
for position in range(10):
    review_text = reviews.iloc[position]
    scores = sid.polarity_scores(review_text)
    print(scores)

{'neg': 0.082, 'neu': 0.812, 'pos': 0.106, 'compound': 0.8329}
{'neg': 0.031, 'neu': 0.866, 'pos': 0.103, 'compound': 0.872}
{'neg': 0.051, 'neu': 0.822, 'pos': 0.127, 'compound': 0.886}
{'neg': 0.0, 'neu': 0.879, 'pos': 0.121, 'compound': 0.9306}
{'neg': 0.066, 'neu': 0.822, 'pos': 0.112, 'compound': 0.6542}
{'neg': 0.059, 'neu': 0.789, 'pos': 0.152, 'compound': 0.765}
{'neg': 0.038, 'neu': 0.785, 'pos': 0.177, 'compound': 0.976}
{'neg': 0.0, 'neu': 0.912, 'pos': 0.088, 'compound': 0.6705}
{'neg': 0.038, 'neu': 0.727, 'pos': 0.234, 'compound': 0.8953}
{'neg': 0.03, 'neu': 0.87, 'pos': 0.1, 'compound': 0.6486}
