LIBRARIES

In [20]:
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from textblob import TextBlob
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
%matplotlib inline
from plotly.offline import init_notebook_mode , iplot
init_notebook_mode(connected= True)
cf.go_offline();
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import warnings

# Silence every warning category for the rest of the session; the warn() call
# right below is a smoke test that must produce no output.
warnings.simplefilter("ignore")
warnings.warn("this will not show")

# Never truncate wide frames: show all columns when a DataFrame is rendered.
pd.options.display.max_columns = None







READING THE DATASET

In [21]:
# Load the reviews, order them so the highest "wilson_lower_bound" comes first,
# and discard the "Unnamed: 0" column (presumably a leftover index column from
# the CSV export -- verify against the data source).
df = (
    pd.read_csv("amazon.csv")
    .sort_values("wilson_lower_bound", ascending=False)
    .drop(columns="Unnamed: 0")
)
df.head()

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,"Hyoun Kim ""Faluzure""",5,[[ UPDATE - 6/19/2014 ]]So my lovely wife boug...,05-01-2013,702,1952,68,2020,1884,0.966337,0.957544
3449,NLee the Engineer,5,I have tested dozens of SDHC and micro-SDHC ca...,26-09-2012,803,1428,77,1505,1351,0.948837,0.936519
4212,SkincareCEO,1,NOTE: please read the last update (scroll to ...,08-05-2013,579,1568,126,1694,1442,0.92562,0.912139
317,"Amazon Customer ""Kelly""",1,"If your card gets hot enough to be painful, it...",09-02-2012,1033,422,73,495,349,0.852525,0.818577
4672,Twister,5,Sandisk announcement of the first 128GB micro ...,03-07-2014,158,45,4,49,41,0.918367,0.808109


ANALYSIS OF MISSING VALUES 

In [22]:
def missing_values_analysis(df):
    """Summarise the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one null, indexed by column name,
        with 'Missing Values' (null count) and 'Ratio' (nulls as a percentage
        of all rows, rounded to two decimals). Rows are in ascending order.
        Columns without nulls are left out.
    """
    incomplete_cols = []
    for name in df.columns:
        if df[name].isnull().sum() > 0:
            incomplete_cols.append(name)

    null_counts = df[incomplete_cols].isnull().sum()
    counts_sorted = null_counts.sort_values(ascending=True)
    percent_sorted = (null_counts / df.shape[0] * 100).sort_values(ascending=True)

    return pd.concat(
        [counts_sorted, np.round(percent_sorted, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )

def check_dataframe(df, head=5, tail=5):
    """Print a quick structural overview of ``df``.

    Printed in order: row count, column count, per-column dtypes, the table
    returned by ``missing_values_analysis(df)``, the number of fully
    duplicated rows, and the 0/5/50/95/99/100 % quantiles of the numeric
    columns (one row per column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    head, tail : int, optional
        Accepted so existing callers keep working; the body does not use them.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("SHAPE".center(82, '~'))
    print('Rows : {}'.format(df.shape[0]))
    print('Columns: {}'.format(df.shape[1]))
    print(df.dtypes)
    print("".center(82, '~'))
    print(missing_values_analysis(df))
    # Printed once (the original emitted the same number twice).
    print(df.duplicated().sum())
    # numeric_only=True keeps the text columns out of the computation; without
    # it pandas >= 2.0 raises a TypeError on object columns.
    print(df.quantile([0, 0.05, 0.50, 0.95, 0.99, 1], numeric_only=True).T)


check_dataframe(df)


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~SHAPE~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Rows : 4915
Columns: 11
reviewerName             object
overall                   int64
reviewText               object
reviewTime               object
day_diff                  int64
helpful_yes               int64
helpful_no                int64
total_vote                int64
score_pos_neg_diff        int64
score_average_rating    float64
wilson_lower_bound      float64
dtype: object
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
              Missing Values  Ratio
reviewerName               1   0.02
reviewText                 1   0.02
0
0
                       0.00  0.05   0.50        0.95       0.99         1.00
overall                 1.0   2.0    5.0    5.000000    5.00000     5.000000
day_diff                1.0  98.0  431.0  748.000000  943.00000  1064.000000
helpful_yes             0.0   0.0    0.0    1.000000    3.00000  1952.000000
helpful_no              

UNIQUE VALUES 

In [23]:
def check_class(dataframe):
    """Rank the columns of ``dataframe`` by their number of distinct values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are counted.

    Returns
    -------
    pandas.DataFrame
        Columns 'variable' (column name) and 'Classes' (``nunique()`` of that
        column), sorted by 'Classes' descending, with a fresh 0..n-1 index.
    """
    column_names = dataframe.columns
    distinct_counts = []
    for name in column_names:
        distinct_counts.append(dataframe[name].nunique())

    summary = pd.DataFrame({'variable': column_names, 'Classes': distinct_counts})
    return summary.sort_values('Classes', ascending=False).reset_index(drop=True)

# Show the distinct-value count of every column in the review frame.
check_class(df)

Unnamed: 0,variable,Classes
0,reviewText,4912
1,reviewerName,4594
2,reviewTime,690
3,day_diff,690
4,wilson_lower_bound,40
5,score_average_rating,28
6,score_pos_neg_diff,27
7,total_vote,26
8,helpful_yes,23
9,helpful_no,17


CATEGORICAL VARIABLE ANALYSIS

In [24]:
# Five hex colours used for both the bars and the pie slices drawn below.
constraints = ['#B34D22', '#EBE00C', '#1FEB0C', '#0C92EB', '#EB0CD5']


def categorical_variable_summary(df, column_name):
    """Render a count bar chart next to a share pie chart for one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to summarise.
    column_name : str
        Name of the column; also used as the figure title and trace name.

    Returns
    -------
    None
        The figure is displayed with ``iplot``.
    """
    # A single value_counts() pass feeds both traces, so the bars and the
    # slices are guaranteed to describe the same counts in the same order.
    counts = df[column_name].value_counts()
    count_values = counts.values.tolist()

    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Countplot', 'Percentage'),
        # The pie trace needs a 'domain' subplot; the bar chart a cartesian one.
        specs=[[{"type": "xy"}, {"type": 'domain'}]],
    )

    fig.add_trace(
        go.Bar(
            y=count_values,
            x=[str(i) for i in counts.index],
            text=count_values,
            textfont=dict(size=14),
            name=column_name,
            textposition='auto',
            showlegend=False,
            marker=dict(
                color=constraints,
                line=dict(color='#DBE6EC', width=1),
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Pie(
            labels=counts.keys(),
            values=counts.values,
            textfont=dict(size=18),
            textposition='auto',
            showlegend=False,
            name=column_name,
            marker=dict(colors=constraints),
        ),
        row=1,
        col=2,
    )

    fig.update_layout(
        title={
            'text': column_name,
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
        },
        template='plotly_white',
    )

    iplot(fig)


categorical_variable_summary(df, 'overall')

CLEANING OF THE DATA 
- REMOVING PUNCTUATION MARKS, DIGITS AND EVERY OTHER NON-LETTER CHARACTER
- CONVERTING EVERYTHING TO LOWERCASE SO THAT OUR ML ALGORITHM WORKS PROPERLY



In [25]:
# Bare expression in the middle of the cell: evaluated, but only the last
# expression of a cell is rendered.
df["reviewText"].head()

# Demonstrate the cleaning on one review (row label 2031): replace every
# non-letter character with a space, lowercase the text, split on whitespace.
reviwe_example = df["reviewText"][2031]
reviwe_example = re.sub("[^a-zA-Z]", ' ', reviwe_example).lower().split()
reviwe_example


['update',
 'so',
 'my',
 'lovely',
 'wife',
 'bought',
 'me',
 'a',
 'samsung',
 'galaxy',
 'tab',
 'for',
 'father',
 's',
 'day',
 'and',
 'i',
 've',
 'been',
 'loving',
 'it',
 'ever',
 'since',
 'just',
 'as',
 'other',
 'with',
 'samsung',
 'products',
 'the',
 'galaxy',
 'tab',
 'has',
 'the',
 'ability',
 'to',
 'add',
 'a',
 'microsd',
 'card',
 'to',
 'expand',
 'the',
 'memory',
 'on',
 'the',
 'device',
 'since',
 'it',
 's',
 'been',
 'over',
 'a',
 'year',
 'i',
 'decided',
 'to',
 'do',
 'some',
 'more',
 'research',
 'to',
 'see',
 'if',
 'sandisk',
 'offered',
 'anything',
 'new',
 'as',
 'of',
 'their',
 'product',
 'lineup',
 'for',
 'microsd',
 'cards',
 'from',
 'worst',
 'to',
 'best',
 'performance',
 'wise',
 'are',
 'the',
 'as',
 'follows',
 'sandisksandisk',
 'ultrasandisk',
 'ultra',
 'plussandisk',
 'extremesandisk',
 'extreme',
 'plussandisk',
 'extreme',
 'pronow',
 'the',
 'difference',
 'between',
 'all',
 'of',
 'these',
 'cards',
 'are',
 'simply',
 

In [26]:
def rt(x):
    """Replace every non-letter character of ``x`` with a single space.

    Missing values (None / NaN) become an empty string. Passing them through
    ``str()`` would instead produce the literal text "nan", which would then
    be analysed as if it were a review.
    """
    if pd.isna(x):
        return ""
    return re.sub("[^a-zA-Z]", ' ', str(x))


# Strip non-letters, then lowercase the whole column in one pass.
df["reviewText"] = df["reviewText"].map(rt).str.lower()
df.head()

Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
2031,"Hyoun Kim ""Faluzure""",5,update so my lovely wife boug...,05-01-2013,702,1952,68,2020,1884,0.966337,0.957544
3449,NLee the Engineer,5,i have tested dozens of sdhc and micro sdhc ca...,26-09-2012,803,1428,77,1505,1351,0.948837,0.936519
4212,SkincareCEO,1,note please read the last update scroll to ...,08-05-2013,579,1568,126,1694,1442,0.92562,0.912139
317,"Amazon Customer ""Kelly""",1,if your card gets hot enough to be painful it...,09-02-2012,1033,422,73,495,349,0.852525,0.818577
4672,Twister,5,sandisk announcement of the first gb micro ...,03-07-2014,158,45,4,49,41,0.918367,0.808109


SENTIMENT ANALYSIS


In [27]:
# NOTE: this rebinds SentimentIntensityAnalyzer, which the first cell already
# imported from nltk.sentiment.vader; from here on the vaderSentiment class is used.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def _label_sentiment(scores):
    """Turn a VADER score dict into 'Negative', 'Positive' or 'Neutral'.

    The label follows whichever of ``scores['neg']`` and ``scores['pos']`` is
    larger; equal values give 'Neutral'.
    """
    if scores['neg'] > scores['pos']:
        return "Negative"
    if scores['pos'] > scores['neg']:
        return "Positive"
    return "Neutral"


# TextBlob scores: evaluate each review once and fill both columns from the
# result, instead of building a pandas Series per row.
textblob_scores = [TextBlob(text).sentiment for text in df['reviewText']]
df['polarity'] = [s.polarity for s in textblob_scores]
df['subjectivity'] = [s.subjectivity for s in textblob_scores]

# Build the analyzer once rather than once per review, and assign the labels
# as a whole column. Series.iteritems() is avoided because pandas 2.0 removed it.
analyzer = SentimentIntensityAnalyzer()
df['sentiment'] = [
    _label_sentiment(analyzer.polarity_scores(text))
    for text in df['reviewText']
]

In [31]:
# Only the last expression of a cell is rendered automatically, so the table of
# top positive reviews is shown explicitly with display(), which the IPython
# kernel makes available without an import.
display(
    df[df["sentiment"] == "Positive"]
    .sort_values("wilson_lower_bound", ascending=False)
    .head(5)
)
categorical_variable_summary(df, 'sentiment')