# AMAZON SENTIMENT ANALYSIS

# Installing and Importing The Libraries

In [3]:
# Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')


import re
from textblob import TextBlob

from wordcloud import WordCloud
import cufflinks as cf

%matplotlib inline
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import warnings
warnings.filterwarnings("ignore")
warnings.warn('this will not show')

from tqdm import tqdm

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mosta\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!

Blowfish has been deprecated



In [4]:
# installing textblob
!pip install textblob



In [None]:
# installing wordcloud
!pip install wordcloud

In [None]:
# installing cufflinks
!pip install cufflinks

# Reading the file

In [None]:
# Load the raw review data.
# NOTE(review): this is an absolute, machine-specific path; adjust it to where
# amazon.csv lives on your machine before running.
AMAZON_CSV = r'E:\The Journey\Career\Data Science\_Portfolio Projects\Python Projects\Sentiment-Analysis-Amazon\amazon.csv'
df = pd.read_csv(AMAZON_CSV)
df.head()

In [None]:
# modifying the df format

# Order reviews by their Wilson lower bound (highest first) and drop the
# leftover 'Unnamed: 0' column from the CSV.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone, and the previous in-place drop raised a KeyError.
df = (
    df.sort_values('wilson_lower_bound', ascending=False)
      .drop(columns='Unnamed: 0', errors='ignore')
)
df.head()

# Exploring and Preparing The Dataset

In [None]:
# Defining null values analysis function

def missing_values_analysis(df):
    """Summarise the columns of ``df`` that contain at least one null.

    Returns a DataFrame indexed by column name with two columns:
    'Missing Values' (null count) and 'Ratio' (percentage of rows that are
    null, rounded to 2 decimals). Columns without nulls are omitted.
    """
    # Collect the columns that have any null value, in their original order.
    cols_with_nulls = []
    for name in df.columns:
        if df[name].isnull().sum() > 0:
            cols_with_nulls.append(name)

    null_counts = df[cols_with_nulls].isnull().sum()
    counts_sorted = null_counts.sort_values(ascending=True)
    percent_sorted = (null_counts / df.shape[0] * 100).sort_values(ascending=True)

    return pd.concat(
        [counts_sorted, np.round(percent_sorted, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )

# Defining Overview function

def dataframe_overview(df, head=5, tail=5):
    """Print a plain-text overview of ``df`` to stdout.

    Sections printed, in order: shape, column dtypes, the missing-values
    table from ``missing_values_analysis``, the number of fully duplicated
    rows, and selected quantiles of the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    head, tail : int
        Accepted for backward compatibility; the body does not use them.

    Returns
    -------
    None
    """
    print('SHAPE'.center(82,'~'))
    print('Row: {}'.format(df.shape[0]))
    print('Column: {}'.format(df.shape[1]))
    print('TYPES'.center(82,'~'))
    print(df.dtypes)
    # Unlabelled separator line above the missing-values table.
    print(''.center(82,'~'))
    print(missing_values_analysis(df))
    print('DUPLICATION'.center(83,'~'))
    print(df.duplicated().sum())
    print('QUANTILES'.center(82,'~'))
    # numeric_only=True: since pandas 2.0 the default is False, and quantile()
    # raises a TypeError when the frame also holds text columns.
    print(df.quantile([0,0.05, 0.5, 0.95, 0.99, 1], numeric_only=True).T)
    
# Print the overview for the review dataset
dataframe_overview(df)
    
    
    

In [None]:
# Check uniqueness

def check_uniqueness(df):
    """Return a table of distinct-value counts per column.

    The result has the columns 'Column' and 'Nunique', is ordered from the
    most to the fewest distinct values, and carries a fresh 0..n-1 index.
    """
    distinct_counts = []
    for name in df.columns:
        distinct_counts.append(df[name].nunique())

    summary = pd.DataFrame({'Column': df.columns, 'Nunique': distinct_counts})
    return summary.sort_values('Nunique', ascending=False).reset_index(drop=True)

# Distinct-value counts for every column of the review dataset
check_uniqueness(df)

# Categorical Data Overview

In [None]:
# My Color Palette

colors = ['#FF5733', '#33FF57', '#5733FF', '#FF33A1', '#33A1FF']

# Function Definition

def categorical_variable_summary(df, column_name):
    """Show a two-panel Plotly figure summarising one categorical column.

    The left panel is a bar chart of the value counts of ``df[column_name]``;
    the right panel is a pie chart of the same counts. Both panels take their
    colours from the module-level ``colors`` palette. The figure is displayed
    with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column_name``.
    column_name : str
        Column to summarise; also used as the figure title.
    """
    # Count once and reuse the result for both traces (it was previously
    # recomputed for every argument).
    counts = df[column_name].value_counts()
    labels = [str(i) for i in counts.index]
    values = counts.values.tolist()

    fig = make_subplots(rows = 1,
                        cols = 2,
                        subplot_titles = ('Countplot','Percentages'),
                        specs = [[{'type':'xy'},{'type':'domain'}]])

    # Bar plot definition
    fig.add_trace(go.Bar(x = labels,
                         y = values,
                         text = values,
                         textfont = dict(size = 15),
                         textposition = 'auto',
                         showlegend = False,
                         marker = dict(color = colors,
                                       line = dict(color = '#DBE6EC',
                                                   width = 1))),
                  row = 1, col = 1)

    # Pie plot definition
    fig.add_trace(go.Pie(labels = labels,
                         values = counts.values,
                         textfont = dict(size = 20),
                         textposition = 'auto',
                         showlegend = False,
                         marker = dict(colors = colors)),
                  row = 1, col = 2)

    fig.update_layout(title={'text':column_name,
                             'y' : 0.9,
                             'x' : 0.5,
                             'xanchor':'center',
                             'yanchor':'top'},
                      template = 'plotly_white')
    fig.show()
                       

# Overview on Overall Score

In [None]:
# Bar and pie chart of the 'overall' column
categorical_variable_summary(df,'overall')

# Cleaning The Underlying Text for Analysis

In [None]:
# Peek at the first few review texts

df['reviewText'].head()

In [None]:
# Example of a review: the text stored under index label 2031

review_example = df.loc[2031, 'reviewText']
review_example

In [None]:
# Example: replace every character that is not an ASCII letter with a space

non_letter = re.compile('[^a-zA-Z]')
review_example = non_letter.sub(' ', review_example)
review_example

In [None]:
# Lower-case the example and split it on whitespace into a list of words
review_example = review_example.lower().split()

In [None]:
# Display the current value of the example
review_example

In [None]:
# Cleaning the actual review text

def rt(x):
    """Return ``str(x)`` with every non-ASCII-letter character replaced by a space."""
    return re.sub('[^a-zA-Z]', ' ', str(x))

# Strip non-letters first, then lower-case the whole column.
df['reviewText'] = df['reviewText'].map(rt).str.lower()
df.head()

# Sentiment Analysis

In [None]:
# General Sentiment Analysis with TextBlob
# The sentiment result is wrapped in a Series so apply() expands it into the
# two target columns.
df[['polarity','subjectivity']] = df['reviewText'].apply(lambda Text: pd.Series(TextBlob(Text).sentiment))

# Detailed Sentiment Analysis with SentimentIntensityAnalyzer
# Build the analyzer once instead of once per review.
sia = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Label ``text`` by comparing its VADER 'neg' and 'pos' scores.

    Returns 'Negative' when neg > pos, 'Positive' when pos > neg, and
    'Neutral' when the two scores are equal.
    """
    score = sia.polarity_scores(text)  # dict with 'neg', 'neu', 'pos', ...
    if score['neg'] > score['pos']:
        return 'Negative'
    if score['pos'] > score['neg']:
        return 'Positive'
    return 'Neutral'


# Iterate the column directly: Series.iteritems() was removed in pandas 2.0.
# Assigning the labels as one column (in row order) replaces the per-row
# df.loc writes.
df['sentiment'] = [
    _vader_label(text)
    for text in tqdm(df['reviewText'], total = len(df))
]

In [None]:
# Top five positive reviews, ordered by Wilson lower bound (highest first)

positive_reviews = df.loc[df['sentiment'] == 'Positive']
positive_reviews.sort_values('wilson_lower_bound', ascending = False).head(5)

## Sentiment Analysis Plotting

In [None]:
# Bar and pie chart of the 'sentiment' column
categorical_variable_summary(df,'sentiment')

## The Sentiment and The Overall Score 

In [None]:
# overall_vs_sentiment Series: review counts per (sentiment, overall) pair

overall_by_sentiment = df.groupby('sentiment')['overall']
overall_vs_sentiment = (
    overall_by_sentiment
    .value_counts()
    .sort_index(level=['sentiment', 'overall'])
)
overall_vs_sentiment

In [None]:
# overall_vs_sentiment Plot

# My Color Palette

colors = ['#FF5733', '#33FF57', '#5733FF', '#FF33A1', '#33A1FF']

# One panel per sentiment class, left to right.
sentiment_panels = ('Negative', 'Neutral', 'Positive')

fig = make_subplots(rows = 1,
                    cols = len(sentiment_panels),
                    subplot_titles = sentiment_panels,
                    specs = [[{'type':'xy'} for _ in sentiment_panels]],
                    shared_yaxes = True)

# Sentiment classes that actually occur (outer index level of the series);
# a class with no reviews gets an empty panel instead of raising a KeyError.
present_sentiments = set(overall_vs_sentiment.index.get_level_values(0))

for panel_col, sentiment in enumerate(sentiment_panels, start = 1):
    if sentiment not in present_sentiments:
        continue

    per_score = overall_vs_sentiment.loc[sentiment]
    counts = per_score.values.tolist()

    # Every panel uses its own counts as bar labels (the Negative panel used
    # to take them from the whole series) and string x values (the Neutral
    # and Positive panels used to pass the raw index labels).
    fig.add_trace(go.Bar(x = [str(i) for i in per_score.index],
                         y = counts,
                         text = counts,
                         textfont = dict(size = 15),
                         textposition = 'auto',
                         showlegend = False,
                         marker = dict(color = colors,
                                       line = dict(color = '#DBE6EC',
                                                   width = 1))),
                  row = 1, col = panel_col)

fig.show()

    