In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from textblob import TextBlob
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import warnings
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Sanity check of the filter above: this warning is raised but filtered out.
warnings.warn("this will not show")

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# IPython magic (not plain Python): only valid when run inside a notebook/IPython.
%matplotlib inline
# Set up plotly for notebook output.
# NOTE(review): connected=True presumably loads plotly.js from the network
# rather than embedding it -- confirm if offline use is required.
init_notebook_mode(connected=True)
# Put cufflinks in offline mode.
cf.go_offline()

def missing_values_analysis(df):
    """Summarise missing data column by column.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by the columns of ``df`` with two columns:
        'Missing Values' (number of null entries) and 'Percentage'
        (that number divided by the row count, times 100).
    """
    null_counts = df.isnull().sum()
    row_count = len(df)

    summary = null_counts.to_frame(name='Missing Values')
    summary['Percentage'] = (null_counts / row_count) * 100
    return summary

# Load the dataset from "amazon.csv" (path is relative to the working directory).
df = pd.read_csv("amazon.csv")
# Preview of the first rows; as a bare expression the result is discarded
# unless this is the last line of a notebook cell.
df.head()

def check_dataframe(df, head=5, tail=5):
    """Print a quick structural overview of ``df``.

    Sections, in order: shape, column dtypes, the missing-value table from
    ``missing_values_analysis``, the number of duplicated rows, and selected
    quantiles (0, 5, 50, 95, 99 and 100 %) of the numeric columns.

    Args:
        df: DataFrame to describe.
        head: Accepted for compatibility; not used by the body.
        tail: Accepted for compatibility; not used by the body.

    Returns:
        None. Everything is written to stdout.
    """
    def banner(title, width=82):
        # Section header: title centred in a run of '~'.
        print(title.center(width, '~'))

    banner("SHAPE")
    row_count, column_count = df.shape
    print(f'Rows: {row_count}')
    print(f'Columns: {column_count}')

    banner("TYPES")
    print(df.dtypes)

    banner("")
    print(missing_values_analysis(df))

    # This header is one character wider than the others (83 vs 82).
    banner('DUPLICATED VALUES', width=83)
    print(df.duplicated().sum())

    banner("QUANTILES")
    probabilities = [0, 0.05, 0.50, 0.95, 0.99, 1]
    numeric_names = df.select_dtypes(include=[np.number]).columns
    # Transposed so each numeric column is a row and each quantile a column.
    print(df[numeric_names].quantile(probabilities).T)

def check_class(dataframe):
    """Report how many distinct values each column holds.

    Args:
        dataframe: DataFrame to inspect.

    Returns:
        DataFrame with columns 'Variable' (column name) and 'classes'
        (result of ``nunique()`` for that column), sorted by 'classes' in
        descending order and re-indexed from 0.
    """
    column_names = dataframe.columns
    distinct_counts = [dataframe[name].nunique() for name in column_names]
    summary = pd.DataFrame({'Variable': column_names, 'classes': distinct_counts})
    return summary.sort_values(by='classes', ascending=False).reset_index(drop=True)

def categorical_variable_summary(df, column_name):
    """Show a bar chart and a pie chart of one column's value frequencies.

    The figure has two panels side by side: 'Countplot' (bar per distinct
    value, labelled with its count) and 'Percentage' (pie with label and
    percent). It is displayed with ``iplot``; nothing is returned.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to summarise.
    """
    palette = ['#B34D22', '#EBE0C0', '#1FEB0C', '#0C92EB', '#EAC2D5']

    # Frequency table computed once and shared by both traces.
    frequencies = df[column_name].value_counts()
    categories = frequencies.index
    counts = frequencies.values.tolist()
    # At most one palette colour per category (the palette has five entries).
    colours = palette[:len(categories)]

    bar_trace = go.Bar(
        x=list(map(str, categories)),
        y=counts,
        text=counts,
        textfont={'size': 14},
        name=column_name,
        textposition='auto',
        showlegend=False,
        marker={'color': colours, 'line': {'color': '#DBE6EC', 'width': 1}},
    )
    pie_trace = go.Pie(
        labels=categories,
        values=counts,
        textinfo='label+percent',
        textfont={'size': 14},
        showlegend=False,
        marker={'colors': colours, 'line': {'color': '#DBE6EC', 'width': 1}},
    )

    # The pie needs a 'domain' cell; the bar chart uses a regular 'xy' cell.
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Countplot', 'Percentage'),
        specs=[[{'type': 'xy'}, {'type': 'domain'}]],
    )
    fig.add_trace(bar_trace, row=1, col=1)
    fig.add_trace(pie_trace, row=1, col=2)

    fig.update_layout(
        title=dict(text=column_name, y=0.9, x=0.5, xanchor='center'),
        template='plotly_white',
    )

    iplot(fig)

# Exploratory pass over the loaded data: printed structural overview,
# distinct-value counts per column (result discarded unless displayed by a
# notebook), and the frequency charts for the 'overall' column.
# NOTE(review): 'overall' is presumably the review rating -- confirm against amazon.csv.
check_dataframe(df)
check_class(df)
categorical_variable_summary(df, 'overall')

# Matches every character that is NOT an ASCII letter. The column clean-up
# previously used "[a-zA-Z]" (no negation), which blanked out the letters
# themselves and left only digits/punctuation for the sentiment steps.
non_letters = re.compile(r"[^a-zA-Z]")

# Single-review demonstration of the cleaning step. Non-letters become a space
# (not an empty string) so neighbouring words are not fused together.
review_example = df.reviewText[2031]
review_example = non_letters.sub(' ', review_example)

# Clean the whole column: coerce to str (covers missing values), replace
# non-letters with spaces, then lower-case.
df["reviewText"] = df["reviewText"].map(lambda x: non_letters.sub(' ', str(x)).lower())

# TextBlob's sentiment is a (polarity, subjectivity) pair; expanding it into a
# Series per row yields the two new columns.
df[['polarity', 'subjectivity']] = df['reviewText'].apply(lambda text: pd.Series(TextBlob(text).sentiment))

# Built once and reused for every review; the loop used to construct a new
# analyzer per row, repeating the constructor's work each time.
_vader_analyzer = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Return 'Negative', 'Positive' or 'Neutral' for one review text.

    The label comes from comparing VADER's 'neg' and 'pos' scores; equal
    scores yield 'Neutral'.
    """
    scores = _vader_analyzer.polarity_scores(text)
    if scores['neg'] > scores['pos']:
        return "Negative"
    if scores['pos'] > scores['neg']:
        return "Positive"
    return "Neutral"


# One pass over the column instead of iterrows() with a df.loc write per row.
df['sentiment'] = df['reviewText'].map(_vader_label)

# Five 'Positive' reviews with the highest "wilson_lower_bound" value. As a bare
# expression the result is discarded unless a notebook displays it.
# NOTE(review): "wilson_lower_bound" is not created anywhere in the visible
# code; presumably it is a column of amazon.csv -- confirm, otherwise this
# line raises KeyError.
df[df['sentiment'] == 'Positive'].sort_values("wilson_lower_bound", ascending=False).head(5)

# Frequency charts for the sentiment labels.
categorical_variable_summary(df, 'sentiment')