# Exploratory Data Analysis 

In [1]:
#!pip install pandas-profiling[notebook,html]
#!pip install plotly
#!pip install chart_studio

In [2]:
#plotly.offline doesn't push your charts to the clouds
import plotly.offline as pyo
#allows us to create the Data and Figure objects
from plotly.graph_objs import *
#plotly.plotly pushes your charts to the cloud  
# import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.express as px

# set the default plotly template used for all figures
import plotly.io as pio
pio.templates.default = "plotly_white"

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
from pandas_profiling import ProfileReport
from googletrans import Translator
import datetime as dt

import numpy as np

import os
import json
import re
import sys

In [3]:
# Record the pandas version this notebook was run with.
print(pd.__version__)

1.0.3


In [4]:
def print_full(x):
    """Lift the column-width truncation for displayed frames and hand ``x`` back.

    The option change is global and stays in effect after the call, so the
    returned object is rendered untruncated when it is the cell's last
    expression. Call ``reset()`` afterwards to restore the default width.
    """
    pd.options.display.max_colwidth = None
    return x

def reset():
    """Restore pandas' default ``display.max_colwidth`` setting."""
    pd.reset_option("display.max_colwidth")

In [8]:
# The file is read as JSON Lines (lines=True): one JSON record per line.
df = pd.read_json('data/AMAZON_FASHION_5.json', lines=True)

In [9]:
# Schema overview: dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3176 entries, 0 to 3175
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall         3176 non-null   int64  
 1   verified        3176 non-null   bool   
 2   reviewTime      3176 non-null   object 
 3   reviewerID      3176 non-null   object 
 4   asin            3176 non-null   object 
 5   style           3107 non-null   object 
 6   reviewerName    3176 non-null   object 
 7   reviewText      3160 non-null   object 
 8   summary         3176 non-null   object 
 9   unixReviewTime  3176 non-null   int64  
 10  vote            297 non-null    float64
 11  image           106 non-null    object 
dtypes: bool(1), float64(1), int64(2), object(8)
memory usage: 276.2+ KB


In [11]:
# Preview the first rows of the frame.
df.head(20)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
1,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Black (3746...",Tonya B.,Great product and price!,Five Stars,1441324800,,
2,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Gray L...",Tonya B.,Great product and price!,Five Stars,1441324800,,
3,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue (37867...",Tonya B.,Great product and price!,Five Stars,1441324800,,
4,5,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Pink'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
5,3,True,"05 6, 2015",A3W11493KS6Z2L,B000K2PJ4K,"{'Size:': ' Little Boys', 'Color:': ' White/Bl...",NaeNae,Waaay too small. Will use for futur children!,Oops!,1430870400,,
6,5,True,"05 6, 2015",A3W11493KS6Z2L,B000K2PJ4K,"{'Size:': ' Little Boys', 'Color:': ' Blue/Ora...",NaeNae,Stays vibrant after many washes,Great,1430870400,,
7,5,True,"05 6, 2015",A3W11493KS6Z2L,B000K2PJ4K,"{'Size:': ' Little Boys', 'Color:': ' Blue (37...",NaeNae,Stays vibrant after many washes,Good,1430870400,,
8,5,True,"05 6, 2015",A3W11493KS6Z2L,B000K2PJ4K,"{'Size:': ' Little Boys', 'Color:': ' Blue/Pink'}",NaeNae,My son really likes the pink. Ones which I was...,Great,1430870400,,
9,3,True,"05 6, 2015",A3W11493KS6Z2L,B000K2PJ4K,"{'Size:': ' Little Boys', 'Color:': ' Light Bl...",NaeNae,Waaay too small. Will use for future child.,Oops!,1430870400,,


In [17]:
# Per-column dtypes only.
df.dtypes

overall             int64
verified             bool
reviewTime         object
reviewerID         object
asin               object
style              object
reviewerName       object
reviewText         object
summary            object
unixReviewTime      int64
vote              float64
image              object
dtype: object

In [16]:
# Flag rows that repeat an earlier row in every column except `style`.
# duplicated() hashes each cell, so list-valued cells raise
# "TypeError: unhashable type: 'list'"; comparing the string representation
# of every column avoids that regardless of which column holds the lists.
df.drop(columns='style').astype(str).duplicated()

## Data Preparation, Cleaning and Normalization

- Capitalize all column names
- Change the Target Column name to `Radicalism`
- Change the `created at` column name to `Date`
- Map Target classes to: `Radical`, `Nonradical`, `Unrelated`
- Drop irrelevant Columns: `Tweet id`
- Change `Favourite count` to integer type
- Check for Duplications and drop them
- Check for missing values

### Capitalize all column names

In [405]:
# Normalise header casing: str.capitalize upper-cases the first character
# and lower-cases the rest of each column name.
df.columns = list(map(str.capitalize, df.columns))
df.columns

Index(['Tweet id', 'Created at', 'Tweet', 'Favourite count', 'Retweet count',
       'Language', 'Author', 'In_reply_to_screen_name',
       'In_reply_to_user_id_str', 'In_reply_to_status_id_str', 'Username',
       'Radical ? 0 unrelated'],
      dtype='object')

### Change Some Column Names

In [406]:
# Give the label and timestamp columns short, descriptive names.
column_renames = {
    "Radical ? 0 unrelated": "Radicalism",
    "Created at": "Date",
}
df = df.rename(columns=column_renames)

### Map Target classes to: `Radical`, `Nonradical`, `Unrelated`

In [407]:
# Translate the raw label codes into readable class names.
# Series.map yields NaN for any value that is not a key of the mapping.
radicalism_labels = {"T": "Radical", "F": "Nonradical", 0: "Unrelated"}
df['Radicalism'] = df['Radicalism'].map(radicalism_labels)

In [408]:
# Class balance of the target after mapping (NaN entries are not counted by default).
df['Radicalism'].value_counts()

Nonradical    11455
Radical       10133
Unrelated      2479
Name: Radicalism, dtype: int64

### Drop irrelevant Columns

In [409]:
# Columns considered irrelevant for this analysis.
drop_cols = ['Tweet id']

In [410]:
# Rebind instead of mutating in place, and tolerate already-removed columns
# so that re-running this cell does not raise KeyError.
df = df.drop(columns=drop_cols, errors='ignore')

### Cast Some Columns to Their Appropriate Data Type

In [411]:
# Inspect the raw values before casting; non-numeric entries would block the integer conversion.
df['Favourite count'].value_counts()

0     22082
1      1366
2       278
3        97
4        55
5        51
6        38
8        19
7        17
10       17
9        13
11        9
12        6
13        6
14        4
16        4
15        3
18        3
19        2
21        2
25        2
82        1
17        1
36        1
F         1
Name: Favourite count, dtype: int64

In [412]:
# Drop the malformed row whose count is the string 'F'.
# .copy() makes the result an independent frame, so a later column
# assignment on it does not trigger SettingWithCopyWarning.
df = df[df['Favourite count'] != 'F'].copy()

In [413]:
# Confirm the non-numeric 'F' entry is gone.
df['Favourite count'].value_counts()

0     22082
1      1366
2       278
3        97
4        55
5        51
6        38
8        19
10       17
7        17
9        13
11        9
12        6
13        6
14        4
16        4
15        3
18        3
19        2
21        2
25        2
36        1
17        1
82        1
Name: Favourite count, dtype: int64

In [414]:
# Cast through DataFrame.astype, which returns a new frame; assigning the
# column on a frame produced by boolean filtering can emit
# SettingWithCopyWarning.
df = df.astype({'Favourite count': 'int64'})

In [415]:
# Total in-memory size of the frame in MiB; deep=True also measures the
# contents of object-dtype columns rather than just their pointers.
BYTES_PER_MIB = 1024 ** 2
df.memory_usage(deep=True).sum() / BYTES_PER_MIB

14.989251136779785