# Exploratory Data Analysis

# 1)- Importing key modules

In [1]:
# support both Python 2 and Python 3 with minimal overhead.
from __future__ import absolute_import, division, print_function

# I am an engineer. I care only about error not warning. So, let's be maverick and ignore warnings.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import re    # for regular expressions 
import nltk  # for text manipulation 
#For other text data
from collections import Counter
import scattertext as st
import spacy
from pprint import pprint
import en_core_web_sm
nlp = spacy.load('en_core_web_sm')

import string 
import numpy as np 
import pickle
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
#https://textblob.readthedocs.io/en/dev/install.html
from textblob import TextBlob, Word, Blobber
from textblob.classifiers import NaiveBayesClassifier
from textblob.taggers import NLTKTagger

In [4]:
#For Visuals
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from matplotlib import rcParams
rcParams['figure.figsize'] = 11, 8
%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [5]:
# for dynamic visuals
import plotly as py
import cufflinks as cf

In [6]:
# for offline mode in notebook
py.offline.init_notebook_mode(connected=True)
cf.go_offline()

In [7]:
%reload_ext version_information
%version_information pandas,numpy, nltk, seaborn, matplotlib

Software,Version
Python,3.7.7 64bit [MSC v.1916 64 bit (AMD64)]
IPython,7.13.0
OS,Windows 10 10.0.17763 SP0
pandas,1.0.3
numpy,1.18.1
nltk,3.5
seaborn,0.10.1
matplotlib,3.1.3
Wed Jun 24 19:27:22 2020 W. Europe Daylight Time,Wed Jun 24 19:27:22 2020 W. Europe Daylight Time


# 2)- Loading Dataset

In [8]:
# Read the reviews CSV (path is relative to the notebook's working directory),
# using the file's first column as the DataFrame index.
data=pd.read_csv('Womens Clothing E-Commerce Reviews.csv',index_col=[0])
# Display (n_rows, n_columns) as the cell output.
data.shape

(23486, 10)

In [9]:
data.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [10]:
# Remove the 'Clothing ID' and 'Title' columns (axis=1 selects columns).
# Note: `data` is rebound to the trimmed frame, so re-running this cell fails
# with KeyError because the columns are already gone (drop defaults to errors='raise').
data=data.drop(['Clothing ID','Title'], axis=1)

In [11]:
data.head()

Unnamed: 0,Age,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,33,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,34,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,60,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,50,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,47,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


### 2.2. Missing values check

In [12]:
# Checking rows now
def summary_missing(dataset):
    """Summarise the missing values of a DataFrame, column by column.

    Prints two lines (the row count of ``dataset`` and the number of columns
    that contain at least one missing value) and returns a table describing
    those columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has missing values, indexed by column name
        and sorted by percentage missing (descending). Columns are
        'No. of mising Value' (count) and '%age of missing Value'
        (percentage of rows, rounded to one decimal).
    """
    n_miss = dataset.isnull().sum()
    n_obs = dataset.shape[0]
    n_miss_per = n_miss/n_obs*100
    n_miss_tbl = pd.concat([n_miss, n_miss_per], axis = 1).sort_values(1, ascending = False)
    # Keep a column when its raw missing COUNT is non-zero. Filtering on the
    # rounded percentage would hide columns whose missing share is below
    # 0.05% (rounds to 0.0) and, for a zero-row frame, would keep every
    # column because the percentage is NaN and NaN != 0 is True.
    n_miss_tbl = n_miss_tbl[n_miss_tbl[0] != 0].round(1)
    # NOTE(review): the label says 'fields' but dataset.shape[0] is the number
    # of rows; the message is left unchanged so existing output stays the same.
    print('No. of fields: ', dataset.shape[0])
    print('No. of missing fields: ', n_miss_tbl.shape[0])
    n_miss_tbl = n_miss_tbl.rename(columns = {0:'No. of mising Value', 1:'%age of missing Value'})
    return n_miss_tbl

In [13]:
summary_missing(data)

No. of fields:  23486
No. of missing fields:  4


Unnamed: 0,No. of mising Value,%age of missing Value
Review Text,845,3.6
Division Name,14,0.1
Department Name,14,0.1
Class Name,14,0.1


### 2.3. Fixing missing values

**Instead of dropping rows with null values, we shall fill those nulls with 'other'.**

In [14]:
# Replace every missing value with the literal string 'other'.
# fillna is called on the whole DataFrame, so the replacement applies to any
# column that contains nulls, not only to 'Review Text'.
data=data.fillna('other')

In [15]:
summary_missing(data)

No. of fields:  23486
No. of missing fields:  0


Unnamed: 0,No. of mising Value,%age of missing Value


# 3)- Exploring Features

### 3.1)- Review Text

In [16]:
data['Review Text'].head(2)

0    Absolutely wonderful - silky and sexy and comf...
1    Love this dress!  it's sooo pretty.  i happene...
Name: Review Text, dtype: object

In [17]:
#converting to list
review_list=data['Review Text'].tolist()
type(review_list)

list

In [18]:
review_list[0]

'Absolutely wonderful - silky and sexy and comfortable'

In [19]:
review_str=' '.join(data['Review Text'].tolist())
type(review_str)

str

In [20]:
review_str[:100]

"Absolutely wonderful - silky and sexy and comfortable Love this dress!  it's sooo pretty.  i happene"

Here all the reviews are joined into a single string. A classical approach is to convert DataFrame --> Series --> list --> str.
The advantage is that the text ends up as one plain string, which makes it clearer how later stages such as data cleaning and tokenization operate on it.

In [21]:
data['Review Text'][0][:50]

'Absolutely wonderful - silky and sexy and comforta'

In [22]:
len(data['Review Text'][0])

53

In [23]:
data['Review Text'][1]

'Love this dress!  it\'s sooo pretty.  i happened to find it in a store, and i\'m glad i did bc i never would have ordered it online bc it\'s petite.  i bought a petite and am 5\'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.'

In [24]:
# Preview the first 1000 characters of the joined corpus. review_str already
# holds ' '.join(data['Review Text'].tolist()), so reuse it instead of
# re-joining every review just to slice the result.
print(review_str[:1000])

Absolutely wonderful - silky and sexy and comfortable Love this dress!  it's sooo pretty.  i happened to find it in a store, and i'm glad i did bc i never would have ordered it online bc it's petite.  i bought a petite and am 5'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite. I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments! This shirt is ve

In [25]:
# Store a preview of each review (its first 50 characters) in a new
# 'small_text' column.
data['small_text'] = data['Review Text'].str.slice(stop=50)

In [26]:
data.head()

Unnamed: 0,Age,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,small_text
0,33,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,Absolutely wonderful - silky and sexy and comf...
1,34,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,Love this dress! it's sooo pretty. i happene...
2,60,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,I had such high hopes for this dress and reall...
3,50,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,"I love, love, love this jumpsuit. it's fun, fl..."
4,47,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,This shirt is very flattering to all due to th...


In [27]:
data.tail()

Unnamed: 0,Age,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,small_text
23481,34,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses,I was very happy to snag this dress at such a ...
23482,48,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits,"It reminds me of maternity clothes. soft, stre..."
23483,31,"This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses,"This fit well, but the top was very see throug..."
23484,28,I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses,I bought this dress for a wedding i have this ...
23485,52,This dress in a lovely platinum is feminine an...,5,1,22,General Petite,Dresses,Dresses,This dress in a lovely platinum is feminine an...


In [28]:
data.small_text[0]

'Absolutely wonderful - silky and sexy and comforta'

In [29]:
data.small_text[1]

"Love this dress!  it's sooo pretty.  i happened to"

In [30]:
data.small_text[2]

'I had such high hopes for this dress and really wa'

In [31]:
data['Review Text'][2]

'I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c'