# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [15]:
import os
import numpy as np
import pandas as pd
import pandera as pdv
from pandera import Column, Check, DataFrameSchema

%config Completer.use_jedi = False

## Variables

In [25]:
# Seed passed to every sampling step so repeated runs draw the same rows.
RANDOM_STATE = 8

# Raw input files are read from <parent of working directory>/data/raw.
RAW_DIR = os.path.join(os.pardir, 'data', 'raw')

## Helpers

In [3]:
def load_data(path, filename):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(os.path.join(path, filename))

## Load data

In [4]:
# Read the complete review dataset from the raw-data directory.
raw = load_data(
    path=RAW_DIR,
    filename='Womens Clothing E-Commerce Reviews.csv',
)

In [5]:
# Preview the first rows to eyeball column names and value formats.
raw.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [6]:
# Show dtype and non-null count per column to spot columns with missing values.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


## Data Validation

Here is a very simple example of data validation using the [Pandera](https://pandera.readthedocs.io/en/stable/) library.

### Define Schema

In [51]:
# Columns covered by the schema: everything except 'Unnamed: 0'.
COLS = raw.columns.difference(['Unnamed: 0'])

# Define checks
# Lower bound shared by every integer column.
check_ge_min = Check.greater_than_or_equal_to(0)

# Upper bounds are fixed constants rather than being derived from the data
# under validation: comparing a series with its own max() is true by
# construction and can never reject a value. Only columns with a known
# ceiling are listed; the remaining integer columns get the lower bound only.
# NOTE(review): limits taken from the public dataset description - confirm.
UPPER_BOUNDS = {
    'Rating': 5,           # rating scale tops out at 5
    'Recommended IND': 1,  # binary indicator (0 or 1)
}


def _column_schema(name):
    """Build the pandera ``Column`` for ``name`` from its dtype in ``raw``.

    int64 columns must be non-negative and, when listed in ``UPPER_BOUNDS``,
    no greater than the configured ceiling. Every other column is declared
    as a nullable string column.
    """
    # A direct dtype comparison gives the same answer as the deprecated
    # pd.api.types.is_int64_dtype helper without copying the column to an array.
    if raw[name].dtype == np.int64:
        checks = [check_ge_min]
        if name in UPPER_BOUNDS:
            checks.append(Check.less_than_or_equal_to(UPPER_BOUNDS[name]))
        return Column(pdv.Int, checks=checks)
    return Column(pdv.String, nullable=True)


schema = DataFrameSchema({name: _column_schema(name) for name in COLS})

In [52]:
schema

<Schema DataFrameSchema(columns={'Age': <Schema Column(name=Age, type=int)>, 'Class Name': <Schema Column(name=Class Name, type=str)>, 'Clothing ID': <Schema Column(name=Clothing ID, type=int)>, 'Department Name': <Schema Column(name=Department Name, type=str)>, 'Division Name': <Schema Column(name=Division Name, type=str)>, 'Positive Feedback Count': <Schema Column(name=Positive Feedback Count, type=int)>, 'Rating': <Schema Column(name=Rating, type=int)>, 'Recommended IND': <Schema Column(name=Recommended IND, type=int)>, 'Review Text': <Schema Column(name=Review Text, type=str)>, 'Title': <Schema Column(name=Title, type=str)>}, checks=[], index=None, coerce=False, pandas_dtype=None,strict=False,name=None,ordered=False)>

### Validate data

In [53]:
# Reproducible 100-row sample, without the 'Unnamed: 0' column, used for validation.
val_sample = (
    raw
    .drop(columns='Unnamed: 0')
    .sample(n=100, random_state=RANDOM_STATE)
)

In [54]:
# Validate the sample drawn in the previous cell. The original referenced an
# undefined name (`validate_sample`), which fails with NameError on a fresh
# kernel. validate() returns the validated frame when all checks pass.
val_df = schema.validate(val_sample)

In [56]:
# Report success only when the validated frame actually contains data.
# NOTE(review): `is_not_valid` records whether the validated frame is empty,
# not whether a schema check failed (pandera signals a failed check by raising
# from validate()) - consider renaming the flag.
is_not_valid = val_df.empty
if not is_not_valid:
    print('Data Validation successfully completed!')

Data Validation successfully completed!


# Comments