# Text Analysis for Women's E-Commerce Clothing Reviews

## Libraries and Settings

In [1]:
import os
import numpy as np
import pandas as pd
import pandera as pdv
from pandera import Column, Check, DataFrameSchema

%config Completer.use_jedi = False

## Variables

In [2]:
# Directory holding the raw input files, resolved relative to the notebook's
# parent directory (../data/raw).
RAW_DIR = os.path.join(os.pardir, 'data', 'raw')
# Seed passed to the sampling step so the validation sample is reproducible.
RANDOM_STATE = 8

## Helpers

In [3]:
def load_data(path, filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, read with ``pd.read_csv`` defaults.
    """
    return pd.read_csv(os.path.join(path, filename))

## Load data

In [4]:
# Read the raw reviews CSV from RAW_DIR into a DataFrame.
raw = load_data(RAW_DIR, 'Womens Clothing E-Commerce Reviews.csv')

In [5]:
raw.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [6]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


## Data Validation

Here is a very simple example of data validation using the [Pandera](https://pandera.readthedocs.io/en/stable/) library.

In [7]:
raw.columns

Index(['Unnamed: 0', 'Clothing ID', 'Age', 'Title', 'Review Text', 'Rating',
       'Recommended IND', 'Positive Feedback Count', 'Division Name',
       'Department Name', 'Class Name'],
      dtype='object')

### Define Schema

In [8]:
# Column labels left after removing the two identifier columns.
# Not referenced by the schema below; kept for use elsewhere in the notebook.
COLS = raw.columns.difference(['Unnamed: 0', 'Clothing ID'])

# Fixed plausibility bounds for the numeric columns.
# NOTE(review): these limits are assumptions about the dataset, not values
# read from it -- confirm against the dataset description.
MAX_AGE = 120                    # sanity ceiling for a reviewer's age
MIN_RATING, MAX_RATING = 1, 5    # assumes a 1-5 star rating scale
RECOMMENDED_VALUES = [0, 1]      # assumes a binary "recommended" indicator

# Define checks
# An upper bound derived from the column itself (s <= max(s)) is true for every
# value and can never fail, so each column gets an explicit constant bound.
check_ge_min = Check.greater_than_or_equal_to(0)
check_age = Check.in_range(0, MAX_AGE)
check_rating = Check.in_range(MIN_RATING, MAX_RATING)
check_recommended = Check.isin(RECOMMENDED_VALUES)

schema = DataFrameSchema(
    {
        'Age': Column(pdv.Int, checks=[check_age]),
        # Text columns may be missing in the raw data, hence nullable=True.
        'Title': Column(pdv.String, nullable=True),
        'Review Text': Column(pdv.String, nullable=True),
        'Rating': Column(pdv.Int, checks=[check_rating]),
        'Recommended IND': Column(pdv.Int, checks=[check_recommended]),
        # A count has no natural ceiling; only require it to be non-negative.
        'Positive Feedback Count': Column(pdv.Int, checks=[check_ge_min]),
        'Division Name': Column(pdv.String, nullable=True),
        'Department Name': Column(pdv.String, nullable=True),
        'Class Name': Column(pdv.String, nullable=True)
    }
)

In [9]:
schema

<Schema DataFrameSchema(columns={'Age': <Schema Column(name=Age, type=int)>, 'Title': <Schema Column(name=Title, type=str)>, 'Review Text': <Schema Column(name=Review Text, type=str)>, 'Rating': <Schema Column(name=Rating, type=int)>, 'Recommended IND': <Schema Column(name=Recommended IND, type=int)>, 'Positive Feedback Count': <Schema Column(name=Positive Feedback Count, type=int)>, 'Division Name': <Schema Column(name=Division Name, type=str)>, 'Department Name': <Schema Column(name=Department Name, type=str)>, 'Class Name': <Schema Column(name=Class Name, type=str)>}, checks=[], index=None, coerce=False, pandas_dtype=None,strict=False,name=None,ordered=False)>

### Validate data

In [10]:
# Remove the identifier columns that the schema does not describe, then draw a
# reproducible 100-row sample to validate.
val_sample = (
    raw
    .drop(columns=['Unnamed: 0', 'Clothing ID'])
    .sample(n=100, random_state=RANDOM_STATE)
)

In [11]:
# Run the schema's column checks against the sample. pandera raises a
# SchemaError when a check fails; otherwise it returns the validated frame.
val_df = schema.validate(val_sample)

In [12]:
val_df

Unnamed: 0,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
20159,44,Nice surprise,I took a risk on this one. i wasn't sure if th...,4,1,0,General Petite,Tops,Knits
1796,37,,Buy this skit- it's everything that retailer u...,5,1,0,General Petite,Bottoms,Skirts
14406,38,Perfect pants,I'm so in love with these pants. they are so f...,5,1,1,General Petite,Bottoms,Pants
17498,24,"Almost perfect, but the top is unflattering.","The dress had great potential, but the top sag...",3,0,0,General Petite,Dresses,Dresses
18634,36,Shape odd,"The shape was not as nice on model, seemed lik...",3,0,1,General,Dresses,Dresses
...,...,...,...,...,...,...,...,...,...
17985,50,"Ok shirt, white is off white","As with many of retailer's tops these days, th...",3,1,0,General,Tops,Knits
2536,53,Gorgeous and unique,This top is really beautiful. my girlfriend sa...,5,1,0,General,Tops,Blouses
20851,23,New favorite shirt,I love how pretty and soft this shirt is. it f...,5,1,1,General,Tops,Knits
23046,67,Perfect dressy jogger!,These joggers are so comfortable. nice and lig...,5,1,0,General,Bottoms,Pants


In [13]:
# `schema.validate` raises when a check fails, so reaching this cell already
# means the sample passed. Whether the frame is empty says nothing about the
# validation outcome, so the flag instead confirms that validation handed back
# every sampled row. True means success.
validation_status = len(val_df) == len(val_sample)
if validation_status:
    print('Data Validation successfully completed!')

Data Validation successfully completed!


# Comments