In [1]:
import re

import numpy as np
import pandas as pd

from tqdm import tqdm

In [2]:
data_path = '../data/modcloth_final_data.json'

In [3]:
df_records = pd.read_json(data_path, lines=True)

print('Number of records = {}'.format(len(df_records)))
print('-'*40)
print('Attribute List:')
print('-'*40)
df_records.columns = [x.replace(" ","_") for x in df_records.columns]
print(df_records.columns)

Number of records = 82790
----------------------------------------
Attribute List:
----------------------------------------
Index(['item_id', 'waist', 'size', 'quality', 'cup_size', 'hips', 'bra_size',
       'category', 'bust', 'height', 'user_name', 'length', 'fit', 'user_id',
       'shoe_size', 'shoe_width', 'review_summary', 'review_text'],
      dtype='object')


In [4]:
print(df_records.shape)
df_records.head(1)

(82790, 18)


Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,bust,height,user_name,length,fit,user_id,shoe_size,shoe_width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36,5ft 6in,Emily,just right,small,991571,,,,


### Attribute Types

According to the paper,
- fit : is the target variable
- category, shoe_width, item_id, user_id : are treated as categorical and we learn embeddings for representing them
- quality, size, shoe_size, waist, bust, cup_size, bra_size, hips, height : are treated as regular categorical/numerical variables
- user_name, length, review_summary, review_text : are not considered

In [5]:
df_records.drop(['user_name', 'length', 'review_summary', 'review_text'], axis=1, inplace=True)
df_records.dtypes

item_id         int64
waist         float64
size            int64
quality       float64
cup_size       object
hips          float64
bra_size      float64
category       object
bust           object
height         object
fit            object
user_id         int64
shoe_size     float64
shoe_width     object
dtype: object

On inspection, 
- `fit` and `category` have no NaNs
- `bust` and `shoe_width` have too many NaNs --> dropping those columns for simplicity
- `cup_size` and `height` have missing values (NaNs) --> try data imputation 


In [6]:
df_records.drop(['bust', 'shoe_width'], axis=1, inplace=True)

### Missing Data Imputation
- For numeric (float/int) columns, replace with median value
- For others, replace with most frequent value
- Can be replaced with more complex strategies

In [7]:
df_records.head()

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,height,fit,user_id,shoe_size
0,123373,29.0,7,5.0,d,38.0,34.0,new,5ft 6in,small,991571,
1,123373,31.0,13,3.0,b,30.0,36.0,new,5ft 2in,small,587883,
2,123373,30.0,7,2.0,b,,32.0,new,5ft 7in,small,395665,9.0
3,123373,,21,5.0,dd/e,,,new,,fit,875643,
4,123373,,18,5.0,b,,36.0,new,5ft 2in,small,944840,


In [8]:
df_records.dtypes

item_id        int64
waist        float64
size           int64
quality      float64
cup_size      object
hips         float64
bra_size     float64
category      object
height        object
fit           object
user_id        int64
shoe_size    float64
dtype: object

In [9]:
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
frequent_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [10]:
numeric_columns = ['waist', 'size', 'quality', 'hips', 'bra_size', 'shoe_size']
string_columns = ['cup_size', 'height']

for col in numeric_columns:
    df_records[col] = median_imputer.fit_transform(np.array(df_records[col]).reshape(-1,1)).squeeze()
    
for col in string_columns:
    df_records[col] = frequent_imputer.fit_transform(np.array(df_records[col]).reshape(-1,1)).squeeze()

In [11]:
df_records.head()

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,height,fit,user_id,shoe_size
0,123373,29.0,7.0,5.0,d,38.0,34.0,new,5ft 6in,small,991571,8.0
1,123373,31.0,13.0,3.0,b,30.0,36.0,new,5ft 2in,small,587883,8.0
2,123373,30.0,7.0,2.0,b,39.0,32.0,new,5ft 7in,small,395665,9.0
3,123373,30.0,21.0,5.0,dd/e,39.0,36.0,new,5ft 4in,fit,875643,8.0
4,123373,30.0,18.0,5.0,b,39.0,36.0,new,5ft 2in,small,944840,8.0


#### Converting height to numeric column

In [12]:
def get_float_height(height):
    """
    Convert string height to float (metres)
    """
    
    output = height.split(' ')
    if len(output) == 2:
        feet, inches = output[0], output[1]
    else:
        feet, inches = output[0], "0in"
    feet = float(re.findall('\d+', feet)[0])
    inches = float(re.findall('\d+', inches)[0])
    
    metres = (feet * 0.3048) + (inches * 0.0254)
    
    return metres

In [13]:
df_records['height'] = df_records['height'].apply(get_float_height).head()

### Normalize Numeric Features

In [14]:
from sklearn.preprocessing import StandardScaler

numeric_columns = ['waist', 'size', 'quality', 'hips', 'bra_size', 'height', 'shoe_size']
scaler = StandardScaler()
df_records_numeric = pd.DataFrame(scaler.fit_transform(df_records.loc[:, numeric_columns]), 
                                  columns=numeric_columns)
df_records_numeric.head()

Unnamed: 0,waist,size,quality,hips,bra_size,height,shoe_size
0,-1.027082,-0.684438,1.058979,-0.396924,-0.6357,0.882523,-0.063125
1,0.936889,0.040909,-0.956397,-2.050813,0.008324,-1.078639,-0.063125
2,-0.045096,-0.684438,-1.964085,-0.190188,-1.279724,1.372813,1.220767
3,-0.045096,1.008039,1.058979,-0.190188,0.008324,-0.098058,-0.063125
4,-0.045096,0.645365,1.058979,-0.190188,0.008324,-1.078639,-0.063125


### Convert Categorical Features into Labels

In [16]:
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ['item_id', 'category', 'cup_size', 'user_id', 'fit']
ordinal_enc = OrdinalEncoder()

df_records_categorical = pd.DataFrame(np.array(ordinal_enc.fit_transform(df_records.loc[:, categorical_columns]), 
                                           dtype=np.int64), columns=categorical_columns)
df_records_categorical.head()

Unnamed: 0,item_id,category,cup_size,user_id,fit
0,0,2,4,47557,2
1,0,2,2,28324,2
2,0,2,2,19037,2
3,0,2,5,42142,0
4,0,2,2,45350,2


In [17]:
df_records = pd.concat([df_records_numeric, df_records_categorical], axis=1)
df_records.head(3)

Unnamed: 0,waist,size,quality,hips,bra_size,height,shoe_size,item_id,category,cup_size,user_id,fit
0,-1.027082,-0.684438,1.058979,-0.396924,-0.6357,0.882523,-0.063125,0,2,4,47557,2
1,0.936889,0.040909,-0.956397,-2.050813,0.008324,-1.078639,-0.063125,0,2,2,28324,2
2,-0.045096,-0.684438,-1.964085,-0.190188,-1.279724,1.372813,1.220767,0,2,2,19037,2


In [18]:
# Number of embeddings required for each categorical variable
for col in df_records_categorical.columns:
    print(col, ':', len(df_records_categorical[col].unique()))

item_id : 1378
category : 7
cup_size : 12
user_id : 47958
fit : 3


### Saving the pre-processed data

In [19]:
# Create random train-validation-test splits
# 80-10-10 as in the paper
from sklearn.model_selection import train_test_split
df_train, df_val_test = train_test_split(df_records, random_state=10, test_size=0.2)
df_val, df_test = train_test_split(df_val_test, random_state=10, test_size=0.5)

In [20]:
save_file = '../data/modcloth_final_data_processed'

df_train.to_json(save_file+'_train.jsonl', orient='records', lines=True)
df_val.to_json(save_file+'_valid.jsonl', orient='records', lines=True)
df_test.to_json(save_file+'_test.jsonl', orient='records', lines=True)