In [1]:
import re

import numpy as np
import pandas as pd

from random import randint, seed
from scipy.stats import shapiro
from itertools import permutations
from tqdm import tqdm

In [2]:
data_path = '../data/modcloth_final_data.json'

In [3]:
df_records = pd.read_json(data_path, lines=True)

print('Number of records = {}'.format(len(df_records)))
print('-'*40)
print('Attribute List:')
print('-'*40)
df_records.columns = [x.replace(" ","_") for x in df_records.columns]
print(df_records.columns)

Number of records = 82790
----------------------------------------
Attribute List:
----------------------------------------
Index(['item_id', 'waist', 'size', 'quality', 'cup_size', 'hips', 'bra_size',
       'category', 'bust', 'height', 'user_name', 'length', 'fit', 'user_id',
       'shoe_size', 'shoe_width', 'review_summary', 'review_text'],
      dtype='object')


In [128]:
print(df_records.shape)
df_records.head(1)

(82790, 18)


Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,bust,height,user_name,length,fit,user_id,shoe_size,shoe_width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36,5ft 6in,Emily,just right,small,991571,,,,


In [5]:
df_records.isna().sum()

item_id               0
waist             79908
size                  0
quality              68
cup_size           6255
hips              26726
bra_size           6018
category              0
bust              70936
height             1107
user_name             0
length               35
fit                   0
user_id               0
shoe_size         54875
shoe_width        64183
review_summary     6725
review_text        6725
dtype: int64

According to the paper,
- fit : is the target variable
- category, shoe_width, item_id, user_id : are treated as categorical and we learn embeddings for representing them
- quality, size, shoe_size, waist, bust, cup_size, bra_size, hips, height : are treated as regular categorical/numerical variables
- user_name, length, review_text and review_summary : are not considered

### Attribute Types

In [4]:
df_records.drop(['user_name', 'length', 'review_summary', 'review_text'], axis=1, inplace=True)
df_records.dtypes

item_id         int64
waist         float64
size            int64
quality       float64
cup_size       object
hips          float64
bra_size      float64
category       object
bust           object
height         object
fit            object
user_id         int64
shoe_size     float64
shoe_width     object
dtype: object

On inspection, 
- `fit` and `category` have no NaNs
- `bust` has too many NaNs, but we can try to calculate it
- `cup_size`, `bra_size` and `height` have some missing values (NaNs) --> try data imputation 
- `waist` and `shoe_width` has a lot of missing values, but we can try to estimate them using conditional distribution

### Missing Data Imputation
Let's check the values

In [7]:
df_records.head()

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,bust,height,fit,user_id,shoe_size,shoe_width
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,small,991571,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,small,587883,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,small,395665,9.0,
3,123373,,21,5.0,dd/e,,,new,,,fit,875643,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,small,944840,,


In [8]:
df_records.dtypes

item_id         int64
waist         float64
size            int64
quality       float64
cup_size       object
hips          float64
bra_size      float64
category       object
bust           object
height         object
fit            object
user_id         int64
shoe_size     float64
shoe_width     object
dtype: object

### Normal distribution imputation

In [5]:
def flatten(t):
    return [item for sublist in t for item in sublist]

In [8]:
def shapiro_test(data, quantity, values, n, encoder=None, decoder=None):
    if encoder:
        data = [encoder[i] for i in data]
        values = [encoder[i] for i in values]
    chosen = []
    chunks = [quantity[i:i+n] for i in range(0, len(quantity), n)]
    for chunk in chunks:
        max = 0
        buffer = []
        choice = []
        for perm in permutations(values, len(chunk)):
            buffer = flatten([[perm[i]] * chunk[i] for i in range(len(chunk))])
            stats = shapiro(data + buffer).statistic
            if max < stats:
                max = stats
                choice = buffer
        # print(max)
        chosen += choice
        data += choice
    if decoder:
        chosen = [decoder[i] for i in chosen]
    print(max)
    return chosen

In [9]:
df_records['cup_size'] = df_records['cup_size'].replace('dd/e','e').replace('ddd/f','f').replace('dddd/g','g')
encoder = {"aa":0, "a":2, "b":4, "c":6, "d":8, "e":10, "f":12, "g":14, "h":16, "i":18, "j":20, "k":22}
decoder = {0:"aa", 2:"a", 4:"b", 6:"c", 8:"d", 10:"e", 12:"f", 14:"g", 16:"h", 18:"i", 20:"j", 22:"k"}

normal_columns = ['cup_size', 'hips', 'bra_size']
for col in normal_columns:
    test_subjects = df_records[df_records[col].isnull()]['user_id'].value_counts()
    natives = df_records[df_records[col].notnull()][['user_id',col]]
    data = df_records[df_records[col].notnull()][col]
    uniq = data.unique()

    if col == 'cup_size':
        shap = shapiro_test(list(data), [test_subjects[i] for i in test_subjects[test_subjects > 1].keys()], uniq, 1, encoder, decoder)
    else:
        shap = shapiro_test(list(data), [test_subjects[i] for i in test_subjects[test_subjects > 1].keys()], uniq, 1)

    dicto = {idx: shap[i] for i, idx in enumerate(test_subjects[test_subjects > 1].keys())}
    dicto.update({idx: uniq[i%len(uniq)] for i, idx in enumerate(test_subjects[test_subjects <= 1].keys())})
    dicto.update({natives['user_id'][i]: natives[col][i] for i in natives.index})
    change = [dicto[idx] for idx in df_records['user_id']]
    df_records[col] = change



0.9423934817314148




0.9557841420173645




0.9204141497612


### Median imputation

In [10]:
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
frequent_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [11]:
numeric_columns = ['quality']
string_columns = ['height']

for col in numeric_columns:
    df_records[col] = median_imputer.fit_transform(np.array(df_records[col]).reshape(-1,1)).squeeze()
    
for col in string_columns:
    df_records[col] = frequent_imputer.fit_transform(np.array(df_records[col]).reshape(-1,1)).squeeze()

df_records.isna().sum()

item_id           0
waist         79908
size              0
quality           0
cup_size          0
hips              0
bra_size          0
category          0
bust          70936
height            0
fit               0
user_id           0
shoe_size     54875
shoe_width    64183
dtype: int64

In [117]:
df_records.head()

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,bust,height,fit,user_id,shoe_size,shoe_width
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,small,991571,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,small,587883,,
2,123373,30.0,7,2.0,b,30.0,32.0,new,,5ft 7in,small,395665,9.0,
3,123373,,21,5.0,e,42.0,46.0,new,,5ft 4in,fit,875643,,
4,123373,,18,5.0,b,40.0,36.0,new,,5ft 2in,small,944840,,


### Calculating bust
Looks like bust measurements are being calculated in the US style, to find correlation I've used this pick:

![alt text](table.png "Title")

In [12]:
df_records["bra_size"].unique()

array([34., 36., 32., 46., 38., 42., 40., 44., 30., 48., 28.])

In [13]:
df_records["cup_size"].unique()

array(['d', 'b', 'e', 'c', 'f', 'g', 'i', 'a', 'k', 'h', 'aa', 'j'],
      dtype=object)

In [14]:
def bust_calculation(cup_size, bra_size):
    dicter = {"aa":0, "a":2, "b":4, "c":6, "d":8, "e":10, "f":12, "g":14, "h":16, "i":18, "j":20, "k":22}
    return [71 + (int(bra_size[i]) - 28)*2.5 + dicter[cup_size[i]] for i in range(len(cup_size))]

In [15]:
df_records['bust'] = bust_calculation(df_records['cup_size'], df_records['bra_size'])

seed(0)
# natural_diff = [round(uniform(-0.99, 0.99), 2) for i in range(len(df_records))]
diff = {idx: randint(-1,1)/2 for idx in df_records['user_id'].unique()}
natural_diff = [diff[idx] for idx in df_records['user_id']]
df_records['bust'] = df_records['bust'] + natural_diff

In [16]:
df_records.isna().sum()

item_id           0
waist         79908
size              0
quality           0
cup_size          0
hips              0
bra_size          0
category          0
bust              0
height            0
fit               0
user_id           0
shoe_size     54875
shoe_width    64183
dtype: int64

In [13]:
df_records.head()

Unnamed: 0,item_id,waist,size,quality,cup_size,hips,bra_size,category,bust,height,fit,user_id,shoe_size,shoe_width
0,123373,29.0,7,5.0,d,38.0,34.0,new,94.0,5ft 6in,small,991571,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,95.0,5ft 2in,small,587883,,
2,123373,30.0,7,2.0,b,39.0,32.0,new,84.5,5ft 7in,small,395665,9.0,
3,123373,,21,5.0,e,39.0,36.0,new,101.0,5ft 4in,fit,875643,,
4,123373,,18,5.0,b,39.0,36.0,new,95.5,5ft 2in,small,944840,,


### Converting height to numeric column

In [17]:
def get_float_height(height):
    """
    Convert string height to float (metres)
    """
    
    output = height.split(' ')
    if len(output) == 2:
        feet, inches = output[0], output[1]
    else:
        feet, inches = output[0], "0in"
    feet = float(re.findall('\d+', feet)[0])
    inches = float(re.findall('\d+', inches)[0])
    
    metres = (feet * 0.3048) + (inches * 0.0254)
    
    return metres

In [18]:
df_records['height'] = df_records['height'].apply(get_float_height)

### Normalize Numeric Features

In [140]:
from sklearn.preprocessing import StandardScaler

numeric_columns = ['waist', 'size', 'quality', 'hips', 'bra_size', 'height', 'bust', 'shoe_size']
scaler = StandardScaler()
df_records_numeric = pd.DataFrame(scaler.fit_transform(df_records.loc[:, numeric_columns]), 
                                  columns=numeric_columns)
df_records_numeric.head()

Unnamed: 0,waist,size,quality,hips,bra_size,height,bust,shoe_size
0,-0.43743,-0.684438,1.058979,-0.454521,-0.596667,0.306343,-0.46001,
1,-0.060209,0.040909,-0.956397,-1.678629,-0.011992,-1.1039,-0.359882,
2,-0.248819,-0.684438,-1.964085,-1.678629,-1.181343,0.658904,-1.41122,0.639317
3,,1.008039,1.058979,0.157532,2.911388,-0.398778,2.744066,
4,,0.645365,1.058979,-0.148495,-0.011992,-1.1039,-0.309819,


### Convert Categorical Features into Labels

In [141]:
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ['item_id', 'category', 'cup_size', 'user_id', 'fit']
ordinal_enc = OrdinalEncoder()

df_records_categorical = pd.DataFrame(np.array(ordinal_enc.fit_transform(df_records.loc[:, categorical_columns]), 
                                           dtype=np.int64), columns=categorical_columns)
df_records_categorical.head()

Unnamed: 0,item_id,category,cup_size,user_id,fit
0,0,2,4,47557,2
1,0,2,2,28324,2
2,0,2,2,19037,2
3,0,2,5,42142,0
4,0,2,2,45350,2


In [142]:
df = pd.concat([df_records_numeric, df_records_categorical], axis=1)
df.head(3)

Unnamed: 0,waist,size,quality,hips,bra_size,height,bust,shoe_size,item_id,category,cup_size,user_id,fit
0,-0.43743,-0.684438,1.058979,-0.454521,-0.596667,0.306343,-0.46001,,0,2,4,47557,2
1,-0.060209,0.040909,-0.956397,-1.678629,-0.011992,-1.1039,-0.359882,,0,2,2,28324,2
2,-0.248819,-0.684438,-1.964085,-1.678629,-1.181343,0.658904,-1.41122,0.639317,0,2,2,19037,2


In [143]:
# Number of embeddings required for each categorical variable
for col in df_records_categorical.columns:
    print(col, ':', len(df_records_categorical[col].unique()))

item_id : 1378
category : 7
cup_size : 12
user_id : 47958
fit : 3


### Saving the pre-processed data

In [18]:
# Create random train-validation-test splits
# 80-10-10 as in the paper
from sklearn.model_selection import train_test_split
df_train, df_val_test = train_test_split(df, random_state=10, test_size=0.2)
df_val, df_test = train_test_split(df_val_test, random_state=10, test_size=0.5)

In [20]:
save_file = '../data/modcloth_final_data_processed'

df_train.to_json(save_file+'_train.jsonl', orient='records', lines=True)
df_val.to_json(save_file+'_valid.jsonl', orient='records', lines=True)
df_test.to_json(save_file+'_test.jsonl', orient='records', lines=True)