
# Predicting Customer Satisfaction on Rent the Runway

## III. Modeling (Neural Network) 
### Katrin Ayrapetov


<font style="font-size: 2rem; color: blue">


 
</font>

In [81]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from sklearn import model_selection
import matplotlib.pyplot as plt

In [46]:
from numpy.random import seed
seed(42)

In [47]:
#Import the Data Set 
df =  pd.read_csv('../Data/df_clean.csv')

In [48]:
#Binarize the Predictor Variable: Satisfaction Rating 
# 1: Not Satisfied with Rental (Satisfaction Rating of 1,2,3) 
# 0: Satisfied with Rental (Satisfaction Rating 4,5)
df['Rating'] = np.where(df['Rating'] <= 3, 1, 0)

In [49]:
#We are dealing with an unbalanced class. 
df["Rating"].value_counts(normalize=True)

0    0.832324
1    0.167676
Name: Rating, dtype: float64

In [50]:
df.head()

Unnamed: 0,Type_of_Customer,Size,Overall_fit,Rented_for,Size_usually_worn,Height,Age,Bust_size,Body_type,Weight,Rating,Date,Brand,Retail_price,Rent_price,Number_of_reviews,BMI,Sleeves,Neckline,Dress_Style
0,TOP CONTRIBUTOR,S,Overall fit: True to Size,Vacation,8,66,31,34C,athletic,142,0,spring,Tory Burch,478,70,33,22.916896,sleeveless,square_neckline,hourglass
1,TOP CONTRIBUTOR,M,Overall fit: Large,Wedding,6,67,33,36C,hourglass,150,0,spring,Tory Burch,478,70,33,23.490755,sleeveless,square_neckline,hourglass
2,TOP CONTRIBUTOR,S,Overall fit: Large,Everyday,6,66,27,34B,pear,140,0,spring,Tory Burch,478,70,33,22.594123,sleeveless,square_neckline,hourglass
3,TOP CONTRIBUTOR,L,Overall fit: True to Size,unknown,12,66,42,36B,pear,181,0,spring,Tory Burch,478,70,33,29.210973,sleeveless,square_neckline,hourglass
4,TOP CONTRIBUTOR,L,Overall fit: True to Size,Everyday,12,70,48,34D,pear,165,0,spring,Tory Burch,478,70,33,23.672449,sleeveless,square_neckline,hourglass


In [51]:
#Create a target variable vector. 
y = df.Rating.values

In [52]:
#Because the target class is unbalanced, use the k-fold method to create 5 k-folds. 

#initiate the kfold class from model_selection module
#The folds are made by preserving the percentage of samples for each class.
kf = model_selection.StratifiedKFold(n_splits=5)

#Create a column to hold kfold labels 
df["kfold"] = -1

#Reshuffle the rows. 
df = df.sample(frac=1).reset_index(drop=True)

for f, (t_,v_) in enumerate(kf.split(X=df,y=y)):
    df.loc[v_,"kfold"] = f

In [53]:
#Check that the same number of observations is in each fold. 
df.kfold.value_counts()

0    31287
1    31287
2    31287
3    31286
4    31286
Name: kfold, dtype: int64

In [54]:
#Check that the target variable has the same distribution  the same in each fold. 
for k in range(5):
    print(f"fold: k = {k}")
    print(df[df.kfold==k].Rating.value_counts(normalize=True))

fold: k = 0
0    0.832007
1    0.167993
Name: Rating, dtype: float64
fold: k = 1
0    0.831464
1    0.168536
Name: Rating, dtype: float64
fold: k = 2
0    0.832934
1    0.167066
Name: Rating, dtype: float64
fold: k = 3
0    0.833376
1    0.166624
Name: Rating, dtype: float64
fold: k = 4
0    0.831842
1    0.168158
Name: Rating, dtype: float64


In [55]:
#Drop Unnecessary Columns 
df.drop(columns=["Retail_price"],inplace=True)
df.dropna(inplace=True)

In [56]:
df = df.reset_index()
df.drop(columns="index",inplace=True)

In [57]:
#Convert the numeric columns to categories split along the 10 percentiles.
num_columns = ['Height', 'Age','Weight','Number_of_reviews','Rent_price']
for column in num_columns: 
    df[f"{column}_binned"] = pd.qcut(x = df[column], q = 10, labels=False, retbins=False, precision=3, duplicates='drop')

In [58]:
df.head()

Unnamed: 0,Type_of_Customer,Size,Overall_fit,Rented_for,Size_usually_worn,Height,Age,Bust_size,Body_type,Weight,...,BMI,Sleeves,Neckline,Dress_Style,kfold,Height_binned,Age_binned,Weight_binned,Number_of_reviews_binned,Rent_price_binned
0,TOP CONTRIBUTOR,10,Overall fit: True to Size,Work,8,64,52,36C,pear,158,...,27.117676,short_sleeves,crew_neckline,hourglass,0,2,9,7,8,6
1,TOP CONTRIBUTOR,2,Overall fit: True to Size,unknown,2,69,36,34DD,hourglass,130,...,19.195547,cap_sleeves,crew_neckline,sheath,0,7,4,2,1,3
2,unknown,10,Overall fit: True to Size,Formal Affair,10,67,27,34B,pear,180,...,28.188906,sleeveless,v_neckline,sleeveless,0,5,0,9,9,4
3,TOP CONTRIBUTOR,0,Overall fit: True to Size,Work,2,64,30,34C,unknown,120,...,20.595703,sleeveless,v_neckline,sleeveless,0,2,1,1,9,1
4,TOP CONTRIBUTOR,0,Overall fit: Large,Party,0,66,33,30C,hourglass,115,...,18.559458,short_sleeves,off_shoulder,sheath,0,4,3,0,1,0


In [61]:
# This function creates the Neural Network model using Entity embedding. 
def create_model(data, catcols):
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    y = layers.Dense(2, activation="softmax")(x)
    model = Model(inputs=inputs, outputs=y)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [64]:
# This function label encodes categorical features. It fits the training data with a model. 
# then uses the model to make predictions on the submissions data set. 
def neural_network_model(fold):
   
    #list of numeric columns 
    num_cols = ["Height","Age","Weight","Rent_price","Number_of_reviews","BMI"]
    #List of categorical columns 
    #cat_cols = [c for c in df.columns if c not in num_cols and c not in ("kfold", "Rating")]
    #All columns are features except the target column and the kfold column 
    features = [c for c in df.columns if c not in num_cols and c not in ("kfold", "Rating")]

    
    #convert the categorical variables to strings
    for col in features:
        df.loc[:,col] = df[col].astype(str)  
    #Use label encoder on categorical features. 
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:,feat]=lbl_enc.fit_transform(df[feat].values)
    
        
    #Hold one of the five folds as validation set and four folds as training sets 
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    model = create_model(df,features)
    
    # Features 
    xtrain = [
     df_train[features].values[:, k] for k in range(len(features))
    ]
    xvalid = [
    df_valid[features].values[:, k] for k in range(len(features))
    ]

    # get target columns
    ytrain = df_train.Rating.values
    yvalid = df_valid.Rating.values
    
    # convert target columns to categories

    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)
    
    # fit the model
    model.fit(xtrain,
    ytrain_cat,
    validation_data=(xvalid, yvalid_cat),
    verbose=0,
    batch_size=32,
    epochs=100)
    
    valid_preds = model.predict(xvalid)[:, 1]
    
    train_preds = model.predict(xtrain)[:,1]
    
    auc_valid = metrics.roc_auc_score(yvalid, valid_preds)
    auc_train = metrics.roc_auc_score(ytrain, train_preds)
    
    print(f"Fold = {fold}, AUC_train = {auc_train}, AUC_test = {auc_valid}")
    return ytrain, train_preds, yvalid, valid_preds


In [122]:
ytrain_3, train_preds_3, yvalid_3, valid_preds_3 = neural_network_model(3)

Fold = 3, AUC_train = 0.8512947184044364, AUC_test = 0.8182177437753548


In [123]:
df_train_results_3= pd.DataFrame(data=[ytrain_3, train_preds_3, yvalid_3, valid_preds_3]).T
df_train_results_3.columns=["True_Train_Classes","Pred_Train_Classes","True_Test_Classes","Pred_Test_Classes"]

In [124]:
df_train_results_3['Pred_Train_Classes_binarized'] = np.where(df_train_results_3['Pred_Train_Classes'] >= 0.5, 1, 0)

In [125]:
df_train_results_3['Pred_Test_Classes_binarized'] = np.where(df_train_results_3['Pred_Test_Classes'] >= 0.5, 1, 0)

In [126]:
df_train_results_3.head(5)

Unnamed: 0,True_Train_Classes,Pred_Train_Classes,True_Test_Classes,Pred_Test_Classes,Pred_Train_Classes_binarized,Pred_Test_Classes_binarized
0,0.0,0.010143,0.0,0.023038,0,0
1,0.0,0.011305,0.0,0.006041,0,0
2,0.0,0.000634,0.0,0.006839,0,0
3,0.0,0.002939,1.0,0.046473,0,0
4,1.0,0.292178,0.0,0.00093,0,0


In [127]:
accuracy_train = metrics.accuracy_score(df_train_results_3.True_Train_Classes.values,
                                  df_train_results_3.Pred_Train_Classes_binarized.values)
accuracy_test = metrics.accuracy_score(df_train_results_3.True_Test_Classes.values,
                                       df_train_results_3.Pred_Test_Classes_binarized.values)

0.8723181538510711


In [None]:
print(f"Training Set Accuracy: {accuracy_train}")
print(f"Test Set Accuracy: {test_train}")

In [None]:
recall_train = metrics.recall_score(df_train_results_3.True_Train_Classes.values,
                                  df_train_results_3.Pred_Train_Classes_binarized.values)
recall_test = metrics.recall_score(df_train_results_3.True_Test_Classes.values,
                                       df_train_results_3.Pred_Test_Classes_binarized.values)

In [None]:
print(f"Training Set Recall: {recall_train}")
print(f"Test Set Recall: {recall_test}")

In [None]:
precision_train = metrics.precision_score(df_train_results_3.True_Train_Classes.values,
                                  df_train_results_3.Pred_Train_Classes_binarized.values)
precision_test = metrics.precision_score(df_train_results_3.True_Test_Classes.values,
                                       df_train_results_3.Pred_Test_Classes_binarized.values)