
# Predicting Customer Satisfaction on Rent the Runway

## VI. Modeling (Neural Network) 
### Katrin Ayrapetov


<font style="font-size: 2rem; color: blue">


 
</font>

### Overview of the Notebook: 

#### In this notebook a Neural Network is built to classify the data.  
**Feature Variables:** Type_of_Customer, Size, Overall_fit, Rented_for,Size_usually_worn, Height, Age, Bust_size, Body_type,
Weight, Date, Brand, Rent_price, Number_of_reviews,
       BMI, Sleeves, Neckline, Dress_Style, kfold, Height_binned,
       Age_binned, Weight_binned, Rent_price_binned,
       Number_of_reviews_binned, BMI_binned


**Target Variable:**
1: Not Satisfied with Rental (Satisfaction Rating of 1,2,3 stars) 
0: Satisfied with Rental (Satisfaction Rating 4,5 stars)

<br> **For the Neural Network**
<br>&emsp;&emsp;The numerical data was turned into categorical by binning along the 10 percentiles.  
<br>&emsp;&emsp;Entity embedding was used on all the features. 
<br>&emsp;&emsp;An Additional Batch Normalization Layer was added. 


<br> **Metrics for the Network**
<br>&emsp;&emsp; Accuracy:  0.8479
<br>&emsp;&emsp; ROC AUC: 0.81281
<br>&emsp;&emsp; Precision:  0.5581
<br>&emsp;&emsp; Recall:  0.51473


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from sklearn import model_selection
import matplotlib.pyplot as plt

In [2]:
from numpy.random import seed
seed(42)

In [3]:
#Import the Data Set 
df =  pd.read_csv('../Data/df_clean.csv')

In [4]:
#Binarize the Predictor Variable: Satisfaction Rating 
# 1: Not Satisfied with Rental (Satisfaction Rating of 1,2,3) 
# 0: Satisfied with Rental (Satisfaction Rating 4,5)
df['Rating'] = np.where(df['Rating'] <= 3, 1, 0)

In [5]:
#We are dealing with an unbalanced class. 
df["Rating"].value_counts(normalize=True)

0    0.832324
1    0.167676
Name: Rating, dtype: float64

In [6]:
df.head()

Unnamed: 0,Type_of_Customer,Size,Overall_fit,Rented_for,Size_usually_worn,Height,Age,Bust_size,Body_type,Weight,Rating,Date,Brand,Retail_price,Rent_price,Number_of_reviews,BMI,Sleeves,Neckline,Dress_Style
0,TOP CONTRIBUTOR,S,Overall fit: True to Size,Vacation,8,66,31,34C,athletic,142,0,spring,Tory Burch,478,70,33,22.916896,sleeveless,square_neckline,hourglass
1,TOP CONTRIBUTOR,M,Overall fit: Large,Wedding,6,67,33,36C,hourglass,150,0,spring,Tory Burch,478,70,33,23.490755,sleeveless,square_neckline,hourglass
2,TOP CONTRIBUTOR,S,Overall fit: Large,Everyday,6,66,27,34B,pear,140,0,spring,Tory Burch,478,70,33,22.594123,sleeveless,square_neckline,hourglass
3,TOP CONTRIBUTOR,L,Overall fit: True to Size,unknown,12,66,42,36B,pear,181,0,spring,Tory Burch,478,70,33,29.210973,sleeveless,square_neckline,hourglass
4,TOP CONTRIBUTOR,L,Overall fit: True to Size,Everyday,12,70,48,34D,pear,165,0,spring,Tory Burch,478,70,33,23.672449,sleeveless,square_neckline,hourglass


In [9]:
#Create a target variable vector. 
y = df.Rating.values

In [10]:
#Because the target class is unbalanced, use the k-fold method to create 5 k-folds. 

#initiate the kfold class from model_selection module
#The folds are made by preserving the percentage of samples for each class.
kf = model_selection.StratifiedKFold(n_splits=5)

#Create a column to hold kfold labels 
df["kfold"] = -1

#Reshuffle the rows. 
df = df.sample(frac=1).reset_index(drop=True)

for f, (t_,v_) in enumerate(kf.split(X=df,y=y)):
    df.loc[v_,"kfold"] = f

In [11]:
#Check that the same number of observations is in each fold. 
df.kfold.value_counts()

0    31287
1    31287
2    31287
3    31286
4    31286
Name: kfold, dtype: int64

In [12]:
#Check that the target variable has the same distribution  the same in each fold. 
for k in range(5):
    print(f"fold: k = {k}")
    print(df[df.kfold==k].Rating.value_counts(normalize=True))

fold: k = 0
0    0.832454
1    0.167546
Name: Rating, dtype: float64
fold: k = 1
0    0.83124
1    0.16876
Name: Rating, dtype: float64
fold: k = 2
0    0.834212
1    0.165788
Name: Rating, dtype: float64
fold: k = 3
0    0.829732
1    0.170268
Name: Rating, dtype: float64
fold: k = 4
0    0.833983
1    0.166017
Name: Rating, dtype: float64


In [13]:
#Drop Unnecessary Columns 
df.drop(columns=["Retail_price"],inplace=True)
df.dropna(inplace=True)
df = df.reset_index()
df.drop(columns="index",inplace=True)

In [14]:
#Convert the numeric columns to categories split along the 10 percentiles.
num_columns = ['Height', 'Age', 'Weight', 'Rent_price', 'Number_of_reviews', 'BMI']
for column in num_columns: 
    df[f"{column}_binned"] = pd.qcut(x = df[column], q = 10, labels=False, retbins=False, precision=3, duplicates='drop')

In [15]:
# This function creates the Neural Network model using Entity embedding. 
def create_model(data, catcols):
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    # NEED TO ADD SOMETHING FOR NUMERIC FEATURES HERE## 
    # Add numericals to the outputs as tensors 
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    y = layers.Dense(2, activation="softmax")(x)

    model = Model(inputs=inputs, outputs=y)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [16]:
# This function label encodes categorical features. It fits the training data with a model. 
# then uses the model to make predictions on the submissions data set. 
def neural_network_model(fold):
   
    #list of numeric columns 
    num_cols = ['Height', 'Age', 'Weight', 'Rent_price', 'Number_of_reviews', 'BMI']
    cat_cols = ['Type_of_Customer','Size','Overall_fit','Rented_for','Size_usually_worn','Bust_size','Body_type','Date','Brand','Sleeves','Neckline','Dress_Style']
    #All columns are features except the target column and the kfold column 
    features = ['Type_of_Customer', 'Size',  'Rented_for',
       'Size_usually_worn', 'Height', 'Age', 'Bust_size', 'Body_type',
       'Weight', 'Date', 'Brand', 'Rent_price', 'Number_of_reviews',
       'BMI', 'Sleeves', 'Neckline', 'Dress_Style', 
       'Age_binned', 'Weight_binned', 'Rent_price_binned',
       'Number_of_reviews_binned', 'BMI_binned']
    
    
    #convert the categorical variables to strings
    for col in features:
        df.loc[:,col] = df[col].astype(str)  
    #Use label encoder on categorical features. 
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:,feat]=lbl_enc.fit_transform(df[feat].values)
    
        
    #Hold one of the five folds as validation set and four folds as training sets 
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    model = create_model(df,features)
    
    # Features 
    xtrain = [
     df_train[features].values[:, k] for k in range(len(features))
    ]
    xvalid = [
    df_valid[features].values[:, k] for k in range(len(features))
    ]

    # get target columns
    ytrain = df_train.Rating.values
    yvalid = df_valid.Rating.values
    
    # convert target columns to categories

    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)
    
    # fit the model
    model.fit(xtrain,
    ytrain_cat,
    validation_data=(xvalid, yvalid_cat),
    verbose=0,
    batch_size=16,
    epochs=100)
    
    valid_preds = model.predict(xvalid)[:, 1]
    
    train_preds = model.predict(xtrain)[:,1]
    
    auc_valid = metrics.roc_auc_score(yvalid, valid_preds)
    auc_train = metrics.roc_auc_score(ytrain, train_preds)
    
    print(f"Fold = {fold}, AUC_train = {auc_train}, AUC_test = {auc_valid}")
    return ytrain, train_preds, yvalid, valid_preds


In [17]:
ytrain_3, train_preds_3, yvalid_3, valid_preds_3 = neural_network_model(3)

Fold = 3, AUC_train = 0.8650323707759539, AUC_test = 0.8128193921024311


In [18]:
df_train_results_3= pd.DataFrame(data=[ytrain_3, train_preds_3]).T
df_train_results_3.columns=["True_Train_Classes","Pred_Train_Classes"]

In [19]:
df_test_results_3= pd.DataFrame(data=[yvalid_3, valid_preds_3]).T
df_test_results_3.columns=["True_Test_Classes","Pred_Test_Classes"]

In [20]:
df_train_results_3['Pred_Train_Classes_binarized'] = np.where(df_train_results_3['Pred_Train_Classes'] >= 0.15, 1, 0)

In [21]:
df_test_results_3['Pred_Test_Classes_binarized'] = np.where(df_test_results_3['Pred_Test_Classes'] >= 0.15, 1, 0)

In [22]:
#Accuracy 
accuracy_train = metrics.accuracy_score(df_train_results_3.True_Train_Classes.values,
                                  df_train_results_3.Pred_Train_Classes_binarized.values)
accuracy_test = metrics.accuracy_score(df_test_results_3.True_Test_Classes.values,
                                       df_test_results_3.Pred_Test_Classes_binarized.values)

print(f"Training Set Accuracy: {accuracy_train}")
print(f"Validation Set Accuracy: {accuracy_test}")

Training Set Accuracy: 0.8705042869585368
Validation Set Accuracy: 0.847983123441795


In [23]:
#Recall 
recall_train = metrics.recall_score(df_train_results_3.True_Train_Classes.values,
                                  df_train_results_3.Pred_Train_Classes_binarized.values)
recall_test = metrics.recall_score(df_test_results_3.True_Test_Classes.values,
                                       df_test_results_3.Pred_Test_Classes_binarized.values)

print(f"Training Set Recall: {recall_train}")
print(f"Validation Set Recall: {recall_test}")

Training Set Recall: 0.5676218724584988
Validation Set Recall: 0.514736249296039


In [24]:
#Precision
precision_train = metrics.precision_score(df_train_results_3.True_Train_Classes.values,
                                  df_train_results_3.Pred_Train_Classes_binarized.values)
precision_test = metrics.precision_score(df_test_results_3.True_Test_Classes.values,
                                       df_test_results_3.Pred_Test_Classes_binarized.values)

print(f"Training Set Precision: {precision_train}")
print(f"Validation Set Precision: {precision_test}")

Training Set Precision: 0.6233909525560868
Validation Set Precision: 0.5581111337268472
