
# Predicting Customer Satisfaction on Rent the Runway

## III. Modeling (Neural Network) 
### Katrin Ayrapetov


<font style="font-size: 2rem; color: blue">


 
</font>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, ConfusionMatrixDisplay
import warnings
warnings.filterwarnings('ignore')
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from sklearn import model_selection

In [2]:
from numpy.random import seed
seed(42)

In [3]:
#Import the Data Set 
df =  pd.read_csv('../Data/df_clean.csv')

In [4]:
#Binarize the Predictor Variable: Satisfaction Rating 
# 1: Not Satisfied with Rental (Satisfaction Rating of 1,2,3) 
# 0: Satisfied with Rental (Satisfaction Rating 4,5)
df['Rating'] = np.where(df['Rating'] <= 3, 1, 0)

In [5]:
#We are dealing with an unbalanced class. 
df["Rating"].value_counts(normalize=True)

0    0.833589
1    0.166411
Name: Rating, dtype: float64

In [6]:
#Create a target variable vector. 
y = df.Rating.values

In [7]:
#Because the target class is unbalanced, use the k-fold method to create 5 k-folds. 

#initiate the kfold class from model_selection module
#The folds are made by preserving the percentage of samples for each class.
kf = model_selection.StratifiedKFold(n_splits=5)

#Create a column to hold kfold labels 
df["kfold"] = -1

#Reshuffle the rows. 
df = df.sample(frac=1).reset_index(drop=True)

for f, (t_,v_) in enumerate(kf.split(X=df,y=y)):
    df.loc[v_,"kfold"] = f

In [8]:
#Check that the same number of observations is in each fold. 
df.kfold.value_counts()

0    32266
1    32266
2    32266
3    32266
4    32265
Name: kfold, dtype: int64

In [9]:
#Check that the target variable has the same distribution  the same in each fold. 
for k in range(5):
    print(f"fold: k = {k}")
    print(df[df.kfold==k].Rating.value_counts(normalize=True))

fold: k = 0
0    0.835709
1    0.164291
Name: Rating, dtype: float64
fold: k = 1
0    0.830937
1    0.169063
Name: Rating, dtype: float64
fold: k = 2
0    0.834904
1    0.165096
Name: Rating, dtype: float64
fold: k = 3
0    0.832424
1    0.167576
Name: Rating, dtype: float64
fold: k = 4
0    0.833969
1    0.166031
Name: Rating, dtype: float64


In [10]:
#Drop Unnecessary Columns 
df.drop(columns=["Brand","Dress_Description","Details","Product_details","Retail_price"],inplace=True)
df.dropna(inplace=True)

In [11]:
#Convert the numeric columns to categories split along the 10 percentiles.
num_columns = ['Height', 'Age','Weight','Number_of_reviews','Rent_price']
for column in num_columns: 
    df[f"{column}_binned"] = pd.qcut(x = df[column], q = 10, labels=False, retbins=False, precision=3, duplicates='drop')

In [12]:
df.head()

Unnamed: 0,Label,Size,Overall_fit,Rented_for,Size_usually_worn,Height,Age,Bust_size,Body_type,Weight,...,Sleeves,Neckline,Dress_Style,BMI,kfold,Height_binned,Age_binned,Weight_binned,Number_of_reviews_binned,Rent_price_binned
0,TOP CONTRIBUTOR,10,Overall fit: True to Size,unknown,10,67,50,36C,athletic,185,...,short_sleeves,mock_neckline,hourglass,28.971931,0,5,8,9,3,3
1,TOP CONTRIBUTOR,0,Overall fit: True to Size,Party,0,63,32,32C,straightnarrow,105,...,long_sleeves,v_neckline,wrap,18.597884,0,1,2,0,4,0
2,TOP CONTRIBUTOR,M,unknown,Work,4,68,25,34D,hourglass,150,...,short_sleeves,high_neckline,hourglass,22.804931,0,6,0,6,1,0
3,unknown,unknown,Overall fit: Small,unknown,4,63,49,34C,petite,112,...,short_sleeves,v_neckline,sheath,19.837743,0,1,8,0,0,0
4,unknown,16,unknown,Party,14,63,53,38C,hourglass,180,...,short_sleeves,v_neckline,sheath,31.882086,0,1,9,9,6,4


In [29]:
# This function creates the Neural Network model using Entity embedding. 
def create_model(data, catcols):
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    x = layers.Concatenate()(outputs)
    ## Need to add six numeric features here and not sure how. ### 
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    y = layers.Dense(2, activation="softmax")(x)
    model = Model(inputs=inputs, outputs=y)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [30]:
# This function label encodes categorical features. It fits the training data with a model. 
# then uses the model to make predictions on the submissions data set. 
def neural_network_model(fold):
   
    #list of numeric columns 
    num_cols = ["Height","Age","Weight","Rent_price","Number_of_reviews","BMI"]
    #List of categorical columns 
    #cat_cols = 
    #All columns are features except the target column and the kfold column 
    features = [c for c in df.columns if c not in num_cols and c not in ("kfold", "Rating")]
    all_features = [c for c in df.columns if c not in num_cols and c not in ("kfold", "Rating")]
    
    #convert the categorical variables to strings
    for col in features:
        df.loc[:,col] = df[col].astype(str)  
    #Use label encoder on categorical features. 
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:,feat]=lbl_enc.fit_transform(df[feat].values)
    
    #Hold one of the five folds as validation set and four folds as training sets 
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    model = create_model(df,features)
    
    # Features 
    xtrain = [
     df_train[features].values[:, k] for k in range(len(features))
    ]
    xvalid = [
    df_valid[features].values[:, k] for k in range(len(features))
    ]

    # get target columns
    ytrain = df_train.Rating.values
    yvalid = df_valid.Rating.values
    
    # convert target columns to categories

    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)
    
    # fit the model
    model.fit(xtrain,
    ytrain_cat,
    validation_data=(xvalid, yvalid_cat),
    verbose=0,
    batch_size=32,
    epochs=100)
    
    valid_preds = model.predict(xvalid)[:, 1]
    auc = metrics.roc_auc_score(yvalid, valid_preds)
    
    
    #accuracy = metrics.accuracy_score(df_valid.Rating.values, model.predict(xvalid))
    #precision = metrics.precision_score(df_valid.Rating.values, model.predict(xvalid))
    #recall= metrics.recall_score(df_valid.Rating.values, model.predict(xvalid))
    #Ouput the Result
    print(f"Fold = {fold}, AUC = {auc}")


In [31]:
neural_network_model(0)

Fold = 0, AUC = 0.817506928447949
