<img src="https://m1.quebecormedia.com/emp/emp/bender8175858f-7dbd-4927-85c5-85557e800b98_ORIGINAL.jpg?impolicy=crop-resize&x=0&y=0&w=1000&h=745&width=925&height=925" style="float: left; margin: 30px; height: 75px">

# Predicting Customer Satisfaction on Rent the Runway

## III. Modeling
### Katrin Ayrapetov


<font style="font-size: 2rem; color: blue">


 
</font>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, balanced_accuracy_score, ConfusionMatrixDisplay


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [4]:
from sklearn import preprocessing

In [5]:
from sklearn import model_selection

In [6]:
import itertools

In [7]:
df =  pd.read_csv('../Data/df_clean.csv')

In [8]:
#Binarize the Predictor Variable: Satisfaction Rating 
# 1: Not Satisfied with Rental (Satisfaction Rating of 1,2,3) 
# 0: Satisfied with Rental (Satisfaction Rating 4,5)
df['Rating'] = np.where(df['Rating'] <= 3, 1, 0)

In [9]:
#We are dealing with an unbalanced class. 
df["Rating"].value_counts(normalize=True)

0    0.833589
1    0.166411
Name: Rating, dtype: float64

In [10]:
#Create a target variable vector. 
y = df.Rating.values

In [11]:
#initiate the kfold class from model_selection module
#The folds are made by preserving the percentage of samples for each class.
kf = model_selection.StratifiedKFold(n_splits=5)

In [12]:
#Create a column to hold kfold labels 
df["kfold"] = -1

In [13]:
#Reshuffle the rows. 
df = df.sample(frac=1).reset_index(drop=True)

In [14]:
for f, (t_,v_) in enumerate(kf.split(X=df,y=y)):
    df.loc[v_,"kfold"] = f

In [15]:
#Check that the same number of observations is in each fold. 
df.kfold.value_counts()

0    32266
1    32266
2    32266
3    32266
4    32265
Name: kfold, dtype: int64

In [16]:
#Check that the target variable has the same distribution  the same in each fold. 
for k in range(5):
    print(f"fold: k = {k}")
    print(df[df.kfold==k].Rating.value_counts(normalize=True))
    

fold: k = 0
0    0.833137
1    0.166863
Name: Rating, dtype: float64
fold: k = 1
0    0.83385
1    0.16615
Name: Rating, dtype: float64
fold: k = 2
0    0.831432
1    0.168568
Name: Rating, dtype: float64
fold: k = 3
0    0.834687
1    0.165313
Name: Rating, dtype: float64
fold: k = 4
0    0.834837
1    0.165163
Name: Rating, dtype: float64


In [17]:
#Create a set of features 
df.drop(columns=["Brand","Dress_Description"],inplace=True)

In [18]:
df.dropna(inplace=True)

In [19]:
df_train = df[df.kfold != 3].reset_index(drop=True)
df_valid = df[df.kfold == 3].reset_index(drop=True)

In [24]:
df_train.drop(columns=["kfold"],inplace=True)
df_valid.drop(columns=["kfold"],inplace=True)

In [26]:
#Export as an excel file 
df_train.to_csv('../Data/df_train_fashion.csv', header=True, index=False)
df_valid.to_csv('../Data/df_valid_fashion.csv', header=True, index=False)

In [25]:
df =  pd.read_csv('df_with_k_folds.csv')

In [42]:
# This function creates the Neural Network model using Entity embedding. 
def create_model(data, catcols):
    inputs = []
    outputs = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil((num_unique_values)/2), 60))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(45, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    y = layers.Dense(2, activation="softmax")(x)
    model = Model(inputs=inputs, outputs=y)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

In [39]:
"Height","Age","Weight","Retail_price","Rent_price","Number_of_reviews",

('Height', 'Age', 'Weight', 'Retail_price', 'Rent_price', 'Number_of_reviews')

In [40]:
# This function label encodes categorical features. It fits the training data with a model. 
# then uses the model to make predictions on the submissions data set. 
def neural_network_model(fold):
   
    #list of numeric columns 
    num_cols = ["Height","Age","Weight","Retail_price","Rent_price","Number_of_reviews"]
    #List of categorical columns 
    #cat_cols = [c for c in df.columns if c not in num_cols and c not in ("kfold", "Rating")]
    #All columns are features except the target column and the kfold column 
    features = [f for f in df.columns if f not in ("kfold", "Rating")]
    #[f for f in df.columns if f not in ('kfold',"Rating")]
    
    #convert the categorical variables to strings
    for col in features:
        df.loc[:,col] = df[col].astype(str)  
    #Use label encoder on categorical features. 
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:,feat]=lbl_enc.fit_transform(df[feat].values)
    
        
    #Hold one of the five folds as validation set and four folds as training sets 
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    model = create_model(df,features)
    
    # Features 
    xtrain = [
     df_train[features].values[:, k] for k in range(len(features))
    ]
    xvalid = [
    df_valid[features].values[:, k] for k in range(len(features))
    ]

    # get target columns
    ytrain = df_train.Rating.values
    yvalid = df_valid.Rating.values
    
    # convert target columns to categories

    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)
    
    # fit the model
    model.fit(xtrain,
    ytrain_cat,
    validation_data=(xvalid, yvalid_cat),
    verbose=0,
    batch_size=32,
    epochs=100)
    
    valid_preds = model.predict(xvalid)[:, 1]
    auc = metrics.roc_auc_score(yvalid, valid_preds)
    #accuracy = metrics.accuracy_score(df_valid.Rating.values, model.predict(xvalid))
    #precision = metrics.precision_score(df_valid.Rating.values, model.predict(xvalid))
    #recall= metrics.recall_score(df_valid.Rating.values, model.predict(xvalid))
    #Ouput the Result
    print(f"Fold = {fold}, AUC = {auc}")



In [41]:
neural_network_model(0)

Fold = 0, AUC = 0.8146743783699307
