# Happy Customers Project

## Data Description:

Y = target attribute (Y) with values indicating 0 (unhappy) and 1 (happy) customers <br/>
X1 = my order was delivered on time <br/>
X2 = contents of my order was as I expected <br/>
X3 = I ordered everything I wanted to order <br/>
X4 = I paid a good price for my order <br/>
X5 = I am satisfied with my courier <br/>
X6 = the app makes ordering easy for me <br/>

Attributes X1 to X6 indicate the responses for each question and have values from 1 to 5 where the smaller number indicates less and the higher number indicates more towards the answer.

### Load Libraries

In [1]:
# useful libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

# EDA
from pandas_profiling import ProfileReport

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# train test split
from sklearn.model_selection import train_test_split

# model, hyperparameter search and scoring
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score

### Load Dataset and EDA

In [2]:
# load dataset
data = pd.read_csv("Data/ACME-HappinessSurvey2020.csv")
print(data)

# perform EDA on the whole dataset to understand the data 
profile = ProfileReport(data)
profile.to_widgets()

     Y  X1  X2  X3  X4  X5  X6
0    0   3   3   3   4   2   4
1    0   3   2   3   5   4   3
2    1   5   3   3   3   3   5
3    0   5   4   3   3   3   5
4    0   5   4   3   3   3   5
..  ..  ..  ..  ..  ..  ..  ..
121  1   5   2   3   4   4   3
122  1   5   2   3   4   2   5
123  1   5   3   3   4   4   5
124  0   4   3   3   4   4   5
125  0   5   3   2   5   5   5

[126 rows x 7 columns]


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Feature Selection

In [3]:
# create two dataset: one with the target variable and one with the features
Y = data.iloc[:, 0]
X = data.drop('Y', axis=1)

# define feature selection
k = 4
fs = SelectKBest(score_func=f_classif, k=k)

# apply feature selection
X_selected = fs.fit_transform(X, Y)
X_selected = pd.DataFrame(X_selected)

# print the pvalue of the features and the selected features
pvalues = pd.DataFrame(fs.pvalues_, list(X.columns))
pvalues.columns = ['p-value']
print(pvalues)

selectedFeatures = list(X.columns[ sorted(np.argsort(fs.pvalues_)[:k]) ])
print(f'\nTop {k} Selected Features: {selectedFeatures}') 

# take the indices of the selected features with: sorted(np.argsort(fs.pvalues_)[:-(k - 2)])

     p-value
X1  0.001486
X2  0.787313
X3  0.091807
X4  0.473623
X5  0.011488
X6  0.060568

Top 4 Selected Features: ['X1', 'X3', 'X5', 'X6']


### Hyperparameter tuning of our model

In [4]:
# split the dataset into train test
X_train, X_test, y_train, y_test = train_test_split(X[selectedFeatures], Y, test_size=0.2, random_state=25)

# 
def my_roc_auc_score(model, X, y): return roc_auc_score(y, model.predict_proba(X)[:,1])

# search space for our parameters

params = { 'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
           'subsample': np.arange(0.2, 0.5, 0.02),
           'min_child_weight': np.arange(1, 20, 2),
           'gamma': np.arange(1, 20, 2),
           'max_depth': np.arange(5, 30, 5),
           'colsample_bytree': np.arange(0.1, 0.5, 0.02),
           'n_estimators': [100, 200, 300, 400, 600]}

# create model
xgbr = XGBClassifier(use_label_encoder = False,
                     objective= 'binary:logistic',
                     tree_method = 'hist',
                     eval_metric = 'auc')

# create randomized search
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         scoring = my_roc_auc_score,
                         n_iter=80,
                         verbose=1)

# fit model to find best hyperparameters
clf.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False,
                                           eval_metric='auc', gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monoton...
                                        'gamma': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'learning_rate': [0

### Results of hyperparameter tuning

In [5]:
df_cv_results = pd.DataFrame(clf.cv_results_)
df_cv_results = df_cv_results[["rank_test_score","mean_test_score",
                               "param_learning_rate", 
                               "param_max_depth", 
                               "param_n_estimators", 
                               "param_subsample", 
                               "param_colsample_bytree", 
                               "param_min_child_weight", 
                               "param_gamma"]]
df_cv_results.sort_values(by='rank_test_score', inplace=True)
df_cv_results[:10]

Unnamed: 0,rank_test_score,mean_test_score,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,param_colsample_bytree,param_min_child_weight,param_gamma
71,1,0.728283,0.1,20,600,0.28,0.18,1,3
76,2,0.727273,0.05,5,300,0.26,0.48,1,7
78,3,0.719192,0.2,10,400,0.46,0.16,3,9
7,4,0.710101,0.05,5,300,0.42,0.4,3,7
12,5,0.707071,0.001,15,400,0.42,0.26,1,7
5,6,0.70202,0.01,10,200,0.44,0.1,1,7
21,7,0.685859,0.05,15,600,0.46,0.36,1,5
9,8,0.613131,0.2,5,300,0.4,0.48,1,13
0,9,0.5,0.01,10,100,0.22,0.14,17,3
57,9,0.5,0.1,15,400,0.24,0.26,5,11


### Final Model

In [9]:
model_xgboost_fin = XGBClassifier(objective= 'binary:logistic',

                                  learning_rate=0.1,
                                  max_depth=20,
                                  n_estimators=600,
                                  subsample=0.28,
                                  colsample_bytree=0.18,
                                  min_child_weight=1,
                                  gamma = 3,

                                  eval_metric='auc',
                                  tree_method = 'hist',
                                  verbosity=1,
                                  use_label_encoder=False)



model_xgboost_fin.fit(X_train,
                      y_train,
                      verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.18,
              enable_categorical=False, eval_metric='auc', gamma=3, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=20,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.28, tree_method='hist',
              use_label_encoder=False, validate_parameters=1, verbosity=1)

### Train and Test scores

In [10]:
y_train_pred = model_xgboost_fin.predict_proba(X_train)[:,1]
y_test_pred = model_xgboost_fin.predict_proba(X_test)[:,1]

print("AUC Train: {:.4f}\nAUC Test: {:.4f}".format(roc_auc_score(y_train, y_train_pred),
                                                    roc_auc_score(y_test, y_test_pred)))

AUC Train: 0.7725
AUC Test: 0.4018


### Feature Importance

In [8]:
df_var_imp = pd.DataFrame({"Variable": selectedFeatures,
                           "Importance": model_xgboost_fin.feature_importances_}) \
                        .sort_values(by='Importance', ascending=False)
df_var_imp

Unnamed: 0,Variable,Importance
0,X1,0.531002
1,X3,0.166068
2,X5,0.164227
3,X6,0.138703
