# Happy Customers Project

## Data Description:

Y = target attribute (Y) with values indicating 0 (unhappy) and 1 (happy) customers <br/>
X1 = my order was delivered on time <br/>
X2 = contents of my order was as I expected <br/>
X3 = I ordered everything I wanted to order <br/>
X4 = I paid a good price for my order <br/>
X5 = I am satisfied with my courier <br/>
X6 = the app makes ordering easy for me <br/>

Attributes X1 to X6 indicate the responses for each question and have values from 1 to 5 where the smaller number indicates less and the higher number indicates more towards the answer.

### Load Libraries

In [1]:
# useful libraries
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np

# EDA
from pandas_profiling import ProfileReport

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# train test split
from sklearn.model_selection import train_test_split

# model, hyperparameter search and scoring
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, f1_score

### Load Dataset and EDA

In [2]:
# load dataset
data = pd.read_csv("Data/ACME-HappinessSurvey2020.csv")
print(data)

# perform EDA on the whole dataset to understand the data 
profile = ProfileReport(data)
profile.to_widgets()

     Y  X1  X2  X3  X4  X5  X6
0    0   3   3   3   4   2   4
1    0   3   2   3   5   4   3
2    1   5   3   3   3   3   5
3    0   5   4   3   3   3   5
4    0   5   4   3   3   3   5
..  ..  ..  ..  ..  ..  ..  ..
121  1   5   2   3   4   4   3
122  1   5   2   3   4   2   5
123  1   5   3   3   4   4   5
124  0   4   3   3   4   4   5
125  0   5   3   2   5   5   5

[126 rows x 7 columns]


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### Feature Selection

In [3]:
# create two dataset: one with the target variable and one with the features
Y = data.iloc[:, 0]
X = data.drop('Y', axis=1)

# define feature selection
k = 6
fs = SelectKBest(score_func=f_classif, k=k)

# apply feature selection
X_selected = fs.fit_transform(X, Y)
X_selected = pd.DataFrame(X_selected)

# print the pvalue of the features and the selected features
pvalues = pd.DataFrame(fs.pvalues_, list(X.columns))
pvalues.columns = ['p-value']
print(pvalues)

selectedFeatures = list(X.columns[ sorted(np.argsort(fs.pvalues_)[:k]) ])
print(f'\nTop {k} Selected Features: {selectedFeatures}') 

# take the indices of the selected features with: sorted(np.argsort(fs.pvalues_)[:-(k - 2)])

     p-value
X1  0.001486
X2  0.787313
X3  0.091807
X4  0.473623
X5  0.011488
X6  0.060568

Top 6 Selected Features: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']


### Hyperparameter tuning of our model

In [37]:
# split the dataset into train test
X_train, X_test, y_train, y_test = train_test_split(X[selectedFeatures], Y, test_size=0.30, random_state=25)

# search space for our parameters

params = { 'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.2, 0.5, 0.02),
           'min_child_weight': np.arange(1, 20, 3),
           'gamma': np.arange(1, 20, 1),
           'max_depth': np.arange(5, 30, 3),
           'colsample_bytree': np.arange(0.1, 0.5, 0.02),
           'n_estimators': [50, 100, 200, 300, 400]}

# create model
xgbr = XGBClassifier(objective= 'binary:logistic',
                     use_label_encoder = False,
                     verbosity = 0)

# create randomized search
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         #scoring = 'accuracy',
                         n_iter=80,
                         verbose=1)

# fit model to find best hyperparameters
clf.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


RandomizedSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None...
       0.32, 0.34, 0.36, 0.38, 0.4 , 0.42, 0.44, 0.46, 0.48]),
                                        'gamma': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]

### Results of hyperparameter tuning

In [38]:
df_cv_results = pd.DataFrame(clf.cv_results_)
df_cv_results = df_cv_results[["rank_test_score","mean_test_score",
                               "param_learning_rate", 
                               "param_max_depth", 
                               "param_n_estimators", 
                               "param_subsample", 
                               "param_colsample_bytree", 
                               "param_min_child_weight", 
                               "param_gamma"]]
df_cv_results.sort_values(by='rank_test_score', inplace=True)
df_cv_results[:10]

Unnamed: 0,rank_test_score,mean_test_score,param_learning_rate,param_max_depth,param_n_estimators,param_subsample,param_colsample_bytree,param_min_child_weight,param_gamma
59,1,0.702614,0.1,29,300,0.44,0.36,1,5
8,2,0.669281,0.3,11,300,0.36,0.2,1,1
32,3,0.647059,0.1,17,300,0.44,0.3,1,7
13,4,0.601961,0.3,8,200,0.46,0.3,1,13
11,5,0.580392,0.2,29,300,0.32,0.18,1,9
31,6,0.568627,0.01,11,400,0.4,0.34,7,15
34,6,0.568627,0.3,23,400,0.2,0.32,1,6
37,6,0.568627,0.001,14,300,0.34,0.46,7,1
38,6,0.568627,0.3,29,300,0.38,0.26,7,10
78,6,0.568627,0.1,5,300,0.34,0.44,4,1


### Final Model

In [39]:
model_xgboost_fin = XGBClassifier(objective= 'binary:logistic',

                                  learning_rate=0.1,
                                  max_depth=29,
                                  n_estimators=300,
                                  subsample=0.44,
                                  colsample_bytree=0.36,
                                  min_child_weight=1,
                                  gamma = 5,

                                  verbosity=0,
                                  use_label_encoder=False)



model_xgboost_fin.fit(X_train,
                      y_train,
                      verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.36,
              enable_categorical=False, gamma=5, gpu_id=-1,
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=29,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=12, num_parallel_tree=1,
              predictor='auto', random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=0.44, tree_method='exact',
              use_label_encoder=False, validate_parameters=1, verbosity=0)

### Train and Test scores

In [40]:
def train_test_scores(model, mode, X, y):
    predictions = model.predict(X)

    cm = confusion_matrix(y, predictions)

    TN, FP, FN, TP = confusion_matrix(y, predictions).ravel()

    print('True Positive(TP)  = ', TP)
    print('False Positive(FP) = ', FP)
    print('True Negative(TN)  = ', TN)
    print('False Negative(FN) = ', FN)

    accuracy =  (TP+TN) /(TP+FP+TN+FN)

    print(f'{mode} Accuracy = {accuracy:.3f}\n')

train_test_scores(model_xgboost_fin, 'Train', X_train, y_train)
train_test_scores(model_xgboost_fin, 'Test', X_test, y_test)

[0 0 1 1 0 1 0 1 1 0 0 1 0 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0 1 0 1 0 1 1 0 0
 0 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1 1 1 0 1 0 1 1 0 0 1
 0 0 1 0 0 1 0 1 0 1 0 0 1 1]
True Positive(TP)  =  37
False Positive(FP) =  12
True Negative(TN)  =  26
False Negative(FN) =  13
Train Accuracy = 0.716

[1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1
 0]
True Positive(TP)  =  8
False Positive(FP) =  8
True Negative(TN)  =  11
False Negative(FN) =  11
Test Accuracy = 0.500



### Feature Importance

In [8]:
df_var_imp = pd.DataFrame({"Variable": selectedFeatures,
                           "Importance": model_xgboost_fin.feature_importances_}) \
                        .sort_values(by='Importance', ascending=False)
df_var_imp

Unnamed: 0,Variable,Importance
0,X1,1.0
1,X2,0.0
2,X3,0.0
3,X4,0.0
4,X5,0.0
5,X6,0.0
