# Title and Introduction TBD

<br>

## I. Preparation
***

- Install and import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bayes_opt import BayesianOptimization

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import cross_val_score

import xgboost as xgb
from xgboost import XGBClassifier as xgbc, cv

import warnings

# Show all columns
pd.set_option('display.max_columns', None)

# Ignore all warnings
warnings.filterwarnings("ignore")

- Read datasets

In [2]:
df = pd.read_csv("../Data/airplane_train_processed_normalized.csv")
df_val = pd.read_csv("../Data/airplane_test_processed_normalized.csv")


df = df.head(1000)
df_val = df_val.head(1000)


#Removing the highly correlated columns like 'cat__Gender_Male','cat__Customer Type_disloyal Customer',
#'cat__Type of Travel_Personal Travel','cat__Class_Eco',
df = df[['Gender_Female', 'Customer Type_Loyal Customer',
         'Type of Travel_Business travel',
         'Type of Travel_Personal Travel', 'Class_Business',
         'Class_Eco', 'Age',
         'Flight Distance', 'Departure Delay in Minutes',
         'Arrival Delay in Minutes', 'Inflight wifi service',
         'Departure/Arrival time convenient',
         'Ease of Online booking', 'Gate location',
         'Food and drink', 'Online boarding',
         'Seat comfort', 'Inflight entertainment',
         'On-board service', 'Leg room service',
         'Baggage handling', 'Checkin service',
         'Inflight service', 'Cleanliness',
         'satisfaction']]

df.dropna(inplace=True)

In [3]:
X_train = df[['Gender_Female', 'Customer Type_Loyal Customer',
              'Type of Travel_Business travel',
              'Type of Travel_Personal Travel', 'Class_Business',
              'Class_Eco', 'Age',
              'Flight Distance', 'Departure Delay in Minutes',
              'Arrival Delay in Minutes', 'Inflight wifi service',
              'Departure/Arrival time convenient',
              'Ease of Online booking', 'Gate location',
              'Food and drink', 'Online boarding',
              'Seat comfort', 'Inflight entertainment',
              'On-board service', 'Leg room service',
              'Baggage handling', 'Checkin service',
              'Inflight service', 'Cleanliness',]]

y_train = df['satisfaction'].values

X_test = df_val[['Gender_Female', 'Customer Type_Loyal Customer',
                 'Type of Travel_Business travel',
                 'Type of Travel_Personal Travel', 'Class_Business',
                 'Class_Eco', 'Age',
                 'Flight Distance', 'Departure Delay in Minutes',
                 'Arrival Delay in Minutes', 'Inflight wifi service',
                 'Departure/Arrival time convenient',
                 'Ease of Online booking', 'Gate location',
                 'Food and drink', 'Online boarding',
                 'Seat comfort', 'Inflight entertainment',
                 'On-board service', 'Leg room service',
                 'Baggage handling', 'Checkin service',
                 'Inflight service', 'Cleanliness']]

y_test = df_val['satisfaction'].values

unique_categories, category_counts = np.unique(y_train, return_counts=True)
np.unique(y_train, return_counts=True)

(array(['neutral or dissatisfied', 'satisfied'], dtype=object),
 array([561, 439]))

In [4]:
X_train.head()

Unnamed: 0,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,0.0,1.0,0.0,1.0,0.0,0.0,-1.746468,6.133398,3.258097,2.944439,3.0,4.0,3.0,1.0,5.0,3.0,5.0,5.0,4.0,3.0,4.0,4.0,5.0,5.0
1,0.0,0.0,1.0,0.0,1.0,0.0,-0.951927,5.463832,0.693147,1.94591,3.0,2.0,3.0,3.0,1.0,3.0,1.0,1.0,1.0,5.0,3.0,1.0,4.0,1.0
2,1.0,1.0,1.0,0.0,1.0,0.0,-0.885715,7.041412,0.0,0.0,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,5.0
3,1.0,1.0,1.0,0.0,1.0,0.0,-0.951927,6.33328,2.484907,2.302585,2.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,3.0,1.0,4.0,2.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.431697,5.370638,0.0,0.0,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0


<br>

- Set random seed and the number of folds

In [5]:
SEED = 42
CV = 4

<br>

## II. Bayesian Optimization on Random Forest Classifier
***

- Define the function to be optimized

In [64]:
# Define rfc_cv as Random Forest Classifier returning Cross Validation Score
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    estimator = rfc(n_estimators = n_estimators,
                    min_samples_split = min_samples_split,
                    max_features = max_features,
                    random_state = SEED)
    cval = cross_val_score(estimator, data, targets, scoring = 'accuracy', cv = CV)
    return cval.mean()

- Define parameter bounds

In [65]:
pbounds = {'max_features': (0.1, 0.999),
           'min_samples_split': (2, 25),
           'n_estimators': (10, 250)}

- Create an optimization function for random forest classifier

In [66]:
# Apply Bayesian Optimization on Random Forest parameters
def optimize_rfc(data, targets):
    # To ensure data type (float -> int) for rfc_cv inputs
    def rfc_crossval(n_estimators, min_samples_split, max_features):
        return rfc_cv(n_estimators = int(n_estimators),
                      min_samples_split = int(min_samples_split),
                      max_features = max(min(max_features, 0.999), 1e-3),
                      data = data,
                      targets = targets)
    
    optimizer = BayesianOptimization(f = rfc_crossval,
                                     pbounds = pbounds,
                                     random_state = SEED,
                                     verbose = 2)
    optimizer.maximize(n_iter = 10, init_points = 2)
    
    # Return the best result, and results from each iteration
    return {"max": optimizer.max, "res": optimizer.res}

In [67]:
opt_rfc = optimize_rfc(X_train, y_train)

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.898    [0m | [0m0.4367   [0m | [0m23.87    [0m | [0m185.7    [0m |
| [95m2        [0m | [95m0.904    [0m | [95m0.6382   [0m | [95m5.588    [0m | [95m47.44    [0m |
| [95m3        [0m | [95m0.914    [0m | [95m0.2159   [0m | [95m4.945    [0m | [95m46.46    [0m |
| [0m4        [0m | [0m0.904    [0m | [0m0.2445   [0m | [0m11.18    [0m | [0m30.76    [0m |
| [0m5        [0m | [0m0.907    [0m | [0m0.3924   [0m | [0m13.02    [0m | [0m44.27    [0m |
| [0m6        [0m | [0m0.903    [0m | [0m0.9477   [0m | [0m10.46    [0m | [0m63.11    [0m |
| [0m7        [0m | [0m0.892    [0m | [0m0.9038   [0m | [0m21.81    [0m | [0m109.2    [0m |
| [0m8        [0m | [0m0.91     [0m | [0m0.7293   [0m | [0m5.515    [0m | [0m46.05    [0m |
| [0m9        [0m | [0m0.914    [0m | [0m0.19

In [68]:
print(opt_rfc["max"])

{'target': 0.914, 'params': {'max_features': 0.21586286999767368, 'min_samples_split': 4.945054296028767, 'n_estimators': 46.45664644295064}}


In [70]:
iterations = {}

cols = ["iteration", "target"]
cols.extend(list(pbounds.keys()))

for col in cols:
    iterations[col] = []

iterations = pd.DataFrame(iterations)

for i, res in enumerate(opt_rfc["res"]):
    iterations.loc[i] = [str(i), 
                         res["target"], 
                         max(min(res["params"]["max_features"], 0.999), 1e-3),
                         round(res["params"]["min_samples_split"]), 
                         round(res["params"]["n_estimators"])]
    
display(iterations)

Unnamed: 0,iteration,target,max_features,min_samples_split,n_estimators
0,0,0.898,0.436712,24,186
1,1,0.904,0.638194,6,47
2,2,0.914,0.215863,5,46
3,3,0.904,0.244545,11,31
4,4,0.907,0.392384,13,44
5,5,0.903,0.947749,10,63
6,6,0.892,0.903793,22,109
7,7,0.91,0.729259,6,46
8,8,0.914,0.194518,4,46
9,9,0.913,0.154141,5,44


<br>

## III. Bayesian Optimization on XGB Classifier
***

- Define the function to be optimized

In [71]:
# Define xgb_cv as XGB Classifier returning Cross Validation Score
def xgb_cv(max_depth, alpha, learning_rate, data, targets):
    le = LabelEncoder()
    target_ = le.fit_transform(targets)
    data_dmatrix = xgb.DMatrix(data = data, label = target_)

    params = {'objective':'binary:logistic',
              'max_depth': max_depth,
              'alpha': alpha,
              'learning_rate': learning_rate}

    xgb_cv_score = cv(dtrain = data_dmatrix, 
                      params = params, 
                      nfold = CV,
                      num_boost_round = 50, 
                      early_stopping_rounds = 10, 
                      metrics = "auc", 
                      as_pandas = True, 
                      seed = SEED)

    return xgb_cv_score["test-auc-mean"].mean()

- Define parameter bounds

In [72]:
pbounds = {'alpha': (1, 180), 
           'learning_rate': (0.01, 1.0), 
           'max_depth': (10, 250)}

- Create an optimization function for XGB classifier

In [73]:
# Apply Bayesian Optimization on Random Forest parameters
def optimize_xgb(data, targets):
    # To ensure data type (float -> int) for xgb_cv inputs
    def xgb_crossval(max_depth, alpha, learning_rate):
        return xgb_cv(max_depth = int(max_depth),
                      alpha = int(alpha),
                      learning_rate = learning_rate,
                      data = data,
                      targets = targets)
    
    optimizer = BayesianOptimization(f = xgb_crossval,
                                     pbounds = pbounds,
                                     random_state = SEED,
                                     verbose = 2)
    
    optimizer.maximize(n_iter = 10, init_points = 2)
    
    # Return the best result, and results from each iteration
    return {"max": optimizer.max, "res": optimizer.res}

In [74]:
opt_xgb = optimize_xgb(X_train, y_train)

|   iter    |  target   |   alpha   | learni... | max_depth |
-------------------------------------------------------------
| [0m1        [0m | [0m0.8872   [0m | [0m68.04    [0m | [0m0.9512   [0m | [0m185.7    [0m |
| [0m2        [0m | [0m0.8211   [0m | [0m108.2    [0m | [0m0.1645   [0m | [0m47.44    [0m |
| [95m3        [0m | [95m0.8992   [0m | [95m67.55    [0m | [95m0.2517   [0m | [95m185.0    [0m |
| [0m4        [0m | [0m0.8747   [0m | [0m64.6     [0m | [0m0.01     [0m | [0m180.6    [0m |
| [0m5        [0m | [0m0.8924   [0m | [0m69.52    [0m | [0m0.6006   [0m | [0m181.4    [0m |
| [95m6        [0m | [95m0.9016   [0m | [95m61.57    [0m | [95m0.2831   [0m | [95m187.7    [0m |
| [95m7        [0m | [95m0.9062   [0m | [95m57.04    [0m | [95m0.3135   [0m | [95m189.2    [0m |
| [0m8        [0m | [0m0.9017   [0m | [0m59.9     [0m | [0m0.4798   [0m | [0m193.5    [0m |
| [95m9        [0m | [95m0.9139   [0m | 

In [75]:
print(opt_xgb["max"])

{'target': 0.9190325048697815, 'params': {'alpha': 45.967576791022296, 'learning_rate': 0.14712842812756818, 'max_depth': 193.63090589918684}}


In [78]:
iterations = {}

cols = ["iteration", "target"]
cols.extend(list(pbounds.keys()))

for col in cols:
    iterations[col] = []

iterations = pd.DataFrame(iterations)

for i, res in enumerate(opt_xgb["res"]):
    iterations.loc[i] = [str(i), 
                         res["target"], 
                         round(res["params"]["alpha"]), 
                         res["params"]["learning_rate"], 
                         round(int(res["params"]["max_depth"]))]
    
display(iterations)

Unnamed: 0,iteration,target,alpha,learning_rate,max_depth
0,0,0.887209,68,0.951207,185
1,1,0.821119,108,0.164458,47
2,2,0.899234,68,0.251654,184
3,3,0.874678,65,0.01,180
4,4,0.892378,70,0.600593,181
5,5,0.901632,62,0.2831,187
6,6,0.906223,57,0.313514,189
7,7,0.901655,60,0.479798,193
8,8,0.913858,52,0.292192,194
9,9,0.913653,51,0.6984,189
