# Bayesian Optimization of Classifier Hyperparameters

*Introduction TBD.*

<br>

## I. Preparation
***

- Install and import libraries

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from bayes_opt import BayesianOptimization

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier as rfc
import xgboost as xgb
from xgboost import XGBClassifier as xgbc

pd.set_option('display.max_columns', None)

- Read datasets

In [14]:
# Load the pre-processed, normalized train and validation splits.
df = pd.read_csv("../Data/airplane_train_processed_normalized.csv")
df_val = pd.read_csv("../Data/airplane_test_processed_normalized.csv")

# Only the first 1000 rows of each split are used from here on.
# NOTE(review): presumably a speed-up for the optimization runs below --
# confirm before reporting results on the full data.
df = df.head(1000)
df_val = df_val.head(1000)

# Keep the modelling features plus the target, leaving out redundant one-hot
# complements such as 'Gender_Male' and 'Customer Type_disloyal Customer'.
# NOTE(review): the original comment also listed 'Type of Travel_Personal Travel'
# and 'Class_Eco' as removed, yet both are still selected below -- confirm
# which is intended.
# dropna() is chained (instead of inplace=True on the column subset) so the
# result is a fresh frame and pandas has no reason to emit a
# SettingWithCopyWarning.
df = df[['Gender_Female', 'Customer Type_Loyal Customer',
         'Type of Travel_Business travel',
         'Type of Travel_Personal Travel', 'Class_Business',
         'Class_Eco', 'Age',
         'Flight Distance', 'Departure Delay in Minutes',
         'Arrival Delay in Minutes', 'Inflight wifi service',
         'Departure/Arrival time convenient',
         'Ease of Online booking', 'Gate location',
         'Food and drink', 'Online boarding',
         'Seat comfort', 'Inflight entertainment',
         'On-board service', 'Leg room service',
         'Baggage handling', 'Checkin service',
         'Inflight service', 'Cleanliness',
         'satisfaction']].dropna()

In [15]:
# Single source of truth for the model inputs, so the train and validation
# matrices cannot drift apart when a column is added or removed.
feature_cols = ['Gender_Female', 'Customer Type_Loyal Customer',
                'Type of Travel_Business travel',
                'Type of Travel_Personal Travel', 'Class_Business',
                'Class_Eco', 'Age',
                'Flight Distance', 'Departure Delay in Minutes',
                'Arrival Delay in Minutes', 'Inflight wifi service',
                'Departure/Arrival time convenient',
                'Ease of Online booking', 'Gate location',
                'Food and drink', 'Online boarding',
                'Seat comfort', 'Inflight entertainment',
                'On-board service', 'Leg room service',
                'Baggage handling', 'Checkin service',
                'Inflight service', 'Cleanliness']

# Training features and labels.
X_train = df[feature_cols]
y_train = df['satisfaction'].values

# Validation features and labels, selected with the same columns.
X_test = df_val[feature_cols]
y_test = df_val['satisfaction'].values

# Class balance of the training labels; computed once and shown as the cell output.
unique_categories, category_counts = np.unique(y_train, return_counts=True)
unique_categories, category_counts

(array(['neutral or dissatisfied', 'satisfied'], dtype=object),
 array([561, 439]))

In [16]:
# Preview the first rows of the training features to sanity-check the column selection.
X_train.head()

Unnamed: 0,Gender_Female,Customer Type_Loyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Age,Flight Distance,Departure Delay in Minutes,Arrival Delay in Minutes,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness
0,0.0,1.0,0.0,1.0,0.0,0.0,-1.746468,6.133398,3.258097,2.944439,3.0,4.0,3.0,1.0,5.0,3.0,5.0,5.0,4.0,3.0,4.0,4.0,5.0,5.0
1,0.0,0.0,1.0,0.0,1.0,0.0,-0.951927,5.463832,0.693147,1.94591,3.0,2.0,3.0,3.0,1.0,3.0,1.0,1.0,1.0,5.0,3.0,1.0,4.0,1.0
2,1.0,1.0,1.0,0.0,1.0,0.0,-0.885715,7.041412,0.0,0.0,2.0,2.0,2.0,2.0,5.0,5.0,5.0,5.0,4.0,3.0,4.0,4.0,4.0,5.0
3,1.0,1.0,1.0,0.0,1.0,0.0,-0.951927,6.33328,2.484907,2.302585,2.0,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,5.0,3.0,1.0,4.0,2.0
4,0.0,1.0,1.0,0.0,1.0,0.0,1.431697,5.370638,0.0,0.0,3.0,3.0,3.0,3.0,4.0,5.0,5.0,3.0,3.0,4.0,4.0,3.0,3.0,3.0


<br>

- Set random seed

In [17]:
# Shared random seed, passed as random_state to the classifier and to the optimizer.
SEED = 42

<br>

## II. Bayesian Optimization on Random Forest Classifier
***

- Define the function to be optimized

In [18]:
# rfc_cv: objective function -- cross-validated accuracy of a Random Forest Classifier
def rfc_cv(n_estimators, min_samples_split, max_features, data, targets):
    """Return the mean cross-validation accuracy of a random forest.

    Parameters
    ----------
    n_estimators, min_samples_split, max_features
        Forwarded unchanged to the random forest constructor.
    data
        Feature matrix handed to ``cross_val_score``.
    targets
        Labels aligned with ``data``.

    Returns
    -------
    float
        Mean of the accuracy scores over the 4 cross-validation folds.
    """
    forest_kwargs = {
        'n_estimators': n_estimators,
        'min_samples_split': min_samples_split,
        'max_features': max_features,
        'random_state': SEED,  # fixed seed so repeated evaluations are comparable
    }
    forest = rfc(**forest_kwargs)
    fold_scores = cross_val_score(forest, data, targets, scoring='accuracy', cv=4)
    return fold_scores.mean()

- Define parameter bounds

In [19]:
# Search space for the optimizer: one (lower, upper) interval per hyperparameter.
pbounds = dict(
    n_estimators=(10, 250),
    min_samples_split=(2, 25),
    max_features=(0.1, 0.999),
)

- Create an optimization function for random forest classifier

In [33]:
# Apply Bayesian Optimization on Random Forest parameters
def optimize_rfc(data, targets, init_points=2, n_iter=10):
    """Tune the random forest over the ``pbounds`` search space.

    Parameters
    ----------
    data
        Feature matrix used for cross-validation.
    targets
        Labels aligned with ``data``.
    init_points : int, optional
        Number of initial exploration points passed to ``maximize``
        (default 2, the value previously hard-coded).
    n_iter : int, optional
        Number of optimization iterations passed to ``maximize``
        (default 10, the value previously hard-coded).

    Returns
    -------
    dict
        ``"max"``: the optimizer's best record; ``"res"``: the records of all
        evaluated points. The parameters in these records are the raw floats
        proposed by the optimizer; the model itself is fitted with
        ``n_estimators`` and ``min_samples_split`` truncated to ``int``.
    """
    def rfc_crossval(n_estimators, min_samples_split, max_features):
        # The optimizer proposes floats: truncate the two count-like parameters
        # to int and clamp max_features into [1e-3, 0.999] before scoring.
        return rfc_cv(n_estimators=int(n_estimators),
                      min_samples_split=int(min_samples_split),
                      max_features=max(min(max_features, 0.999), 1e-3),
                      data=data,
                      targets=targets)

    optimizer = BayesianOptimization(f=rfc_crossval,
                                     pbounds=pbounds,
                                     random_state=SEED,
                                     verbose=2)
    optimizer.maximize(init_points=init_points, n_iter=n_iter)

    # Return the best result, and results from each iteration
    return {"max": optimizer.max, "res": optimizer.res}

In [34]:
# Run the search on the training split; with verbose=2 the optimizer prints a progress table.
opt = optimize_rfc(X_train, y_train)

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m0.898    [0m | [0m0.4367   [0m | [0m23.87    [0m | [0m185.7    [0m |
| [95m2        [0m | [95m0.904    [0m | [95m0.6382   [0m | [95m5.588    [0m | [95m47.44    [0m |
| [95m3        [0m | [95m0.914    [0m | [95m0.2159   [0m | [95m4.945    [0m | [95m46.46    [0m |
| [0m4        [0m | [0m0.904    [0m | [0m0.2445   [0m | [0m11.18    [0m | [0m30.76    [0m |
| [0m5        [0m | [0m0.907    [0m | [0m0.3924   [0m | [0m13.02    [0m | [0m44.27    [0m |
| [0m6        [0m | [0m0.903    [0m | [0m0.9477   [0m | [0m10.46    [0m | [0m63.11    [0m |
| [0m7        [0m | [0m0.892    [0m | [0m0.9038   [0m | [0m21.81    [0m | [0m109.2    [0m |
| [0m8        [0m | [0m0.91     [0m | [0m0.7293   [0m | [0m5.515    [0m | [0m46.05    [0m |
| [0m9        [0m | [0m0.914    [0m | [0m0.19

In [35]:
# Best target found and the raw (float) parameters the optimizer proposed for it.
print(opt["max"])

{'target': 0.914, 'params': {'max_features': 0.21586286999767368, 'min_samples_split': 4.945054296028767, 'n_estimators': 46.45664644295064}}


In [36]:
# Print every evaluated point, numbered in the order stored in opt["res"].
for i, res in enumerate(opt["res"]):
    print(f"\nIteration {i}:", res, sep="\n")


Iteration 0:
{'target': 0.898, 'params': {'max_features': 0.4367115668437789, 'min_samples_split': 23.86642904742807, 'n_estimators': 185.67854603473722}}

Iteration 1:
{'target': 0.904, 'params': {'max_features': 0.6381939772931359, 'min_samples_split': 5.58842873017604, 'n_estimators': 47.43868488068863}}

Iteration 2:
{'target': 0.914, 'params': {'max_features': 0.21586286999767368, 'min_samples_split': 4.945054296028767, 'n_estimators': 46.45664644295064}}

Iteration 3:
{'target': 0.904, 'params': {'max_features': 0.2445451182958078, 'min_samples_split': 11.17820526929906, 'n_estimators': 30.763631739876622}}

Iteration 4:
{'target': 0.907, 'params': {'max_features': 0.39238387829300225, 'min_samples_split': 13.019476108193311, 'n_estimators': 44.268769365226845}}

Iteration 5:
{'target': 0.903, 'params': {'max_features': 0.947749331004173, 'min_samples_split': 10.461921704957016, 'n_estimators': 63.110962309161955}}

Iteration 6:
{'target': 0.892, 'params': {'max_features': 0.903

<br>

## III. Bayesian Optimization on XGB Classifier
***

- Define the function to be optimized