# Hyperparameter Optimization - HyperOpt
### Table of Content
I. [**Data Preparation**](#i)<br>
II. [**HyperOpt**](#ii)<br>
<a id = 'top'>

Reference: 
- <a href='https://optuna.readthedocs.io/en/stable/index.html'>Optuna Documentation</a>
- <a href='https://www.kaggle.com/code/hamzaghanmi/xgboost-catboost-using-optuna'>XGBoost & Catboost Using Optuna</a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
#from sklearn.ensemble import RandomForestClassifier
#from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
#from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
df_train = pd.read_csv('../data/Preprocessed_data_with_date/airplane_train_processed_date.csv')
df_val = pd.read_csv('../data/Preprocessed_data_with_date/airplane_test_processed_date.csv')

In [3]:
features = ['Gender_Female', 'Customer Type_Loyal Customer',
       'Type of Travel_Business travel',
       'Type of Travel_Personal Travel', 'Class_Business',
       'Class_Eco', 'Age',
       'Flight Distance', 'Departure Delay in Minutes',
       'Arrival Delay in Minutes', 'Inflight wifi service',
       'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location',
       'Food and drink', 'Online boarding',
       'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service',
       'Inflight service', 'Cleanliness']

le = LabelEncoder()
X_train = df_train[features]
y_train = df_train['satisfaction']
y_train = le.fit_transform(y_train)

X_val = df_val[features]
y_val = df_val['satisfaction']
y_val = le.fit_transform(y_val)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((102825, 24), (25976, 24), (102825,), (25976,))

In [4]:
def objective(trial,data=X_train,target=y_train):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.3,random_state=15)
    param = {
        #'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.012,0.016,0.02]),
        'n_estimators': 1000, #as original model
        'max_depth': trial.suggest_categorical('max_depth', [5,10,15,20,25,30,37,40]),
        'random_state': 15,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    accuracy = accuracy_score(test_y, preds)
    
    return accuracy

In [5]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[32m[I 2023-04-20 14:22:03,340][0m A new study created in memory with name: no-name-70696540-136b-477b-a3f9-e2b59f92bd75[0m




[32m[I 2023-04-20 14:25:16,642][0m Trial 0 finished with value: 0.9487487033195021 and parameters: {'lambda': 0.005430671354336585, 'alpha': 0.05047073135177418, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.012, 'max_depth': 25, 'min_child_weight': 93}. Best is trial 0 with value: 0.9487487033195021.[0m




[32m[I 2023-04-20 14:26:33,644][0m Trial 1 finished with value: 0.9373379149377593 and parameters: {'lambda': 0.03583365124833772, 'alpha': 0.196233005177544, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.016, 'max_depth': 15, 'min_child_weight': 166}. Best is trial 0 with value: 0.9487487033195021.[0m




[32m[I 2023-04-20 14:28:46,275][0m Trial 2 finished with value: 0.9455070020746889 and parameters: {'lambda': 2.0100144842215593, 'alpha': 0.04228051343481473, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 37, 'min_child_weight': 200}. Best is trial 0 with value: 0.9487487033195021.[0m




[32m[I 2023-04-20 14:31:58,252][0m Trial 3 finished with value: 0.9556535269709544 and parameters: {'lambda': 0.08007361760862627, 'alpha': 0.013738294998239114, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 15, 'min_child_weight': 58}. Best is trial 3 with value: 0.9556535269709544.[0m




[32m[I 2023-04-20 14:35:01,339][0m Trial 4 finished with value: 0.9573392116182573 and parameters: {'lambda': 0.3269773458932101, 'alpha': 0.36035825085585776, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 30, 'min_child_weight': 40}. Best is trial 4 with value: 0.9573392116182573.[0m




[32m[I 2023-04-20 14:36:36,623][0m Trial 5 finished with value: 0.9479706950207469 and parameters: {'lambda': 0.011207405714225271, 'alpha': 0.6225985962297583, 'colsample_bytree': 0.6, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 20, 'min_child_weight': 186}. Best is trial 4 with value: 0.9573392116182573.[0m




[32m[I 2023-04-20 14:38:29,435][0m Trial 6 finished with value: 0.9347121369294605 and parameters: {'lambda': 0.0051352578586704315, 'alpha': 0.006042793046879364, 'colsample_bytree': 0.9, 'subsample': 0.5, 'learning_rate': 0.008, 'max_depth': 25, 'min_child_weight': 157}. Best is trial 4 with value: 0.9573392116182573.[0m




[32m[I 2023-04-20 14:39:54,357][0m Trial 7 finished with value: 0.9252139522821576 and parameters: {'lambda': 0.014205250234180705, 'alpha': 1.0204267513990612, 'colsample_bytree': 0.9, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 20, 'min_child_weight': 264}. Best is trial 4 with value: 0.9573392116182573.[0m




[32m[I 2023-04-20 14:41:18,541][0m Trial 8 finished with value: 0.9299144190871369 and parameters: {'lambda': 0.0033197610377063993, 'alpha': 0.016673004087902836, 'colsample_bytree': 0.8, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 37, 'min_child_weight': 232}. Best is trial 4 with value: 0.9573392116182573.[0m




[32m[I 2023-04-20 14:43:27,453][0m Trial 9 finished with value: 0.9355873962655602 and parameters: {'lambda': 0.001738900234829301, 'alpha': 0.005850845267000803, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.008, 'max_depth': 37, 'min_child_weight': 152}. Best is trial 4 with value: 0.9573392116182573.[0m




[32m[I 2023-04-20 14:46:53,630][0m Trial 10 finished with value: 0.9605484958506224 and parameters: {'lambda': 0.526992210668216, 'alpha': 6.455982645041117, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 1}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 14:50:00,320][0m Trial 11 finished with value: 0.9605160788381742 and parameters: {'lambda': 0.41835473478675694, 'alpha': 8.41347475150322, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 2}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 14:52:56,007][0m Trial 12 finished with value: 0.9592842323651453 and parameters: {'lambda': 0.39979135336619315, 'alpha': 9.658802377904474, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 6}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 14:54:42,092][0m Trial 13 finished with value: 0.9495915456431535 and parameters: {'lambda': 7.997512001934952, 'alpha': 8.443862594846507, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 40, 'min_child_weight': 98}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 14:58:09,539][0m Trial 14 finished with value: 0.9600946576763485 and parameters: {'lambda': 0.3397552480091733, 'alpha': 2.7024025101072793, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 7}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 14:59:57,517][0m Trial 15 finished with value: 0.952638744813278 and parameters: {'lambda': 1.2664185597819777, 'alpha': 2.2328338907834424, 'colsample_bytree': 0.7, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_weight': 99}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:01:05,308][0m Trial 16 finished with value: 0.9542271784232366 and parameters: {'lambda': 0.10047382246102932, 'alpha': 2.8328965165440954, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 5, 'min_child_weight': 39}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:02:47,112][0m Trial 17 finished with value: 0.9530601659751037 and parameters: {'lambda': 0.9057301711130104, 'alpha': 0.0014425180469830538, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 68}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:04:11,691][0m Trial 18 finished with value: 0.9416169605809128 and parameters: {'lambda': 0.12373697489712274, 'alpha': 8.45212557933116, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 296}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:05:14,615][0m Trial 19 finished with value: 0.9489432053941909 and parameters: {'lambda': 3.602899892371162, 'alpha': 1.1767720073382437, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 40, 'min_child_weight': 122}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:07:15,199][0m Trial 20 finished with value: 0.960224325726141 and parameters: {'lambda': 0.659126824912145, 'alpha': 0.31279700373735314, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_weight': 2}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:09:04,085][0m Trial 21 finished with value: 0.960191908713693 and parameters: {'lambda': 0.667349881664988, 'alpha': 0.17686275962655978, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_weight': 2}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:10:35,146][0m Trial 22 finished with value: 0.9567557053941909 and parameters: {'lambda': 2.309606863697893, 'alpha': 4.41203413792645, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_weight': 33}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:11:46,413][0m Trial 23 finished with value: 0.9549403526970954 and parameters: {'lambda': 0.24196685992328706, 'alpha': 1.4419444774111132, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 5, 'min_child_weight': 28}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:13:23,081][0m Trial 24 finished with value: 0.9528656639004149 and parameters: {'lambda': 0.6395697263861618, 'alpha': 0.48638920122124296, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_weight': 69}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:15:13,891][0m Trial 25 finished with value: 0.9576309647302904 and parameters: {'lambda': 1.2979984542590128, 'alpha': 4.805560852033028, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 24}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:16:29,648][0m Trial 26 finished with value: 0.9478086099585062 and parameters: {'lambda': 0.20488376182406468, 'alpha': 4.209782409685503, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 126}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:18:03,453][0m Trial 27 finished with value: 0.9526063278008299 and parameters: {'lambda': 0.5331908448512528, 'alpha': 0.8194288066727621, 'colsample_bytree': 0.6, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 10, 'min_child_weight': 56}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:20:57,530][0m Trial 28 finished with value: 0.9585386410788381 and parameters: {'lambda': 0.8477834167718901, 'alpha': 1.5504140886534383, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 30, 'min_child_weight': 18}. Best is trial 10 with value: 0.9605484958506224.[0m




[32m[I 2023-04-20 15:22:22,685][0m Trial 29 finished with value: 0.9457663381742739 and parameters: {'lambda': 0.1653243665254985, 'alpha': 0.30303675681969683, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 25, 'min_child_weight': 81}. Best is trial 10 with value: 0.9605484958506224.[0m


In [6]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,0.948749,2023-04-20 14:22:03.343008,2023-04-20 14:25:16.638427,0 days 00:03:13.295419,0.050471,0.5,0.005431,0.012,25,93,0.7,COMPLETE
1,1,0.937338,2023-04-20 14:25:16.651269,2023-04-20 14:26:33.643578,0 days 00:01:16.992309,0.196233,0.9,0.035834,0.016,15,166,0.4,COMPLETE
2,2,0.945507,2023-04-20 14:26:33.645591,2023-04-20 14:28:46.274849,0 days 00:02:12.629258,0.042281,0.9,2.010014,0.012,37,200,1.0,COMPLETE
3,3,0.955654,2023-04-20 14:28:46.276662,2023-04-20 14:31:58.251962,0 days 00:03:11.975300,0.013738,0.9,0.080074,0.016,15,58,1.0,COMPLETE
4,4,0.957339,2023-04-20 14:31:58.254516,2023-04-20 14:35:01.339115,0 days 00:03:03.084599,0.360358,0.8,0.326977,0.016,30,40,1.0,COMPLETE
5,5,0.947971,2023-04-20 14:35:01.341270,2023-04-20 14:36:36.622811,0 days 00:01:35.281541,0.622599,0.6,0.011207,0.016,20,186,1.0,COMPLETE
6,6,0.934712,2023-04-20 14:36:36.625353,2023-04-20 14:38:29.434250,0 days 00:01:52.808897,0.006043,0.9,0.005135,0.008,25,157,0.5,COMPLETE
7,7,0.925214,2023-04-20 14:38:29.436953,2023-04-20 14:39:54.356325,0 days 00:01:24.919372,1.020427,0.9,0.014205,0.008,20,264,0.4,COMPLETE
8,8,0.929914,2023-04-20 14:39:54.359036,2023-04-20 14:41:18.540580,0 days 00:01:24.181544,0.016673,0.8,0.00332,0.012,37,232,0.4,COMPLETE
9,9,0.935587,2023-04-20 14:41:18.542402,2023-04-20 14:43:27.452631,0 days 00:02:08.910229,0.005851,1.0,0.001739,0.008,37,152,0.5,COMPLETE


In [7]:
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best MSE:', study.best_value)

Number of finished trials: 30
Best trial: {'lambda': 0.526992210668216, 'alpha': 6.455982645041117, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 1}
Best MSE: 0.9605484958506224


In [10]:
final_model = xgb.XGBClassifier(**study.best_trial.params)
final_model.fit(X_train, y_train)
y_val_pred = final_model.predict(X_val)
print("Accuracy on validation set is {}%".format(accuracy_score(y_val, y_val_pred)*100))

Accuracy on validation set is 94.82984293193716%
