In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [2]:
df_train = pd.read_csv('../data/Preprocessed_data_with_date/airplane_train_processed_date.csv')
df_val = pd.read_csv('../data/Preprocessed_data_with_date/airplane_test_processed_date.csv')

In [3]:
features = ['Gender_Female', 'Customer Type_Loyal Customer',
       'Type of Travel_Business travel',
       'Type of Travel_Personal Travel', 'Class_Business',
       'Class_Eco', 'Age',
       'Flight Distance', 'Departure Delay in Minutes',
       'Arrival Delay in Minutes', 'Inflight wifi service',
       'Departure/Arrival time convenient',
       'Ease of Online booking', 'Gate location',
       'Food and drink', 'Online boarding',
       'Seat comfort', 'Inflight entertainment',
       'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service',
       'Inflight service', 'Cleanliness']

le = LabelEncoder()
X_train = df_train[features]
y_train = df_train['satisfaction']
y_train = le.fit_transform(y_train)

X_val = df_val[features]
y_val = df_val['satisfaction']
y_val = le.fit_transform(y_val)

X_train.shape, X_val.shape, y_train.shape, y_val.shape

((102825, 24), (25976, 24), (102825,), (25976,))

In [4]:
def objective(trial,data=X_train,target=y_train):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.3,random_state=15)
    param = {
        #'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.012,0.016,0.02]),
        'n_estimators': 1000, #as original model
        'max_depth': trial.suggest_categorical('max_depth', [5,10,15,20,25,30,37,40]),
        'random_state': 15,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    mse = mean_squared_error(test_y, preds,squared=True)
    
    return mse

In [5]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[32m[I 2023-04-18 22:15:02,020][0m A new study created in memory with name: no-name-9611f305-d09c-46da-b618-024611f3e54c[0m




[32m[I 2023-04-18 22:17:28,580][0m Trial 0 finished with value: 0.045545902489626554 and parameters: {'lambda': 0.011841232198339177, 'alpha': 0.02608989662963063, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 25, 'min_child_weight': 100}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:19:28,238][0m Trial 1 finished with value: 0.05384465767634855 and parameters: {'lambda': 0.007214749321128512, 'alpha': 0.9018630821941501, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.016, 'max_depth': 15, 'min_child_weight': 168}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:22:40,554][0m Trial 2 finished with value: 0.04813926348547718 and parameters: {'lambda': 8.73768225889109, 'alpha': 3.3669218307330238, 'colsample_bytree': 0.9, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 37, 'min_child_weight': 102}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:25:14,323][0m Trial 3 finished with value: 0.046939834024896265 and parameters: {'lambda': 0.1635425441629915, 'alpha': 5.013256634459744, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.016, 'max_depth': 20, 'min_child_weight': 102}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:26:26,148][0m Trial 4 finished with value: 0.06629279045643154 and parameters: {'lambda': 0.3682497514820749, 'alpha': 0.03394106215383994, 'colsample_bytree': 0.7, 'subsample': 0.6, 'learning_rate': 0.008, 'max_depth': 10, 'min_child_weight': 196}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:27:56,343][0m Trial 5 finished with value: 0.06382909751037344 and parameters: {'lambda': 0.5349857474487847, 'alpha': 0.010305271197789363, 'colsample_bytree': 0.8, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 25, 'min_child_weight': 195}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:28:58,486][0m Trial 6 finished with value: 0.07089600622406639 and parameters: {'lambda': 0.011745536546949112, 'alpha': 0.14047155920731055, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 40, 'min_child_weight': 178}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:30:10,436][0m Trial 7 finished with value: 0.06989107883817428 and parameters: {'lambda': 0.3293504301787546, 'alpha': 0.10238189613777077, 'colsample_bytree': 0.6, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 5, 'min_child_weight': 248}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:31:32,797][0m Trial 8 finished with value: 0.06557961618257262 and parameters: {'lambda': 3.400331364636343, 'alpha': 0.0072777696019689945, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.016, 'max_depth': 25, 'min_child_weight': 277}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:32:58,362][0m Trial 9 finished with value: 0.06697354771784232 and parameters: {'lambda': 4.479323810073782, 'alpha': 0.35506676834326445, 'colsample_bytree': 0.8, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 40, 'min_child_weight': 180}. Best is trial 0 with value: 0.045545902489626554.[0m




[32m[I 2023-04-18 22:35:23,973][0m Trial 10 finished with value: 0.04126685684647303 and parameters: {'lambda': 0.0012415947786318126, 'alpha': 0.0017034782532461131, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 5}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:37:42,574][0m Trial 11 finished with value: 0.04152619294605809 and parameters: {'lambda': 0.0010274900980701673, 'alpha': 0.0011964765166066482, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 9}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:41:18,706][0m Trial 12 finished with value: 0.04149377593360996 and parameters: {'lambda': 0.0010588191533163964, 'alpha': 0.0010277148761680898, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 1}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:43:58,673][0m Trial 13 finished with value: 0.04220695020746888 and parameters: {'lambda': 0.0011118057087605138, 'alpha': 0.0010636116951808548, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 12}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:46:53,890][0m Trial 14 finished with value: 0.04603215767634855 and parameters: {'lambda': 0.0034481933953811384, 'alpha': 0.0030204673018620343, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 47}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:50:09,055][0m Trial 15 finished with value: 0.04431405601659751 and parameters: {'lambda': 0.026652571081849128, 'alpha': 0.0035891763313389783, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 51}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:52:30,463][0m Trial 16 finished with value: 0.04723158713692946 and parameters: {'lambda': 0.0026202635492148115, 'alpha': 0.0010098833817686749, 'colsample_bytree': 0.9, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 66}. Best is trial 10 with value: 0.04126685684647303.[0m




[32m[I 2023-04-18 22:56:36,633][0m Trial 17 finished with value: 0.0409426867219917 and parameters: {'lambda': 0.035090977802052566, 'alpha': 0.004031598747017301, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 15, 'min_child_weight': 3}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 22:59:18,019][0m Trial 18 finished with value: 0.04911177385892116 and parameters: {'lambda': 0.04469538406907935, 'alpha': 0.013460059655596674, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.02, 'max_depth': 15, 'min_child_weight': 136}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:02:27,470][0m Trial 19 finished with value: 0.04629149377593361 and parameters: {'lambda': 0.04512239216258642, 'alpha': 0.0036961381392825697, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.012, 'max_depth': 15, 'min_child_weight': 38}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:03:38,454][0m Trial 20 finished with value: 0.050765041493775934 and parameters: {'lambda': 0.004143685016439402, 'alpha': 0.031561211743647476, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 5, 'min_child_weight': 76}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:06:30,802][0m Trial 21 finished with value: 0.04155860995850622 and parameters: {'lambda': 0.00186759099211555, 'alpha': 0.0024142840484578385, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 37, 'min_child_weight': 1}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:08:39,447][0m Trial 22 finished with value: 0.04424922199170125 and parameters: {'lambda': 0.0062457212761915775, 'alpha': 0.005373925448864852, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 20, 'min_child_weight': 29}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:10:26,540][0m Trial 23 finished with value: 0.04350363070539419 and parameters: {'lambda': 0.001388288688818988, 'alpha': 0.0017574537866965946, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 10, 'min_child_weight': 22}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:12:31,114][0m Trial 24 finished with value: 0.04836618257261411 and parameters: {'lambda': 0.002126094867292863, 'alpha': 0.0028038262545018056, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 15, 'min_child_weight': 79}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:14:08,302][0m Trial 25 finished with value: 0.054752334024896265 and parameters: {'lambda': 0.0037718350825321984, 'alpha': 0.0061738163446236555, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 131}. Best is trial 17 with value: 0.0409426867219917.[0m




[32m[I 2023-04-18 23:16:30,236][0m Trial 26 finished with value: 0.040877852697095436 and parameters: {'lambda': 0.013897993486708967, 'alpha': 0.0018108315682244345, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 2}. Best is trial 26 with value: 0.040877852697095436.[0m




[32m[I 2023-04-18 23:18:07,943][0m Trial 27 finished with value: 0.04985736514522822 and parameters: {'lambda': 0.02008463010407016, 'alpha': 0.002083341777684316, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.016, 'max_depth': 15, 'min_child_weight': 55}. Best is trial 26 with value: 0.040877852697095436.[0m




[32m[I 2023-04-18 23:19:51,894][0m Trial 28 finished with value: 0.04635632780082988 and parameters: {'lambda': 0.06291215015520511, 'alpha': 0.013254370121914003, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.012, 'max_depth': 30, 'min_child_weight': 33}. Best is trial 26 with value: 0.040877852697095436.[0m




[32m[I 2023-04-18 23:21:07,326][0m Trial 29 finished with value: 0.05329356846473029 and parameters: {'lambda': 0.010218259985044135, 'alpha': 0.018986291778773066, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 40, 'min_child_weight': 91}. Best is trial 26 with value: 0.040877852697095436.[0m


In [6]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_alpha,params_colsample_bytree,params_lambda,params_learning_rate,params_max_depth,params_min_child_weight,params_subsample,state
0,0,0.045546,2023-04-18 22:15:02.027401,2023-04-18 22:17:28.577840,0 days 00:02:26.550439,0.02609,0.8,0.011841,0.02,25,100,1.0,COMPLETE
1,1,0.053845,2023-04-18 22:17:28.587321,2023-04-18 22:19:28.229887,0 days 00:01:59.642566,0.901863,1.0,0.007215,0.016,15,168,0.7,COMPLETE
2,2,0.048139,2023-04-18 22:19:28.246886,2023-04-18 22:22:40.553759,0 days 00:03:12.306873,3.366922,0.9,8.737682,0.012,37,102,1.0,COMPLETE
3,3,0.04694,2023-04-18 22:22:40.559012,2023-04-18 22:25:14.319617,0 days 00:02:33.760605,5.013257,0.8,0.163543,0.016,20,102,1.0,COMPLETE
4,4,0.066293,2023-04-18 22:25:14.333812,2023-04-18 22:26:26.147660,0 days 00:01:11.813848,0.033941,0.7,0.36825,0.008,10,196,0.6,COMPLETE
5,5,0.063829,2023-04-18 22:26:26.149206,2023-04-18 22:27:56.337313,0 days 00:01:30.188107,0.010305,0.8,0.534986,0.008,25,195,0.7,COMPLETE
6,6,0.070896,2023-04-18 22:27:56.344665,2023-04-18 22:28:58.485312,0 days 00:01:02.140647,0.140472,0.6,0.011746,0.008,40,178,0.4,COMPLETE
7,7,0.069891,2023-04-18 22:28:58.494675,2023-04-18 22:30:10.434716,0 days 00:01:11.940041,0.102382,0.6,0.32935,0.008,5,248,0.7,COMPLETE
8,8,0.06558,2023-04-18 22:30:10.444566,2023-04-18 22:31:32.795384,0 days 00:01:22.350818,0.007278,0.5,3.400331,0.016,25,277,0.6,COMPLETE
9,9,0.066974,2023-04-18 22:31:32.806865,2023-04-18 22:32:58.361006,0 days 00:01:25.554141,0.355067,0.8,4.479324,0.012,40,180,0.4,COMPLETE


In [13]:
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best MSE:', study.best_value)

Number of finished trials: 30
Best trial: {'lambda': 0.013897993486708967, 'alpha': 0.0018108315682244345, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.02, 'max_depth': 30, 'min_child_weight': 2}
Best MSE: 0.040877852697095436


{'lambda': 0.0031790320630051537, 'alpha': 0.11407047947985488, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 20, 'random_state': 2020, 'min_child_weight': 19}

In [10]:
final_model = xgb.XGBClassifier(**study.best_trial.params)
final_model.fit(X_train, y_train)
y_val_pred = final_model.predict(X_val)
mean_squared_error(y_val, y_val_pred,squared=True)



0.0446181090237142