In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the preprocessed CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='../data/Preprocessed_data_with_date/airplane_train_processed_date.csv'
)
# Bare name as the last expression -> rendered as the cell's output.
df

Unnamed: 0,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Age,Flight Distance,satisfaction,Date
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,4.0,4.0,4.0,4.0,4.0,2.0,43,508,neutral or dissatisfied,2017-01-01
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,2.0,1.0,3.0,3.0,2.0,4.0,34,199,neutral or dissatisfied,2017-01-01
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,5.0,5.0,5.0,4.0,5.0,5.0,54,2917,satisfied,2017-01-01
3,36.0,27.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,5.0,5.0,5.0,5.0,5.0,5.0,57,270,satisfied,2017-01-01
4,0.0,5.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,2.0,4.0,4.0,3.0,5.0,58,308,neutral or dissatisfied,2017-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102820,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,5.0,3.0,5.0,4.0,5.0,5.0,35,158,neutral or dissatisfied,2022-12-31
102821,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,2.0,5.0,3.0,4.0,4.0,4.0,38,1023,neutral or dissatisfied,2022-12-31
102822,17.0,19.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,2.0,2.0,2.0,5.0,2.0,4.0,54,187,neutral or dissatisfied,2022-12-31
102823,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,5.0,2.0,5.0,5.0,5.0,4.0,34,337,neutral or dissatisfied,2022-12-31


In [11]:
# Columns fed to the model, in the order they appear in the design matrix.
features = [
    'Gender_Female',
    'Customer Type_Loyal Customer',
    'Type of Travel_Business travel',
    'Type of Travel_Personal Travel',
    'Class_Business',
    'Class_Eco',
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

# Design matrix: only the selected columns of the loaded frame.
X = df[features]

# Target: the string `satisfaction` labels turned into integer class ids.
# The fitted encoder stays available as `le` so ids can be mapped back later.
le = LabelEncoder()
y = le.fit_transform(df['satisfaction'])

In [12]:
# Hold out 30% of the rows for evaluation. The fixed random_state makes the
# partition identical on every run, so downstream metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Sanity check: display the shapes of the four resulting pieces.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((71977, 24), (30848, 24), (71977,), (30848,))

In [20]:
def objective(trial,data=X,target=y):
    """Optuna objective: train one XGBoost classifier and return its hold-out error.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters for this run.
    data, target :
        Feature matrix and labels. The defaults are the module-level ``X`` and
        ``y`` as they existed when this ``def`` was executed (default arguments
        are bound at definition time), i.e. the full dataset rather than the
        ``X_train``/``y_train`` split.

    Returns
    -------
    float
        Mean squared error between the predicted and true labels on a fixed
        15% hold-out. For the integer 0/1 labels produced by the label encoder
        in this notebook, this equals the misclassification rate.

    Notes
    -----
    The same hold-out set drives early stopping and produces the returned
    score, so the value is an optimistic estimate of generalisation error.
    """
    # Fixed seed: every trial is scored on the same 85/15 partition.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.15, random_state=42
    )

    param = {
        # 'tree_method': 'gpu_hist',  # optional: train on the GPU to speed up trials
        # suggest_float(log=True) replaces the deprecated suggest_loguniform;
        # the sampled distribution is the same.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 1000,  # upper bound on boosting rounds; early stopping may end sooner
        'max_depth': trial.suggest_categorical('max_depth', [5,10,15,20,25,30,37,40]),
        # Single-choice categorical: keeps the seed constant while still
        # recording it among the trial's parameters.
        'random_state': trial.suggest_categorical('random_state', [2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        # Passed to the estimator constructor: the fit() keyword of the same
        # name is deprecated since xgboost 1.6 and removed in 2.0.
        'early_stopping_rounds': 100,
    }
    model = xgb.XGBClassifier(**param)

    # The hold-out set is the evaluation set monitored for early stopping.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=False)

    preds = model.predict(test_x)

    # `squared=True` was the default and the argument is removed in newer
    # scikit-learn releases; omitting it yields the same MSE.
    mse = mean_squared_error(test_y, preds)

    return mse

In [None]:
# Minimise the objective's hold-out error over 30 trials. The TPE sampler is
# seeded so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2023-04-18 17:30:36,361][0m A new study created in memory with name: no-name-38ed015c-1f68-4c32-8086-6fb11a09b4f7[0m




[32m[I 2023-04-18 17:32:08,746][0m Trial 0 finished with value: 0.04907935684647303 and parameters: {'lambda': 0.5349338045222282, 'alpha': 3.586953490955123, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'max_depth': 25, 'random_state': 2020, 'min_child_weight': 180}. Best is trial 0 with value: 0.04907935684647303.[0m




[32m[I 2023-04-18 17:34:18,117][0m Trial 1 finished with value: 0.05342323651452282 and parameters: {'lambda': 3.900108494772715, 'alpha': 0.003294586397472065, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.012, 'max_depth': 37, 'random_state': 2020, 'min_child_weight': 251}. Best is trial 0 with value: 0.04907935684647303.[0m




[32m[I 2023-04-18 17:35:16,978][0m Trial 2 finished with value: 0.043568464730290454 and parameters: {'lambda': 0.22231191458177627, 'alpha': 0.28286174611624443, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 10}. Best is trial 2 with value: 0.043568464730290454.[0m




[32m[I 2023-04-18 17:37:14,866][0m Trial 3 finished with value: 0.056600103734439834 and parameters: {'lambda': 0.01608589563575452, 'alpha': 9.062138884913637, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.01, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 188}. Best is trial 2 with value: 0.043568464730290454.[0m




[32m[I 2023-04-18 17:38:33,511][0m Trial 4 finished with value: 0.05783195020746888 and parameters: {'lambda': 0.0021703485603195493, 'alpha': 0.0022982371452830593, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 244}. Best is trial 2 with value: 0.043568464730290454.[0m




[32m[I 2023-04-18 17:40:06,812][0m Trial 5 finished with value: 0.05044087136929461 and parameters: {'lambda': 0.0517563725002119, 'alpha': 7.021477754532125, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.018, 'max_depth': 30, 'random_state': 2020, 'min_child_weight': 146}. Best is trial 2 with value: 0.043568464730290454.[0m




[32m[I 2023-04-18 17:41:30,553][0m Trial 6 finished with value: 0.045318983402489625 and parameters: {'lambda': 0.053887367730373247, 'alpha': 1.3513848872364813, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.012, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 63}. Best is trial 2 with value: 0.043568464730290454.[0m




[32m[I 2023-04-18 17:43:21,838][0m Trial 7 finished with value: 0.04739367219917012 and parameters: {'lambda': 0.02434933747249464, 'alpha': 0.3004016981188946, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 15, 'random_state': 2020, 'min_child_weight': 183}. Best is trial 2 with value: 0.043568464730290454.[0m




[32m[I 2023-04-18 17:46:21,306][0m Trial 8 finished with value: 0.04240145228215768 and parameters: {'lambda': 0.003314447277042136, 'alpha': 0.009551268963513904, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 30, 'random_state': 2020, 'min_child_weight': 99}. Best is trial 8 with value: 0.04240145228215768.[0m




In [None]:
# Tabulate the study's trials (Optuna exports them as a pandas DataFrame) so
# they can be inspected; as the cell's last expression it is rendered as output.
study.trials_dataframe()