Reference: [Tutorial: Bayesian optimization with XGBoost (Kaggle) — "Setting up optimization"](https://www.kaggle.com/code/lucamassaron/tutorial-bayesian-optimization-with-xgboost#Setting-up-optimization)

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Read the preprocessed training CSV (path is relative to the notebook's
# working directory) and show the resulting frame as the cell output.
df = pd.read_csv(
    '../data/Preprocessed_data_with_date/airplane_train_processed_date.csv'
)
df

Unnamed: 0,Departure Delay in Minutes,Arrival Delay in Minutes,Gender_Female,Gender_Male,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,...,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Age,Flight Distance,satisfaction,Date
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,4.0,4.0,4.0,4.0,4.0,2.0,43,508,neutral or dissatisfied,2017-01-01
1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,2.0,1.0,3.0,3.0,2.0,4.0,34,199,neutral or dissatisfied,2017-01-01
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,5.0,5.0,5.0,4.0,5.0,5.0,54,2917,satisfied,2017-01-01
3,36.0,27.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,5.0,5.0,5.0,5.0,5.0,5.0,57,270,satisfied,2017-01-01
4,0.0,5.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,1.0,2.0,4.0,4.0,3.0,5.0,58,308,neutral or dissatisfied,2017-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102820,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,5.0,3.0,5.0,4.0,5.0,5.0,35,158,neutral or dissatisfied,2022-12-31
102821,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,2.0,5.0,3.0,4.0,4.0,4.0,38,1023,neutral or dissatisfied,2022-12-31
102822,17.0,19.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,2.0,2.0,2.0,5.0,2.0,4.0,54,187,neutral or dissatisfied,2022-12-31
102823,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,5.0,2.0,5.0,5.0,5.0,4.0,34,337,neutral or dissatisfied,2022-12-31


In [21]:
# Names of the columns used as model inputs. The order of this list is the
# column order of the feature matrix built from it.
features = [
    # Categorical indicator columns
    'Gender_Female',
    'Customer Type_Loyal Customer',
    'Type of Travel_Business travel',
    'Type of Travel_Personal Travel',
    'Class_Business',
    'Class_Eco',
    # Age, distance and delay columns
    'Age',
    'Flight Distance',
    'Departure Delay in Minutes',
    'Arrival Delay in Minutes',
    # Per-service columns
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness',
]

In [22]:
# Feature matrix: only the columns named in `features`.
X = df[features]

# Target: the 'satisfaction' column converted to integer class codes.
# The fitted encoder is kept in `le` so the codes can be mapped back to the
# original labels with le.inverse_transform(...).
le = LabelEncoder()
y = le.fit_transform(df['satisfaction'])

In [23]:
# Hold out 30% of the rows for testing. The fixed random_state makes the
# partition (and therefore every score computed on it) reproducible across
# kernel restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Show the shapes of the four pieces as the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((71977, 24), (30848, 24), (71977,), (30848,))

In [24]:
# Hyperopt search space for a RandomForestClassifier, set close to the values
# estimated by the TPOT classifier.
# Every dimension uses hp.choice, so the dict returned by fmin holds *indices*
# into these lists; resolve them with hyperopt.space_eval before use.
search_space = {
    'criterion': hp.choice('criterion', ['entropy', 'gini']),
    'bootstrap': hp.choice('bootstrap', [True, False]),
    'max_features': hp.choice('max_features', [0.15, 0.20, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]),
    # Integer leaf sizes (sample counts). The previous hp.uniform(0, 0.5)
    # sampled a *fraction* of the training set, which forces very large leaves
    # and has a lower bound (0) that scikit-learn does not accept; TPOT reports
    # this parameter as a small integer (e.g. min_samples_leaf=4).
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 3, 4, 5, 6, 7, 8]),
    'min_samples_split': hp.choice('min_samples_split', [5, 6, 7, 8, 9]),
    'n_estimators': hp.choice('n_estimators', [10, 50, 300, 750]),
}
search_space

{'criterion': <hyperopt.pyll.base.Apply at 0x12b312a60>,
 'bootstrap': <hyperopt.pyll.base.Apply at 0x12b312220>,
 'max_features': <hyperopt.pyll.base.Apply at 0x12b312dc0>,
 'min_samples_leaf': <hyperopt.pyll.base.Apply at 0x10ca53e50>,
 'min_samples_split': <hyperopt.pyll.base.Apply at 0x10ca53d00>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x13589e610>}

In [28]:
# Hard-coded XGBoost hyperparameters.
# NOTE(review): presumably the output of an earlier tuning run — confirm provenance.
params = {
    'colsample_bytree': 0.6911920435612005,
    'gamma': 8.593324118055857,
    'max_depth': 37,
    'min_child_weight': 9.0,
    'reg_alpha': 72.0,
    'reg_lambda': 0.7966579413290078,
}

# Build (but do not fit) the classifier from `params`.
# reg_lambda was present in `params` but never forwarded, so the model silently
# fell back to the library default; it is now passed like the other values.
clf = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=int(params['max_depth']),  # explicit int cast, as in the original call
    gamma=params['gamma'],
    reg_alpha=params['reg_alpha'],
    reg_lambda=params['reg_lambda'],
    min_child_weight=params['min_child_weight'],
    colsample_bytree=params['colsample_bytree'],
)

## TPOT
Reference: [TPOT — a great tool to automate your ML workflow (Kaggle)](https://www.kaggle.com/code/thebrownviking20/tpot-a-great-tool-to-automate-your-ml-workflow)

In [29]:
from tpot import TPOTClassifier

In [30]:
# Run TPOT's pipeline search on the training split.
# random_state pins the otherwise stochastic search so the selected pipeline
# and the reported scores can be reproduced on a re-run.
tpot = TPOTClassifier(
    generations=8,
    population_size=50,
    verbosity=2,       # print per-generation progress
    random_state=42,
)
tpot.fit(X_train, y_train)

# Score the best pipeline found on the held-out test split, as a percentage.
print("Accuracy is {}%".format(tpot.score(X_test, y_test)*100))

Optimization Progress:   0%|          | 0/450 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.9574169825067187

Generation 2 - Current best internal CV score: 0.9574169825067187

Generation 3 - Current best internal CV score: 0.9581394314599179

Generation 4 - Current best internal CV score: 0.9581394314599179

Generation 5 - Current best internal CV score: 0.9581394314599179

Generation 6 - Current best internal CV score: 0.9581394314599179

Generation 7 - Current best internal CV score: 0.9589730407340775

Generation 8 - Current best internal CV score: 0.9589730407340775

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.5, min_samples_leaf=4, min_samples_split=6, n_estimators=100)
Accuracy is 96.01270746887967%


In [31]:
# Write the fitted TPOT object's exported pipeline code to 'tpot_pipeline.py'
# (path is relative to the notebook's working directory).
tpot.export('tpot_pipeline.py')