In [2]:
%%time
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier  
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials

# Load and read the dataset
data = pd.read_csv(r"D:\users\m_ozdemir15\Desktop\Okul\Y.Lisans\2.Dönem\Makine Öğrenmesi\proje\AirlinesDelayed.csv")
data.head()

# Data preprocessing

# Handle missing values, if necessary remove them
data.dropna(inplace=True)

# Encode categorical variables
label_encoders = {}
categorical_columns = ['Airline', 'AirportFrom', 'AirportTo']
for column in categorical_columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Split the data into X (features) and y (target)
X = data.drop('Delay', axis=1)
y = data['Delay']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the objective function for Hyperopt
def objective(params):
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return {'loss': -accuracy, 'status': STATUS_OK}

# Define the hyperparameter space for Hyperopt
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'n_estimators': hp.choice('n_estimators', range(50, 150)),
    'max_depth': hp.choice('max_depth', range(5, 15)),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1)
}

# Run Bayesian optimization using Hyperopt
trials = Trials()
best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=100,
                        trials=trials)

print(f'Best hyperparameters: {best_hyperparams}')

# Train and evaluate the model with the best hyperparameters found by Hyperopt
model_xgb_optimized = XGBClassifier(**best_hyperparams)
model_xgb_optimized.fit(X_train, y_train)
y_pred_xgb_optimized = model_xgb_optimized.predict(X_test)
accuracy_xgb_optimized = accuracy_score(y_test, y_pred_xgb_optimized)
report_xgb_optimized = classification_report(y_test, y_pred_xgb_optimized)

print(f'XGBoost Model with Optimized Hyperparameters Accuracy: {accuracy_xgb_optimized}')
print(report_xgb_optimized)


100%|██████████████████████████████████████████████| 100/100 [15:42<00:00,  9.43s/trial, best loss: -0.711949720515031]
Best hyperparameters: {'colsample_bytree': 0.6250475737349662, 'learning_rate': 0.07472819415702454, 'max_depth': 9, 'n_estimators': 76, 'subsample': 0.9373479281203446}
XGBoost Model with Optimized Hyperparameters Accuracy: 0.6962373814622209
              precision    recall  f1-score   support

           0       0.69      0.81      0.75     59879
           1       0.70      0.55      0.62     47998

    accuracy                           0.70    107877
   macro avg       0.70      0.68      0.68    107877
weighted avg       0.70      0.70      0.69    107877

CPU times: total: 1h 32min 41s
Wall time: 15min 51s
