In [None]:
%%time
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier  
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Read the airline-delay CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running elsewhere.
csv_path = r"D:\users\m_ozdemir15\Desktop\Okul\Y.Lisans\2.Dönem\Makine Öğrenmesi\proje\AirlinesDelayed.csv"
data = pd.read_csv(csv_path)
# Preview of the first rows; the return value is not assigned or printed here.
data.head()

# Data preprocessing

# Discard, in place, every row that contains at least one missing value.
data.dropna(inplace=True)

# Replace each categorical column with integer codes. One encoder is created
# per column and kept, fitted, in `label_encoders` under the column's name.
categorical_columns = ['Airline', 'AirportFrom', 'AirportTo']
label_encoders = {name: LabelEncoder() for name in categorical_columns}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Separate the target column ('Delay') from the remaining feature columns.
y = data['Delay']
X = data.drop(columns='Delay')

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Candidate values for the randomized hyperparameter search.
# np.linspace(a, b) yields 50 evenly spaced values between a and b (inclusive);
# the ranges enumerate consecutive integers.
param_dist = {
    'learning_rate': np.linspace(0.1, 0.5),
    'max_depth': range(6, 16),
    'n_estimators': range(50, 150),
    'subsample': np.linspace(0.7, 1.0),
    'colsample_bytree': np.linspace(0.7, 1.0),
}

# Base estimator; the search clones it for every candidate configuration.
xgb = XGBClassifier()

# Randomized search: 100 sampled configurations, each scored with 3-fold
# cross-validation, using all available cores (n_jobs=-1).
# random_state pins the candidate sampling so that the explored configurations
# and the reported best parameters are reproducible between runs; it uses the
# same seed as the train/test split.
rand_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training split only.
rand_search.fit(X_train, y_train)

# Report the best hyperparameter combination found by the search.
best_params = rand_search.best_params_
print(f'Best parameters: {best_params}')

# RandomizedSearchCV refits by default (refit=True): after the search it has
# already trained an estimator with `best_params` on the complete training
# split. Reuse that fitted model instead of training an identical one again.
model_xgb_optimized = rand_search.best_estimator_

# Evaluate on the held-out test split.
y_pred_xgb_optimized = model_xgb_optimized.predict(X_test)
accuracy_xgb_optimized = accuracy_score(y_test, y_pred_xgb_optimized)
report_xgb_optimized = classification_report(y_test, y_pred_xgb_optimized)

print(f'XGBoost Model with Optimized Hyperparameters Accuracy: {accuracy_xgb_optimized}')
print(report_xgb_optimized)
