In [1]:
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.preprocessing import StandardScaler
from xgboost import plot_importance
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import time
import os

In [2]:
# Notebook-wide pandas display settings:
#   - display.max_columns = None  -> never truncate the column list
#   - display.float_format        -> render floats with six decimal places
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', '{:.6f}'.format),
):
    pd.set_option(option_name, option_value)

In [3]:
# Columns that are cast to the pandas `category` dtype after loading.
# Order matters: it defines the column order of the training frame.
categorical_features = (
    # OP_CARRIER_* / tail number / flight number columns
    ["OP_CARRIER_AIRLINE_ID", "TAIL_NUM", "OP_CARRIER_FL_NUM"]
    # ORIGIN_* columns
    + ["ORIGIN_AIRPORT_ID", "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC"]
    # DEST_* columns
    + ["DEST_AIRPORT_ID", "DEST_CITY_MARKET_ID", "DEST_STATE_FIPS", "DEST_WAC"]
)

# Columns that keep their numeric dtype.
numerical_features = (
    # Calendar columns
    ["QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK"]
    # CRS_* schedule columns and time-block columns
    + ["CRS_DEP_TIME", "DEP_TIME_BLK", "CRS_ARR_TIME", "ARR_TIME_BLK", "CRS_ELAPSED_TIME"]
    # Distance columns
    + ["DISTANCE", "DISTANCE_GROUP"]
)

# Name of the column the model is trained to predict.
target = "DEP_DELAY_GROUP"

In [4]:
# Working directory of the running kernel (os.getcwd(), not a script path)
current_script_dir = os.getcwd()

# One level up: the head component of the working directory path
parent_dir = os.path.split(current_script_dir)[0]

# 'data' folder located inside that parent directory,
# i.e. a sibling of the working directory
data_dir = os.path.join(parent_dir, 'data')

In [5]:
def set_categorical_dtype_to_str(
    dataframe: pd.DataFrame, 
    categorical_columns: list[str]
) -> pd.DataFrame:
    """Return a copy of ``dataframe`` with the given columns cast to ``category``.

    Despite the function's name, the columns are converted to the pandas
    ``category`` dtype, not to strings. The input frame is left unmodified.

    Args:
        dataframe: Frame containing every column named in ``categorical_columns``.
        categorical_columns: Names of the columns to convert.

    Returns:
        A new DataFrame with the same columns in the same order, where the
        listed columns have dtype ``category``.

    Raises:
        KeyError: If a listed column is not present in ``dataframe``.
    """
    dtype_by_column = {name: 'category' for name in categorical_columns}
    # DataFrame.astype returns a new frame, so the caller's frame is untouched.
    return dataframe.astype(dtype_by_column)

In [6]:
# Load the 2022 training dataset.
# The CSV is expected inside `data_dir`; build its full path with os.path.join.
training_dataset_filename = "encoded_training_dataset_2022.csv"
file_path = os.path.join(data_dir, training_dataset_filename)

# Parse the whole CSV into a DataFrame
dataset_df = pd.read_csv(file_path)

# Keep only the model columns (categorical, then numerical, then the target),
# copy them so `dataset_df` stays untouched, and cast the categorical
# columns to the `category` dtype.
training_df = set_categorical_dtype_to_str(
    dataframe=dataset_df[categorical_features + numerical_features + [target]].copy(),
    categorical_columns=categorical_features,
)

# Summary statistics of the resulting frame
display(training_df.describe())

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY_GROUP
count,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0
mean,2.525611,6.575482,15.731372,3.976177,1328.219585,8.02403,1486.236314,9.655784,143.059972,817.382685,3.74023,0.182563
std,1.103255,3.387474,8.757283,2.003274,491.535391,4.865378,519.240116,4.977613,72.606267,597.402761,2.341283,2.299142
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,31.0,1.0,-2.0
25%,2.0,4.0,8.0,2.0,910.0,4.0,1101.0,6.0,90.0,386.0,2.0,-1.0
50%,3.0,7.0,16.0,4.0,1318.0,8.0,1512.0,10.0,125.0,666.0,3.0,-1.0
75%,3.0,9.0,23.0,6.0,1735.0,12.0,1921.0,14.0,174.0,1052.0,5.0,0.0
max,4.0,12.0,31.0,7.0,2359.0,18.0,2400.0,18.0,690.0,5095.0,11.0,12.0


In [7]:
# Function to score model using Huber Loss
def huber_loss(y_true, y_pred, delta=None):
    """Mean Huber loss between targets and predictions.

    Residuals with absolute value up to ``delta`` are penalised quadratically
    (``0.5 * r**2``); larger residuals are penalised linearly
    (``delta * (|r| - 0.5 * delta)``).

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, same number of elements as
            ``y_true``. Both inputs are flattened before comparison so that
            ``(n,)`` and ``(n, 1)`` inputs are compared element by element
            instead of being broadcast against each other.
        delta: Positive threshold between the quadratic and linear regimes.
            When ``None`` (the default, and what a two-argument scorer call
            uses) it is the 95th percentile of ``y_true``.

    Returns:
        The mean loss over all samples.

    Raises:
        ValueError: If the threshold is not strictly positive. A threshold
            <= 0 (possible when the 95th percentile of the targets is not
            positive) would make the formula return a negative value.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()

    if delta is None:
        delta = np.percentile(y_true, 95)
    # `not delta > 0` also rejects NaN thresholds.
    if not delta > 0:
        raise ValueError(
            f"Huber threshold must be strictly positive, got {delta!r}; "
            "pass an explicit positive `delta`."
        )

    # Compute the absolute residual once and reuse it in both branches.
    abs_error = np.abs(y_true - y_pred)
    per_sample_loss = np.where(
        abs_error <= delta,
        0.5 * np.square(abs_error),
        delta * (abs_error - 0.5 * delta),
    )
    return np.mean(per_sample_loss)
    
# Scorer for GridSearchCV built from huber_loss. Because lower loss is better,
# greater_is_better=False makes scikit-learn negate the value so that
# "higher score" still means "better model".
huber_scorer = make_scorer(
    score_func=huber_loss,
    greater_is_better=False,
)

# Cross-validation splitter: two shuffled folds with a fixed seed
cv_strategy = KFold(
    n_splits=2,
    shuffle=True,
    random_state=123,
)

In [8]:
# Start timer
start_time = time.time()

# Hyper-parameter grid: 3 * 3 * 3 * 2 * 2 = 108 candidate combinations
param_grid = {
    'min_child_weight': [8, 9, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

model = xgb.XGBRegressor(
    enable_categorical=True,
    # Request the histogram tree method explicitly: XGBoost's categorical
    # support is implemented for it, and older releases do not select it
    # by default.
    tree_method='hist',
    objective='reg:pseudohubererror',
    random_state=123
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=huber_scorer,
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1
)

# Fit the grid search; candidates are ranked by the custom Huber-loss scorer
# (this is the cross-validation `scoring`, not an XGBoost eval_metric).
# The features are passed as a DataFrame rather than `.values`: converting the
# mixed category/int frame to a NumPy array yields an object array and drops
# the `category` dtype, so `enable_categorical=True` would have no effect.
grid_search.fit(
    training_df[categorical_features+numerical_features],
    training_df[target].values
)

best_params = grid_search.best_params_
print("Best parameters:", best_params)
# The scorer is negated (greater_is_better=False), so flip the sign back.
print("Best Huber Loss Score:", -grid_search.best_score_)

# End timer
end_time = time.time()
# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")

Fitting 2 folds for each of 108 candidates, totalling 216 fits


KeyboardInterrupt: 