In [1]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [2]:
# Set option to display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.6f}'.format)

In [3]:
categorical_features = [
    "OP_CARRIER_AIRLINE_ID",
    "TAIL_NUM",
    "OP_CARRIER_FL_NUM",

    "ORIGIN_AIRPORT_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN_STATE_FIPS",
    "ORIGIN_WAC",
    "DEST_AIRPORT_ID",
    "DEST_CITY_MARKET_ID",
    "DEST_STATE_FIPS",
    "DEST_WAC",
]

numerical_features = [
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",

    "CRS_DEP_TIME",
    "DEP_TIME_BLK",
    "CRS_ARR_TIME", 
    "ARR_TIME_BLK",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    "DISTANCE_GROUP",
]

target = "DEP_DELAY"

In [4]:
# Get the current script's directory
current_script_dir = os.getcwd()

# Move up to the parent directory
parent_dir = os.path.dirname(current_script_dir)

# Define the sibling data directory name
data_dir = os.path.join(parent_dir, 'data')

In [5]:
def set_categorical_dtype_to_str(
    dataframe: pd.DataFrame, 
    categorical_columns: list[str]
) -> pd.DataFrame:
    df = dataframe.copy()
    for column in categorical_columns:
        df[column] = df[column].astype('category')
    return df 

In [6]:
# Load the 2022 training datset
# Use os.path.join to create the full file path
training_dataset_filename = "encoded_training_dataset_2022.csv"
file_path = os.path.join(data_dir, training_dataset_filename)

# Read the CSV file into a DataFrame
dataset_df = pd.read_csv(file_path)
training_df = dataset_df[categorical_features+numerical_features+[target]].copy()

# Set dtype of categorical features to category
training_df = set_categorical_dtype_to_str(
    dataframe=training_df, 
    categorical_columns=categorical_features
)

# Display the DataFrame
display(training_df.describe())

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY
count,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0
mean,2.525611,6.575482,15.731372,3.976177,1328.219585,8.02403,1486.236314,9.655784,143.059972,817.382685,3.74023,12.603955
std,1.103255,3.387474,8.757283,2.003274,491.535391,4.865378,519.240116,4.977613,72.606267,597.402761,2.341283,52.502743
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,31.0,1.0,-96.0
25%,2.0,4.0,8.0,2.0,910.0,4.0,1101.0,6.0,90.0,386.0,2.0,-5.0
50%,3.0,7.0,16.0,4.0,1318.0,8.0,1512.0,10.0,125.0,666.0,3.0,-2.0
75%,3.0,9.0,23.0,6.0,1735.0,12.0,1921.0,14.0,174.0,1052.0,5.0,10.0
max,4.0,12.0,31.0,7.0,2359.0,18.0,2400.0,18.0,690.0,5095.0,11.0,3433.0


In [7]:
# Load the 2023 testing datset
# Use os.path.join to create the full file path
testing_dataset_filename = "encoded_evaluation_dataset_2023.csv"
file_path = os.path.join(data_dir, testing_dataset_filename)

# Read the CSV file into a DataFrame
dataset_df = pd.read_csv(file_path)
testing_df = dataset_df[categorical_features+numerical_features+[target]].copy()

# Set dtype of categorical features to category
testing_df = set_categorical_dtype_to_str(
    dataframe=testing_df, 
    categorical_columns=categorical_features
)

# Display the DataFrame
display(testing_df.describe())

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY
count,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0,6763366.0
mean,2.533025,6.599655,15.75067,3.983114,1331.043765,8.05042,1486.516154,9.665837,145.934768,833.977916,3.807604,12.285409
std,1.109902,3.412189,8.766607,2.00174,496.394849,4.914447,525.996117,5.026481,73.09289,599.873971,2.351688,55.328527
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,21.0,1.0,-99.0
25%,2.0,4.0,8.0,2.0,909.0,4.0,1059.0,5.0,92.0,399.0,2.0,-5.0
50%,3.0,7.0,16.0,4.0,1321.0,8.0,1515.0,10.0,129.0,679.0,3.0,-2.0
75%,4.0,10.0,23.0,6.0,1740.0,12.0,1925.0,14.0,176.0,1068.0,5.0,9.0
max,4.0,12.0,31.0,7.0,2359.0,18.0,2400.0,18.0,1162.0,5095.0,11.0,4413.0


## Scaling

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Initialize the StandardScaler
scaler = MinMaxScaler()

# Select only numerical columns
#numerical_cols = training_df.select_dtypes(include=['float64', 'int64']).columns

# Apply StandardScaler to the numerical columns
scaled_training_df = training_df.copy()
scaled_training_df[numerical_features] = scaler.fit_transform(training_df[numerical_features])

display(scaled_training_df.describe())

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY
count,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0,6551773.0
mean,0.508537,0.506862,0.491046,0.496029,0.562858,0.445779,0.619106,0.536432,0.206183,0.155289,0.274023,12.603955
std,0.367752,0.307952,0.291909,0.333879,0.208454,0.270299,0.21644,0.276534,0.105379,0.117971,0.234128,52.502743
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-96.0
25%,0.333333,0.272727,0.233333,0.166667,0.385496,0.222222,0.458524,0.333333,0.129173,0.070103,0.1,-5.0
50%,0.666667,0.545455,0.5,0.5,0.558524,0.444444,0.629846,0.555556,0.179971,0.125395,0.2,-2.0
75%,0.666667,0.727273,0.733333,0.833333,0.735369,0.666667,0.800333,0.777778,0.251089,0.201619,0.4,10.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3433.0


## K-Fold Validation and Evaluation

In [9]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [10]:
def evaluate_model(data_df, features_columns, target_column):
    X = data_df[features_columns].values
    y = data_df[target_column].values
    
    rmse_scores, mae_scores = [], []

    kf = KFold(n_splits=2, shuffle=True, random_state=123)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model = xgb.XGBRegressor(
            enable_categorical=True,
            objective='reg:squarederror', 
            eval_metric='rmse',
            random_state=123
        )
        model.fit(X_train, y_train)  

        y_pred = model.predict(X_test)
        
        rmse_scores.append(rmse(y_test, y_pred))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        
    return np.mean(rmse_scores), np.mean(mae_scores)

In [14]:
import time

# Start time
start_time = time.time()

rmse_score, mae_score = evaluate_model(
    data_df=scaled_training_df, 
    features_columns=categorical_features+numerical_features, 
    target_column=target
)

print(f"Evaluation: Root Mean Squared Error: {rmse_score}")
print(f"Evaluation: Mean Absolute Error: {mae_score}")

# End time
end_time = time.time()

# Calculate elapsed time
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")

Evaluation: Root Mean Squared Error: 51.03979889388084
Evaluation: Mean Absolute Error: 21.367758823934853
Execution time: 23.15353226661682 seconds
