## Importing the need packages

In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [2]:
# Set option to display all columns
pd.set_option('display.max_columns', None)

## Defining features and target(s)

In [3]:
features = [
    "QUARTER",
    "MONTH",
    "DAY_OF_MONTH",
    "DAY_OF_WEEK",

    "OP_CARRIER_AIRLINE_ID",
    "TAIL_NUM",
    "OP_CARRIER_FL_NUM",

    "ORIGIN_AIRPORT_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN_STATE_FIPS",
    "ORIGIN_WAC",
    "DEST_AIRPORT_ID",
    "DEST_CITY_MARKET_ID",
    "DEST_STATE_FIPS",
    "DEST_WAC",
    
    "CRS_DEP_TIME",
    "DEP_TIME_BLK",
    "CRS_ARR_TIME",
    "ARR_TIME_BLK",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    "DISTANCE_GROUP",
]

target = "DEP_DELAY_GROUP"

## Importing the training datasets

In [4]:
# Get the current script's directory
current_script_dir = os.getcwd()

# Move up to the parent directory
parent_dir = os.path.dirname(current_script_dir)

# Define the sibling data directory name
data_dir = os.path.join(parent_dir, 'data')

# Load the 2022 training datset
# Use os.path.join to create the full file path
training_dataset_filename = "encoded_training_dataset_2022.csv"
file_path = os.path.join(data_dir, training_dataset_filename)

# Read the CSV file into a DataFrame
dataset_df = pd.read_csv(file_path)
training_df = dataset_df[features+ [target]].copy()

# Display the DataFrame
display(training_df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY_GROUP
0,1,1,1,6,20363,360,4732,10135,30135,42,23,11433,31295,26,43,1015,5,1209,7,114,425,2,-1
1,1,1,1,6,20363,360,5430,11433,31295,26,43,10135,30135,42,23,1422,9,1548,10,86,425,2,-1
2,1,1,1,6,20363,414,4671,10397,30397,13,34,14783,34783,29,64,2057,15,2149,16,112,563,3,-1
3,1,1,1,6,20363,414,5009,13487,31650,27,63,11423,31423,19,61,1041,5,1153,6,72,232,1,-1
4,1,1,1,6,20363,414,5083,11423,31423,19,61,10397,30397,13,34,1300,8,1610,11,130,743,3,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6551768,4,12,31,6,20452,5292,5741,12953,31703,36,22,11066,31066,39,44,830,3,1038,5,128,479,2,-1
6551769,4,12,31,6,20452,6245,3607,14321,34321,23,12,13930,30977,17,41,635,1,758,2,143,900,4,-1
6551770,4,12,31,6,20452,6245,3686,13930,30977,17,41,14321,34321,23,12,1800,13,2129,16,149,900,4,-1
6551771,4,12,31,6,20452,6245,3699,11003,31003,19,61,13930,30977,17,41,1508,10,1627,11,79,196,1,-1


In [5]:
training_df.dtypes

QUARTER                  int64
MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
OP_CARRIER_AIRLINE_ID    int64
TAIL_NUM                 int64
OP_CARRIER_FL_NUM        int64
ORIGIN_AIRPORT_ID        int64
ORIGIN_CITY_MARKET_ID    int64
ORIGIN_STATE_FIPS        int64
ORIGIN_WAC               int64
DEST_AIRPORT_ID          int64
DEST_CITY_MARKET_ID      int64
DEST_STATE_FIPS          int64
DEST_WAC                 int64
CRS_DEP_TIME             int64
DEP_TIME_BLK             int64
CRS_ARR_TIME             int64
ARR_TIME_BLK             int64
CRS_ELAPSED_TIME         int64
DISTANCE                 int64
DISTANCE_GROUP           int64
DEP_DELAY_GROUP          int64
dtype: object

In [6]:
categorical_columns = [
    "OP_CARRIER_AIRLINE_ID",
    "TAIL_NUM",
    "OP_CARRIER_FL_NUM",

    "ORIGIN_AIRPORT_ID",
    "ORIGIN_CITY_MARKET_ID",
    "ORIGIN_STATE_FIPS",
    "ORIGIN_WAC",
    "DEST_AIRPORT_ID",
    "DEST_CITY_MARKET_ID",
    "DEST_STATE_FIPS",
    "DEST_WAC",
]

for column in categorical_columns:
    training_df[column] = training_df[column].astype('category')

In [7]:
training_df.dtypes

QUARTER                     int64
MONTH                       int64
DAY_OF_MONTH                int64
DAY_OF_WEEK                 int64
OP_CARRIER_AIRLINE_ID    category
TAIL_NUM                 category
OP_CARRIER_FL_NUM        category
ORIGIN_AIRPORT_ID        category
ORIGIN_CITY_MARKET_ID    category
ORIGIN_STATE_FIPS        category
ORIGIN_WAC               category
DEST_AIRPORT_ID          category
DEST_CITY_MARKET_ID      category
DEST_STATE_FIPS          category
DEST_WAC                 category
CRS_DEP_TIME                int64
DEP_TIME_BLK                int64
CRS_ARR_TIME                int64
ARR_TIME_BLK                int64
CRS_ELAPSED_TIME            int64
DISTANCE                    int64
DISTANCE_GROUP              int64
DEP_DELAY_GROUP             int64
dtype: object

## Defining functions for model evaluation

In [8]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [9]:
def evaluate_model(data_df, features_columns, target_column):
    features = data_df[features_columns]
    target = data_df[target_column]
    X_array = features.values
    y_array = target.values
    
    rmse_scores = []
    mae_scores = []
    
    kf = KFold(n_splits=2, shuffle=True, random_state=123)
    
    for train_index, test_index in kf.split(X_array):
        model = xgb.XGBRegressor(
            enable_categorical=True,
            objective='reg:squarederror', 
            eval_metric='rmse',
            random_state=123
        )
        
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        rmse_scores.append(rmse(y_test, y_pred))
        mae_scores.append(mean_absolute_error(y_test, y_pred))
        
    return np.mean(rmse_scores), np.mean(mae_scores)

## Training and Evaluation using 2-fold Cross Validation

In [10]:
rmse_score, mae_score = evaluate_model(
    data_df=training_df, 
    features_columns=features, 
    target_column=target
)

In [11]:
print("Evaluation: Root Mean Squared Error:", rmse_score)
print("Evaluation: Mean Absolute Error:", mae_score)

Evaluation: Root Mean Squared Error: 2.2055223296967736
Evaluation: Mean Absolute Error: 1.3077349064815345


## Try another minute intervall in hours than the 15 minute intervall in "DEP_DELAY_GROUP"

In [12]:
def generate_categories(intervall_in_minutes: int) -> dict:
    intervalls = {}

    intervalls[-2] = {
        "min": float('-inf'),
        "max": -15
    }

    intervalls[-1] = {
        "min": -15,
        "max": 0
    }

    for start in range(0, 181, intervall_in_minutes):
        end = start + intervall_in_minutes
        category = int(start/intervall_in_minutes)
        intervalls[category] = {
            "min": start,
            "max": end if start < 180 else float('+Inf')
        }

    return intervalls

In [13]:
# Function to categorize DEP_DELAY values
def categorize_minutes(minutes_value, intervalls):
    for category, category_limits in intervalls.items():
        if minutes_value >= category_limits["min"] and minutes_value < category_limits["max"]:
            return category
    return category

In [14]:
for i in range(1, 61):
    if 60 % i == 0:
        intervall_in_minutes = int(60/i)
        print(f"{intervall_in_minutes = }")
        intervalls = generate_categories(intervall_in_minutes=intervall_in_minutes)
        target_new = f'DEP_DELAY_GROUP_NEW_{intervall_in_minutes}'
        dataset_df[target_new] = dataset_df['DEP_DELAY'].apply(lambda x: categorize_minutes(x, intervalls))
        training_df_new = dataset_df[features+ [target_new]]
        rmse_score, mae_score = evaluate_model(
            data_df=training_df_new, 
            features_columns=features, 
            target_column=target_new
        )
        print("Evaluation: Root Mean Squared Error:", rmse_score)
        print("Evaluation: Mean Absolute Error:", mae_score)
        print("----------------------------------------------------------")

intervall_in_minutes = 60
Evaluation: Root Mean Squared Error: 0.7110691075676541
Evaluation: Mean Absolute Error: 0.5317671589915098
----------------------------------------------------------
intervall_in_minutes = 30
Evaluation: Root Mean Squared Error: 1.1729194760215647
Evaluation: Mean Absolute Error: 0.7569319338043357
----------------------------------------------------------
intervall_in_minutes = 20
Evaluation: Root Mean Squared Error: 1.6644004124474585
Evaluation: Mean Absolute Error: 1.0156142685980671
----------------------------------------------------------
intervall_in_minutes = 15
Evaluation: Root Mean Squared Error: 2.165487377248664
Evaluation: Mean Absolute Error: 1.2891948850957196
----------------------------------------------------------
intervall_in_minutes = 12
Evaluation: Root Mean Squared Error: 2.667214878422806
Evaluation: Mean Absolute Error: 1.5676227140495815
----------------------------------------------------------
intervall_in_minutes = 10
Evaluation: