## This notebook shows that we can reduce RMSE and MAE by 
- Limiting the departure delay from -15 to 179, and
- Assigning one category for early departure less than -15, and
- Assigning one category for departure delays larger than 179

## Importing the needed packages

In [1]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import xgboost as xgb
import pandas as pd
import numpy as np
import os

In [2]:
# Lift the column limit so wide DataFrames are rendered without truncation
pd.options.display.max_columns = None

## Defining features and target(s)

In [3]:
# Model inputs, listed group by group; the concatenation order is the column
# order used everywhere downstream.
features = (
    # Calendar fields
    ["QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK"]
    # Carrier, aircraft and flight number
    + ["OP_CARRIER_AIRLINE_ID", "TAIL_NUM", "OP_CARRIER_FL_NUM"]
    # Origin
    + ["ORIGIN_AIRPORT_ID", "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC"]
    # Destination
    + ["DEST_AIRPORT_ID", "DEST_CITY_MARKET_ID", "DEST_STATE_FIPS", "DEST_WAC"]
    # Schedule and distance
    + [
        "CRS_DEP_TIME",
        "DEP_TIME_BLK",
        "CRS_ARR_TIME",
        "ARR_TIME_BLK",
        "CRS_ELAPSED_TIME",
        "DISTANCE",
        "DISTANCE_GROUP",
    ]
)

# Column the later cells derive their class labels from
target = "DEP_DELAY"

## Importing the training datasets

In [4]:
# The notebook's working directory; the data lives in a sibling "data"
# directory, i.e. one level up and then into 'data'.
current_script_dir = os.getcwd()
parent_dir = os.path.dirname(current_script_dir)
data_dir = os.path.join(parent_dir, 'data')

# Full path of the encoded 2022 training dataset
training_dataset_filename = "encoded_training_dataset_2022.csv"
file_path = os.path.join(data_dir, training_dataset_filename)

# Load the CSV, then keep an independent copy restricted to the model
# features plus the target column
dataset_df = pd.read_csv(file_path)
training_df = dataset_df[[*features, target]].copy()

# Render the training frame
display(training_df)

Unnamed: 0,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_CITY_MARKET_ID,ORIGIN_STATE_FIPS,ORIGIN_WAC,DEST_AIRPORT_ID,DEST_CITY_MARKET_ID,DEST_STATE_FIPS,DEST_WAC,CRS_DEP_TIME,DEP_TIME_BLK,CRS_ARR_TIME,ARR_TIME_BLK,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,DEP_DELAY
0,1,1,1,6,20363,360,4732,10135,30135,42,23,11433,31295,26,43,1015,5,1209,7,114,425,2,-1
1,1,1,1,6,20363,360,5430,11433,31295,26,43,10135,30135,42,23,1422,9,1548,10,86,425,2,-3
2,1,1,1,6,20363,414,4671,10397,30397,13,34,14783,34783,29,64,2057,15,2149,16,112,563,3,-5
3,1,1,1,6,20363,414,5009,13487,31650,27,63,11423,31423,19,61,1041,5,1153,6,72,232,1,-3
4,1,1,1,6,20363,414,5083,11423,31423,19,61,10397,30397,13,34,1300,8,1610,11,130,743,3,-3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6551768,4,12,31,6,20452,5292,5741,12953,31703,36,22,11066,31066,39,44,830,3,1038,5,128,479,2,-7
6551769,4,12,31,6,20452,6245,3607,14321,34321,23,12,13930,30977,17,41,635,1,758,2,143,900,4,-7
6551770,4,12,31,6,20452,6245,3686,13930,30977,17,41,14321,34321,23,12,1800,13,2129,16,149,900,4,-7
6551771,4,12,31,6,20452,6245,3699,11003,31003,19,61,13930,30977,17,41,1508,10,1627,11,79,196,1,-10


In [5]:
# Inspect the column dtypes as loaded, before any categorical casting
training_df.dtypes

QUARTER                  int64
MONTH                    int64
DAY_OF_MONTH             int64
DAY_OF_WEEK              int64
OP_CARRIER_AIRLINE_ID    int64
TAIL_NUM                 int64
OP_CARRIER_FL_NUM        int64
ORIGIN_AIRPORT_ID        int64
ORIGIN_CITY_MARKET_ID    int64
ORIGIN_STATE_FIPS        int64
ORIGIN_WAC               int64
DEST_AIRPORT_ID          int64
DEST_CITY_MARKET_ID      int64
DEST_STATE_FIPS          int64
DEST_WAC                 int64
CRS_DEP_TIME             int64
DEP_TIME_BLK             int64
CRS_ARR_TIME             int64
ARR_TIME_BLK             int64
CRS_ELAPSED_TIME         int64
DISTANCE                 int64
DISTANCE_GROUP           int64
DEP_DELAY                int64
dtype: object

In [6]:
# Identifier-like columns that are cast to pandas 'category' dtype
categorical_columns = [
    # Carrier, aircraft and flight number
    "OP_CARRIER_AIRLINE_ID", "TAIL_NUM", "OP_CARRIER_FL_NUM",
    # Origin
    "ORIGIN_AIRPORT_ID", "ORIGIN_CITY_MARKET_ID", "ORIGIN_STATE_FIPS", "ORIGIN_WAC",
    # Destination
    "DEST_AIRPORT_ID", "DEST_CITY_MARKET_ID", "DEST_STATE_FIPS", "DEST_WAC",
]

# Cast every listed column in a single assignment; each column gets its own
# set of categories.
training_df[categorical_columns] = training_df[categorical_columns].astype('category')

In [7]:
# Re-check the dtypes after casting the identifier columns to 'category'
training_df.dtypes

QUARTER                     int64
MONTH                       int64
DAY_OF_MONTH                int64
DAY_OF_WEEK                 int64
OP_CARRIER_AIRLINE_ID    category
TAIL_NUM                 category
OP_CARRIER_FL_NUM        category
ORIGIN_AIRPORT_ID        category
ORIGIN_CITY_MARKET_ID    category
ORIGIN_STATE_FIPS        category
ORIGIN_WAC               category
DEST_AIRPORT_ID          category
DEST_CITY_MARKET_ID      category
DEST_STATE_FIPS          category
DEST_WAC                 category
CRS_DEP_TIME                int64
DEP_TIME_BLK                int64
CRS_ARR_TIME                int64
ARR_TIME_BLK                int64
CRS_ELAPSED_TIME            int64
DISTANCE                    int64
DISTANCE_GROUP              int64
DEP_DELAY                   int64
dtype: object

## Defining functions for model evaluation

In [8]:
def evaluate_model(data_df, features_columns, target_column, number_of_classes,
                   n_splits=2, random_state=123):
    """Cross-validate an XGBoost multi-class classifier and average its scores.

    A fresh ``xgb.XGBClassifier`` is fitted on every training fold of a
    shuffled K-fold split and scored on the matching held-out fold.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features_columns : list of str
        Names of the columns used as model inputs.
    target_column : str
        Name of the column holding the class labels.
    number_of_classes : int
        Value forwarded to the classifier as ``num_class``. A warning is
        emitted when it differs from the number of distinct labels found in
        ``target_column``.
    n_splits : int, default 2
        Number of folds.
    random_state : int, default 123
        Seed used for both the fold shuffling and the classifier.

    Returns
    -------
    tuple
        Mean accuracy, weighted recall, weighted precision and weighted F1
        over the folds, in that order.

    Notes
    -----
    NOTE(review): ``enable_categorical=True`` presumably only matters for
    columns that already carry the pandas 'category' dtype in ``data_df`` -
    confirm that the frames passed in have been cast accordingly.
    """
    import warnings

    # Local names deliberately differ from the notebook-level `features` and
    # `target` so those globals are not shadowed inside the function.
    X = data_df[features_columns]
    y = data_df[target_column]

    # Surface a mismatching class count instead of letting it pass silently.
    observed_classes = y.nunique()
    if observed_classes != number_of_classes:
        warnings.warn(
            f"number_of_classes={number_of_classes} but column "
            f"'{target_column}' holds {observed_classes} distinct labels",
            stacklevel=2,
        )

    fold_scores = {"accuracy": [], "recall": [], "precision": [], "f1": []}

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Only the number of rows is needed to generate fold indices, so the frame
    # is passed as-is rather than being converted to a NumPy array first.
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = xgb.XGBClassifier(
            enable_categorical=True,
            objective='multi:softmax',  # multi:softmax used for multi-class
            num_class=number_of_classes,
            eval_metric='merror',
            random_state=random_state,
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # zero_division=0 on every averaged metric: the value matches the
        # library default for undefined cases, without the warning.
        fold_scores["accuracy"].append(accuracy_score(y_test, y_pred))
        fold_scores["recall"].append(
            recall_score(y_test, y_pred, average='weighted', zero_division=0)
        )
        fold_scores["precision"].append(
            precision_score(y_test, y_pred, average='weighted', zero_division=0)
        )
        fold_scores["f1"].append(
            f1_score(y_test, y_pred, average='weighted', zero_division=0)
        )

    return (
        np.mean(fold_scores["accuracy"]),
        np.mean(fold_scores["recall"]),
        np.mean(fold_scores["precision"]),
        np.mean(fold_scores["f1"]),
    )

## Training and Evaluation using 2-fold Cross Validation

## 15 minutes delay groups

In [9]:
# Helper that re-bases DEP_DELAY_GROUP values
def categorize_delay_group(group):
    """Return ``group`` shifted up by two.

    NOTE(review): presumably the offset makes the lowest delay group map to
    class 0 - confirm against the value range of DEP_DELAY_GROUP.
    """
    offset = 2
    return group + offset

In [10]:
target_new = 'DEP_DELAY_Classes'

# Derive the class label from DEP_DELAY_GROUP; the new column is written onto
# dataset_df itself under `target_new`.
dataset_df[target_new] = dataset_df['DEP_DELAY_GROUP'].apply(categorize_delay_group)

# Modelling frame: the feature columns followed by the label column
training_df_new = dataset_df[[*features, target_new]]

# The class count is taken from the labels actually present in the data
acc_score, re_score, pre_score, f1 = evaluate_model(
    training_df_new,
    features,
    target_new,
    len(training_df_new[target_new].unique()),
)

print("Evaluation: Accuracy Score:", acc_score)
print("Evaluation: Recall Score:", re_score)
print("Evaluation: Precision Score:", pre_score)
print("Evaluation: F1 Score:", f1)

Evaluation: Accuracy Score: 0.5753027462959365
Evaluation: Recall Score: 0.5753027462959365
Evaluation: Precision Score: 0.45459552208305387
Evaluation: F1 Score: 0.47502272996586914


## 30 minutes delay groups

In [11]:
# Map a DEP_DELAY value (minutes) to a class label using 30-minute bins
def categorize_minutes(minutes_value):
    """Return the class label (0-8) for a departure delay in minutes.

    Label 0 is anything below -15, label 1 covers [-15, 0), labels 2-7 are the
    30-minute bins from 0 up to (but excluding) 180, and label 8 is 180 or
    more. A value for which no comparison holds (e.g. NaN) yields None.
    """
    # Exclusive upper bound of labels 0..7, in ascending order
    upper_bounds = (-15, 0, 30, 60, 90, 120, 150, 180)
    for class_id, bound in enumerate(upper_bounds):
        if minutes_value < bound:
            return class_id
    if minutes_value >= upper_bounds[-1]:
        return len(upper_bounds)
    return None

In [12]:
target_new = 'DEP_DELAY_Classes'

# Bin DEP_DELAY (minutes) into the classes defined by categorize_minutes; the
# new column is written onto dataset_df under `target_new`.
dataset_df[target_new] = dataset_df['DEP_DELAY'].apply(categorize_minutes)

training_df_new = dataset_df[features + [target_new]]

# The 30-minute categorize_minutes emits labels 0..8, i.e. nine classes, so
# the class count is read from the data instead of being hard-coded as 8.
acc_score, re_score, pre_score, f1 = evaluate_model(
    data_df=training_df_new,
    features_columns=features,
    target_column=target_new,
    number_of_classes=training_df_new[target_new].nunique()
)

print("Evaluation: Accuracy Score:", acc_score)
print("Evaluation: Recall Score:", re_score)
print("Evaluation: Precision Score:", pre_score)
print("Evaluation: F1 Score:", f1)

Evaluation: Accuracy Score: 0.6047434793755357
Evaluation: Recall Score: 0.6047434793755357
Evaluation: Precision Score: 0.5351992866976009
Evaluation: F1 Score: 0.5381718417137715


## 60 minutes delay groups

In [13]:
# Map a DEP_DELAY value (minutes) to a class label using 60-minute bins
def categorize_minutes(minutes_value):
    """Return the class label (0-5) for a departure delay in minutes.

    Label 0 is anything below -15, label 1 covers [-15, 0), labels 2-4 are the
    60-minute bins from 0 up to (but excluding) 180, and label 5 is 180 or
    more. A value for which no comparison holds (e.g. NaN) yields None.
    """
    # Exclusive upper bound of labels 0..4, in ascending order
    upper_bounds = (-15, 0, 60, 120, 180)
    for class_id, bound in enumerate(upper_bounds):
        if minutes_value < bound:
            return class_id
    if minutes_value >= upper_bounds[-1]:
        return len(upper_bounds)
    return None

In [14]:
target_new = 'DEP_DELAY_Classes'

# Bin DEP_DELAY (minutes) with the 60-minute categorize_minutes; the new
# column is written onto dataset_df under `target_new`.
dataset_df[target_new] = dataset_df['DEP_DELAY'].apply(categorize_minutes)

# Modelling frame: the feature columns followed by the label column
training_df_new = dataset_df[[*features, target_new]]

# Six classes: labels 0..5 from the 60-minute binning
acc_score, re_score, pre_score, f1 = evaluate_model(
    training_df_new,
    features,
    target_new,
    6,
)

print("Evaluation: Accuracy Score:", acc_score)
print("Evaluation: Recall Score:", re_score)
print("Evaluation: Precision Score:", pre_score)
print("Evaluation: F1 Score:", f1)

Evaluation: Accuracy Score: 0.6376139710984139
Evaluation: Recall Score: 0.6376139710984139
Evaluation: Precision Score: 0.6079699287593154
Evaluation: F1 Score: 0.6001738429947296


## 45 minutes delay groups

In [15]:
# Map a DEP_DELAY value (minutes) to a class label using 45-minute bins
def categorize_minutes(minutes_value):
    """Return the class label (0-6) for a departure delay in minutes.

    Label 0 is anything below -15, label 1 covers [-15, 0), labels 2-5 are the
    45-minute bins from 0 up to (but excluding) 180, and label 6 is 180 or
    more. A value for which no comparison holds (e.g. NaN) yields None.
    """
    # Exclusive upper bound of labels 0..5, in ascending order
    upper_bounds = (-15, 0, 45, 90, 135, 180)
    for class_id, bound in enumerate(upper_bounds):
        if minutes_value < bound:
            return class_id
    if minutes_value >= upper_bounds[-1]:
        return len(upper_bounds)
    return None

In [16]:
target_new = 'DEP_DELAY_Classes'

# Bin DEP_DELAY (minutes) with the 45-minute categorize_minutes; the new
# column is written onto dataset_df under `target_new`.
dataset_df[target_new] = dataset_df['DEP_DELAY'].apply(categorize_minutes)

# Modelling frame: the feature columns followed by the label column
training_df_new = dataset_df[[*features, target_new]]

# Seven classes: labels 0..6 from the 45-minute binning
acc_score, re_score, pre_score, f1 = evaluate_model(
    training_df_new,
    features,
    target_new,
    7,
)

print("Evaluation: Accuracy Score:", acc_score)
print("Evaluation: Recall Score:", re_score)
print("Evaluation: Precision Score:", pre_score)
print("Evaluation: F1 Score:", f1)

Evaluation: Accuracy Score: 0.6246061944163059
Evaluation: Recall Score: 0.6246061944163059
Evaluation: Precision Score: 0.5820147394600269
Evaluation: F1 Score: 0.5761592737766624
