This code contains the model training and predictions for flights arrival without considering the previous flight status.

In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
# import graphviz
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
# Merging all datasets together to train a single model
# Note: Initially we thought of making individual models for each airport, so, we made different datasets.
df = pd.DataFrame()
for csvs in ["jfk_full_data_weatherbit.csv", "ord_full_data_weatherbit.csv", "mco_full_data_weatherbit.csv"]:
    df_ = pd.read_csv("jfk_full_data_weatherbit.csv")
    df = pd.concat([df, df_])

df.head()

Unnamed: 0,carrier_code,date,flight_number,tail_number,origin_airport,scheduled_arrival_time,actual_arrival_time,scheduled_elapsed_time,actual_elapsed_time,arrival_delay,...,precip_rate_y,rh_y,wind_spd_y,wind_gust_spd_y,wind_dir_y,weather_description_y,pres_y,slp_y,vis_y,snow_rate_y
0,9E,2021-01-01,4851,N296PQ,JFK,18:45,18:29,76,68,-16,...,1.0,80,2.85,5.2,seg_0,light rain,1026,1027,16,0.0
1,B6,2021-01-01,2516,N334JB,JFK,19:04,19:17,64,80,13,...,0.5,82,2.93,5.6,seg_0,overcast clouds,1026,1026,16,0.0
2,B6,2021-01-02,2516,N306JB,JFK,19:04,18:54,64,60,-10,...,0.0,57,6.32,7.2,seg_6,scattered clouds,1016,1016,16,0.0
3,9E,2021-01-02,4851,N918XJ,JFK,21:28,20:59,83,64,-29,...,0.0,64,5.8,6.4,seg_6,broken clouds,1017,1018,16,0.0
4,B6,2021-01-03,2516,N334JB,JFK,19:04,18:56,64,64,-8,...,1.0,90,5.82,10.4,seg_0,light rain,1013,1013,14,0.0


In [3]:
df.shape

(11361, 60)

In [4]:
df.columns

Index(['carrier_code', 'date', 'flight_number', 'tail_number',
       'origin_airport', 'scheduled_arrival_time', 'actual_arrival_time',
       'scheduled_elapsed_time', 'actual_elapsed_time', 'arrival_delay',
       'wheels_on_time', 'taxi_in_time', 'delay_carrier', 'delay_weather',
       'delay_national_aviation_system', 'delay_security',
       'delay_late_aircraft_arrival', 'status',
       'scheduled_arrival_time_period', 'month', 'day_of_week', 'weekend',
       'snowy_month', 'is_fed_holiday', 'datetime', 'prev_flight_delay',
       'prev_flight_time_difference', 'peak_hour', 'temp_x', 'app_temp_x',
       'clouds_x', 'precip_rate_x', 'rh_x', 'wind_spd_x', 'wind_gust_spd_x',
       'wind_dir_x', 'weather_description_x', 'pres_x', 'slp_x', 'vis_x',
       'snow_rate_x', 'destination_airport', 'scheduled_departure_time',
       'actual_departure_time', 'departure_delay', 'wheels_off_time',
       'taxi_out_time', 'temp_y', 'app_temp_y', 'clouds_y', 'precip_rate_y',
       'rh_y',

# Data Preprocessing

In [5]:
# Merge airport-wise statistical features
df_stats_airport = pd.read_csv("data/stats/airport_stats.csv")
df_stats_airport.head()

Unnamed: 0,Origin Airport,Percentage,Busiest Time
0,ALB,0.0,19:25:00
1,ATL,6.19,08:55:00
2,BNA,14.77,08:30:00
3,BOS,6.89,21:45:00
4,BWI,5.79,10:50:00


In [6]:
# Merge airlines-wise statistical features
df_stats_carrier = pd.read_csv("data/stats/carrier_code_stats.csv")
df_stats_carrier.head()

Unnamed: 0,Carrier Code,Percentage
0,9E,9.67
1,AA,19.56
2,B6,27.2
3,DL,13.7
4,EV,23.96


In [7]:
# Comvert to snake_case
def format_column_names(x: str):
    x = x.lower()
    x = x.replace(" ", "_")
    x = x.replace("-", "_")
    return x

column_names = [format_column_names(x) for x in df_stats_airport.columns]

mapper = {}
for k, v in zip(df_stats_airport.columns, column_names):
    mapper[k] = v

df_stats_airport.rename(columns=mapper, inplace=True)

column_names = [format_column_names(x) for x in df_stats_carrier.columns]

mapper = {}
for k, v in zip(df_stats_carrier.columns, column_names):
    mapper[k] = v

df_stats_carrier.rename(columns=mapper, inplace=True)

df_stats_airport.head()

df_stats_carrier.head()

Unnamed: 0,origin_airport,percentage,busiest_time
0,ALB,0.0,19:25:00
1,ATL,6.19,08:55:00
2,BNA,14.77,08:30:00
3,BOS,6.89,21:45:00
4,BWI,5.79,10:50:00


Unnamed: 0,carrier_code,percentage
0,9E,9.67
1,AA,19.56
2,B6,27.2
3,DL,13.7
4,EV,23.96


In [8]:
# merge
df = pd.merge(df, df_stats_airport, on=["origin_airport"], how="inner")
df = pd.merge(df, df_stats_carrier, on=["carrier_code"], how="inner")
df

Unnamed: 0,carrier_code,date,flight_number,tail_number,origin_airport,scheduled_arrival_time,actual_arrival_time,scheduled_elapsed_time,actual_elapsed_time,arrival_delay,...,wind_gust_spd_y,wind_dir_y,weather_description_y,pres_y,slp_y,vis_y,snow_rate_y,percentage_x,busiest_time,percentage_y
0,9E,2021-01-01,4851,N296PQ,JFK,18:45,18:29,76,68,-16,...,5.20,seg_0,light rain,1026,1027,16,0.00,8.30,22:30:00,9.67
1,9E,2021-01-02,4851,N918XJ,JFK,21:28,20:59,83,64,-29,...,6.40,seg_6,broken clouds,1017,1018,16,0.00,8.30,22:30:00,9.67
2,9E,2021-01-06,4851,N368CA,JFK,21:17,21:04,78,65,-13,...,9.60,seg_6,scattered clouds,1016,1017,16,0.00,8.30,22:30:00,9.67
3,9E,2021-01-07,4851,N305PQ,JFK,21:17,20:54,78,64,-23,...,7.20,seg_7,few clouds,1018,1019,16,0.00,8.30,22:30:00,9.67
4,9E,2021-01-08,4851,N340CA,JFK,21:17,20:53,78,59,-24,...,8.40,seg_7,overcast clouds,1011,1012,16,0.00,8.30,22:30:00,9.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11356,YX,2023-05-22,5859,N234JQ,JFK,15:03,14:36,94,72,-27,...,5.80,seg_2,overcast clouds,1021,1022,16,0.00,8.30,22:30:00,11.58
11357,YX,2023-05-23,5859,N230JQ,JFK,15:03,14:32,94,74,-31,...,8.20,seg_2,overcast clouds,1026,1027,16,0.00,8.30,22:30:00,11.58
11358,YX,2023-05-24,5859,N234JQ,JFK,15:03,14:24,94,61,-39,...,6.00,seg_3,scattered clouds,1015,1016,16,0.00,8.30,22:30:00,11.58
11359,YX,2023-06-18,5689,N243JQ,JFK,00:21,00:02,91,76,-19,...,2.40,seg_4,broken clouds,1012,1013,16,0.00,8.30,22:30:00,11.58


In [9]:
df.columns

Index(['carrier_code', 'date', 'flight_number', 'tail_number',
       'origin_airport', 'scheduled_arrival_time', 'actual_arrival_time',
       'scheduled_elapsed_time', 'actual_elapsed_time', 'arrival_delay',
       'wheels_on_time', 'taxi_in_time', 'delay_carrier', 'delay_weather',
       'delay_national_aviation_system', 'delay_security',
       'delay_late_aircraft_arrival', 'status',
       'scheduled_arrival_time_period', 'month', 'day_of_week', 'weekend',
       'snowy_month', 'is_fed_holiday', 'datetime', 'prev_flight_delay',
       'prev_flight_time_difference', 'peak_hour', 'temp_x', 'app_temp_x',
       'clouds_x', 'precip_rate_x', 'rh_x', 'wind_spd_x', 'wind_gust_spd_x',
       'wind_dir_x', 'weather_description_x', 'pres_x', 'slp_x', 'vis_x',
       'snow_rate_x', 'destination_airport', 'scheduled_departure_time',
       'actual_departure_time', 'departure_delay', 'wheels_off_time',
       'taxi_out_time', 'temp_y', 'app_temp_y', 'clouds_y', 'precip_rate_y',
       'rh_y',

In [10]:
# If the scheduled_departure_time falls within 15 min range of busiest_time, consider it as peak time
df["busiest_time"] = abs(pd.to_datetime(df["scheduled_departure_time"]) - pd.to_datetime(df["busiest_time"])) <= pd.to_timedelta(0.25, unit="hour")
df["busiest_time"] = df["busiest_time"].astype(int)
df.head()

  df["busiest_time"] = abs(pd.to_datetime(df["scheduled_departure_time"]) - pd.to_datetime(df["busiest_time"])) <= pd.to_timedelta(0.25, unit="hour")


Unnamed: 0,carrier_code,date,flight_number,tail_number,origin_airport,scheduled_arrival_time,actual_arrival_time,scheduled_elapsed_time,actual_elapsed_time,arrival_delay,...,wind_gust_spd_y,wind_dir_y,weather_description_y,pres_y,slp_y,vis_y,snow_rate_y,percentage_x,busiest_time,percentage_y
0,9E,2021-01-01,4851,N296PQ,JFK,18:45,18:29,76,68,-16,...,5.2,seg_0,light rain,1026,1027,16,0.0,8.3,0,9.67
1,9E,2021-01-02,4851,N918XJ,JFK,21:28,20:59,83,64,-29,...,6.4,seg_6,broken clouds,1017,1018,16,0.0,8.3,0,9.67
2,9E,2021-01-06,4851,N368CA,JFK,21:17,21:04,78,65,-13,...,9.6,seg_6,scattered clouds,1016,1017,16,0.0,8.3,0,9.67
3,9E,2021-01-07,4851,N305PQ,JFK,21:17,20:54,78,64,-23,...,7.2,seg_7,few clouds,1018,1019,16,0.0,8.3,0,9.67
4,9E,2021-01-08,4851,N340CA,JFK,21:17,20:53,78,59,-24,...,8.4,seg_7,overcast clouds,1011,1012,16,0.0,8.3,0,9.67


In [11]:
# Remove unnecessary features from the data
# Note: We didn't remove them in the departures data, so, we have to remove them here
unnecessary_features = ["carrier_code", "date", "flight_number", "tail_number", "origin_airport", 'scheduled_arrival_time',
       'actual_arrival_time', 
       'actual_elapsed_time', 'wheels_on_time', 'taxi_in_time', 'delay_carrier',
       'delay_weather', 'delay_national_aviation_system',
       'delay_security', 'delay_late_aircraft_arrival', "arrival_delay", "datetime", "destination_airport",
       "destination_airport", "scheduled_departure_time", "actual_departure_time", "departure_delay", "wheels_off_time", "taxi_out_time",
       ]

df.drop(columns=unnecessary_features, inplace=True)

In [12]:
df.head()
df.columns

Unnamed: 0,scheduled_elapsed_time,status,scheduled_arrival_time_period,month,day_of_week,weekend,snowy_month,is_fed_holiday,prev_flight_delay,prev_flight_time_difference,...,wind_gust_spd_y,wind_dir_y,weather_description_y,pres_y,slp_y,vis_y,snow_rate_y,percentage_x,busiest_time,percentage_y
0,76,early,evening,January,Friday,0,1,1,on_time,1,...,5.2,seg_0,light rain,1026,1027,16,0.0,8.3,0,9.67
1,83,early,night,January,Saturday,1,1,0,late,1,...,6.4,seg_6,broken clouds,1017,1018,16,0.0,8.3,0,9.67
2,78,early,night,January,Wednesday,0,1,0,late,1,...,9.6,seg_6,scattered clouds,1016,1017,16,0.0,8.3,0,9.67
3,78,early,night,January,Thursday,0,1,0,early,1,...,7.2,seg_7,few clouds,1018,1019,16,0.0,8.3,0,9.67
4,78,early,night,January,Friday,0,1,0,late,1,...,8.4,seg_7,overcast clouds,1011,1012,16,0.0,8.3,0,9.67


Index(['scheduled_elapsed_time', 'status', 'scheduled_arrival_time_period',
       'month', 'day_of_week', 'weekend', 'snowy_month', 'is_fed_holiday',
       'prev_flight_delay', 'prev_flight_time_difference', 'peak_hour',
       'temp_x', 'app_temp_x', 'clouds_x', 'precip_rate_x', 'rh_x',
       'wind_spd_x', 'wind_gust_spd_x', 'wind_dir_x', 'weather_description_x',
       'pres_x', 'slp_x', 'vis_x', 'snow_rate_x', 'temp_y', 'app_temp_y',
       'clouds_y', 'precip_rate_y', 'rh_y', 'wind_spd_y', 'wind_gust_spd_y',
       'wind_dir_y', 'weather_description_y', 'pres_y', 'slp_y', 'vis_y',
       'snow_rate_y', 'percentage_x', 'busiest_time', 'percentage_y'],
      dtype='object')

In [13]:
df.isna().sum().to_dict()

{'scheduled_elapsed_time': 0,
 'status': 0,
 'scheduled_arrival_time_period': 0,
 'month': 0,
 'day_of_week': 0,
 'weekend': 0,
 'snowy_month': 0,
 'is_fed_holiday': 0,
 'prev_flight_delay': 0,
 'prev_flight_time_difference': 0,
 'peak_hour': 0,
 'temp_x': 0,
 'app_temp_x': 0,
 'clouds_x': 0,
 'precip_rate_x': 0,
 'rh_x': 0,
 'wind_spd_x': 0,
 'wind_gust_spd_x': 0,
 'wind_dir_x': 0,
 'weather_description_x': 0,
 'pres_x': 0,
 'slp_x': 0,
 'vis_x': 0,
 'snow_rate_x': 0,
 'temp_y': 0,
 'app_temp_y': 0,
 'clouds_y': 0,
 'precip_rate_y': 0,
 'rh_y': 0,
 'wind_spd_y': 0,
 'wind_gust_spd_y': 0,
 'wind_dir_y': 0,
 'weather_description_y': 0,
 'pres_y': 0,
 'slp_y': 0,
 'vis_y': 0,
 'snow_rate_y': 0,
 'percentage_x': 0,
 'busiest_time': 0,
 'percentage_y': 0}

In [14]:
df.dropna(inplace=True)

In [15]:
df.columns

Index(['scheduled_elapsed_time', 'status', 'scheduled_arrival_time_period',
       'month', 'day_of_week', 'weekend', 'snowy_month', 'is_fed_holiday',
       'prev_flight_delay', 'prev_flight_time_difference', 'peak_hour',
       'temp_x', 'app_temp_x', 'clouds_x', 'precip_rate_x', 'rh_x',
       'wind_spd_x', 'wind_gust_spd_x', 'wind_dir_x', 'weather_description_x',
       'pres_x', 'slp_x', 'vis_x', 'snow_rate_x', 'temp_y', 'app_temp_y',
       'clouds_y', 'precip_rate_y', 'rh_y', 'wind_spd_y', 'wind_gust_spd_y',
       'wind_dir_y', 'weather_description_y', 'pres_y', 'slp_y', 'vis_y',
       'snow_rate_y', 'percentage_x', 'busiest_time', 'percentage_y'],
      dtype='object')

In [16]:
# Consider few unnecessary features as a part of feature selection
# Note: For this model, we don't want prev_flight_delay and prev_flight_time_difference columns
# as we aren't doing conditional predictions here
unnecessary_features = ["scheduled_arrival_time_period", 'prev_flight_delay', 'prev_flight_time_difference', 
                        "wind_dir_x", "wind_dir_y"
                        ]

df.drop(columns=unnecessary_features, inplace=True)
df.head()
df.columns

Unnamed: 0,scheduled_elapsed_time,status,month,day_of_week,weekend,snowy_month,is_fed_holiday,peak_hour,temp_x,app_temp_x,...,wind_spd_y,wind_gust_spd_y,weather_description_y,pres_y,slp_y,vis_y,snow_rate_y,percentage_x,busiest_time,percentage_y
0,76,early,January,Friday,0,1,1,1,0.0,-4.3,...,2.85,5.2,light rain,1026,1027,16,0.0,8.3,0,9.67
1,83,early,January,Saturday,1,1,0,0,0.6,-0.3,...,5.8,6.4,broken clouds,1017,1018,16,0.0,8.3,0,9.67
2,78,early,January,Wednesday,0,1,0,0,-0.1,-3.8,...,8.9,9.6,scattered clouds,1016,1017,16,0.0,8.3,0,9.67
3,78,early,January,Thursday,0,1,0,0,-2.4,-6.2,...,5.37,7.2,few clouds,1018,1019,16,0.0,8.3,0,9.67
4,78,early,January,Friday,0,1,0,0,-7.8,-12.6,...,7.57,8.4,overcast clouds,1011,1012,16,0.0,8.3,0,9.67


Index(['scheduled_elapsed_time', 'status', 'month', 'day_of_week', 'weekend',
       'snowy_month', 'is_fed_holiday', 'peak_hour', 'temp_x', 'app_temp_x',
       'clouds_x', 'precip_rate_x', 'rh_x', 'wind_spd_x', 'wind_gust_spd_x',
       'weather_description_x', 'pres_x', 'slp_x', 'vis_x', 'snow_rate_x',
       'temp_y', 'app_temp_y', 'clouds_y', 'precip_rate_y', 'rh_y',
       'wind_spd_y', 'wind_gust_spd_y', 'weather_description_y', 'pres_y',
       'slp_y', 'vis_y', 'snow_rate_y', 'percentage_x', 'busiest_time',
       'percentage_y'],
      dtype='object')

In [17]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

def get_ohe(df, col):
    if df[col].dtype == 'object':  # Check if the column contains string values
        # Ignores the new elements during testing / prediction
        ohe = OneHotEncoder(drop='first', handle_unknown='ignore',sparse_output=False, dtype='int')
        ohe.fit(df[[col]])
        temp_df = pd.DataFrame(data=ohe.transform(df[[col]]), columns=ohe.get_feature_names_out())
        df.drop(columns=[col], axis=1, inplace=True)
        df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
        return df, ohe

In [18]:
df.dtypes

scheduled_elapsed_time      int64
status                     object
month                      object
day_of_week                object
weekend                     int64
snowy_month                 int64
is_fed_holiday              int64
peak_hour                   int64
temp_x                    float64
app_temp_x                float64
clouds_x                    int64
precip_rate_x             float64
rh_x                        int64
wind_spd_x                float64
wind_gust_spd_x           float64
weather_description_x      object
pres_x                      int64
slp_x                       int64
vis_x                       int64
snow_rate_x               float64
temp_y                    float64
app_temp_y                float64
clouds_y                    int64
precip_rate_y             float64
rh_y                        int64
wind_spd_y                float64
wind_gust_spd_y           float64
weather_description_y      object
pres_y                      int64
slp_y         

In [19]:
# One hot encode the categorical variables
cols = ["weather_description_x", "weather_description_y", "day_of_week", "month"]

one_hots = {}

for col in cols:
    df, ohe = get_ohe(df, col)
    one_hots[col] = ohe

df.head()

Unnamed: 0,scheduled_elapsed_time,status,weekend,snowy_month,is_fed_holiday,peak_hour,temp_x,app_temp_x,clouds_x,precip_rate_x,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,76,early,0,1,1,1,0.0,-4.3,100,1.75,...,0,0,1,0,0,0,0,0,0,0
1,83,early,1,1,0,0,0.6,-0.3,100,0.0,...,0,0,1,0,0,0,0,0,0,0
2,78,early,0,1,0,0,-0.1,-3.8,100,0.0,...,0,0,1,0,0,0,0,0,0,0
3,78,early,0,1,0,0,-2.4,-6.2,100,0.0,...,0,0,1,0,0,0,0,0,0,0
4,78,early,0,1,0,0,-7.8,-12.6,43,0.0,...,0,0,1,0,0,0,0,0,0,0


In [20]:
# Split the data into train & test sets
X_train, X_test, y_train, y_test = train_test_split(
                            df.drop(['status'], axis=1), 
                            df['status'], 
                            test_size=0.20, 
                            random_state = 35,
                            stratify=df['status'])

In [21]:
original_cols = X_train.columns

# Data Standardization
## PCA

In [22]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 20, random_state=42)
X_train = pd.DataFrame(pca.fit_transform(X_train), index = X_train.index)
X_test = pd.DataFrame(pca.transform(X_test), index = X_test.index)

## Standard Scaler

In [23]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)

# Model training
## Bagging Classifier

In [24]:
bag = BaggingClassifier(random_state=50, n_estimators = 50, n_jobs = 10)

bag = bag.fit(X_train, y_train) 
bag.score(X_train, y_train) 

test_output = pd.DataFrame(bag.predict(X_test), index = X_test.index, columns = ['pred_Y'])

# test_output.head()
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
bag.score(X_test, y_test) 
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

1.0

Unnamed: 0,pred_Y,status
6510,early,early
10504,late,late
9117,late,late
3830,early,early
5674,early,early


Fraction of correct classification 


0.9841619005719313

0.9841619005719313

In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.9804341315396824

0.9838009355706823

0.9820227403205329

## Gradient Boosting Classifier

In [26]:
gb = GradientBoostingClassifier(random_state=50, min_samples_split = 12, 
                                min_samples_leaf = 6, max_depth = 4, 
                                n_estimators = 100, learning_rate=0.1)

gb = gb.fit(X_train, y_train) 
gb.score(X_train, y_train)

test_output = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_Y'])

# test_output.head()
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
gb.score(X_test, y_test) 
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

0.7989656690140845

Unnamed: 0,pred_Y,status
6510,early,early
10504,early,late
9117,late,late
3830,early,early
5674,early,early


Fraction of correct classification 


0.7351517817861857

0.7351517817861857

In [27]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.7923803031778491

0.5896635610164296

0.6257116283344623

## Grid Search for hyperparameter tuning

In [28]:
# from sklearn.model_selection import GridSearchCV

# # Define the grid of hyperparameters to search
# param_grid = {
#     # 'loss': ['log_loss', 'deviance', 'exponential'],
#     'min_samples_split': [5, 10, 15, 20],
#     'min_samples_leaf': [2, 5, 7, 10],
#     'max_depth': [3, 7, 9],
#     'learning_rate': [0.01, 0.05, 0.1, 0,3, 0.5]
# }

# # Create an MLPClassifier object
# clf = GradientBoostingClassifier(random_state=50)

# # Create a GridSearchCV object
# grid_search = GridSearchCV(clf, param_grid, cv=5)

# grid_search.fit(X_train, y_train)

# print(grid_search.best_params_)
# print(grid_search.best_score_)

## Randon Forest Classifier

In [29]:
rf = RandomForestClassifier(random_state=50, min_samples_split = 12, min_samples_leaf = 6, max_features = "sqrt", n_estimators = 100)

rf = rf.fit(X_train, y_train) 
rf.score(X_train, y_train) 

# rf.feature_importances_
# feat_imp = pd.Series(rf.feature_importances_, X_train.columns.values).sort_values(ascending=False)

# feat_imp_table = pd.DataFrame(feat_imp)
# feat_imp_table = feat_imp_table.reset_index()
# feat_imp_table.columns = ['Features', 'Values']
# feat_imp.plot(kind='bar', title='Feature Importances')
# plt.ylabel('Feature Importance Score')
# plt.figure(figsize=[40,20], dpi = 50)
# feat_imp.head(12)

test_output = pd.DataFrame(rf.predict(X_test), index = X_test.index, columns = ['pred_Y'])

# test_output.head()
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
rf.score(X_test, y_test) 
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

0.9773327464788732

Unnamed: 0,pred_Y,status
6510,early,early
10504,late,late
9117,late,late
3830,early,early
5674,early,early


Fraction of correct classification 


0.8957325120985482

0.8957325120985482

In [30]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.9393969218861038

0.8348446996247803

0.8756060454709788

## Logistic Regression

In [31]:
lr_model = LogisticRegression(fit_intercept = True, solver='lbfgs', multi_class = 'auto', penalty = 'l2', max_iter = 1000)
lr_model.fit(X_train, y_train) 

# The following gives the mean accuracy on the given data and labels
lr_model.score(X_train, y_train) 

test_output = pd.DataFrame(lr_model.predict(X_test), index = X_test.index, columns = ['pred_Y'])

test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Percentage of correct predictions is ')
print(lr_model.score(X_test, y_test))
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

0.6145466549295775

Unnamed: 0,pred_Y,status
6510,early,early
10504,early,late
9117,early,late
3830,early,early
5674,early,early


Percentage of correct predictions is 
0.612846458424989


0.612846458424989

In [32]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.49041794839727815

0.4148341663937389

0.38618870550070045

In [33]:
lr_model = LogisticRegression(fit_intercept = True, solver='newton-cg', multi_class = 'auto', penalty = 'l2', max_iter = 1000)
lr_model.fit(X_train, y_train) 

# The following gives the mean accuracy on the given data and labels
lr_model.score(X_train, y_train) 

test_output = pd.DataFrame(lr_model.predict(X_test), index = X_test.index, columns = ['pred_Y'])

test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Percentage of correct predictions is ')
print(lr_model.score(X_test, y_test))
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

0.6145466549295775

Unnamed: 0,pred_Y,status
6510,early,early
10504,early,late
9117,early,late
3830,early,early
5674,early,early


Percentage of correct predictions is 
0.612846458424989


0.612846458424989

In [34]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.49041794839727815

0.4148341663937389

0.38618870550070045

In [35]:
lr_model = LogisticRegression(fit_intercept = True, solver='newton-cg', multi_class = 'multinomial', penalty = 'l2', max_iter = 1000)
lr_model.fit(X_train, y_train) 

# The following gives the mean accuracy on the given data and labels
lr_model.score(X_train, y_train) 

test_output = pd.DataFrame(lr_model.predict(X_test), index = X_test.index, columns = ['pred_Y'])

test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Percentage of correct predictions is ')
print(lr_model.score(X_test, y_test))

0.6145466549295775

Unnamed: 0,pred_Y,status
6510,early,early
10504,early,late
9117,early,late
3830,early,early
5674,early,early


Percentage of correct predictions is 
0.612846458424989


In [36]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.49041794839727815

0.4148341663937389

0.38618870550070045

## XGBoost

In [37]:
import xgboost as xgb

In [38]:
y_train.astype("category").dtypes

CategoricalDtype(categories=['early', 'late', 'on_time'], ordered=False, categories_dtype=object)

In [39]:
y_train.unique()

array(['late', 'early', 'on_time'], dtype=object)

In [40]:
y_train_xgb = y_train.map({"early": 0, "on_time": 1, "late": 2})
y_test_xgb = y_test.map({"early": 0, "on_time": 1, "late": 2})

In [41]:
dtrain_class = xgb.DMatrix(X_train, y_train_xgb, enable_categorical=True)

dtest_class = xgb.DMatrix(X_test, y_test_xgb, enable_categorical=True)

In [42]:
params = {
    "objective": "multi:softmax", 
    "tree_method": "exact", 
    "max_depth" : 10, 
    "learning_rate" : 0.4,
    "num_class": 3,
    "max_leaves": 4
    } # use "tree_method" : "hist" if you need speed

n = 100

model = xgb.train(
   params = params,
   dtrain = dtrain_class,
   num_boost_round = n,
)

In [43]:
test_output = pd.DataFrame(model.predict(dtest_class), index = X_test.index, columns = ['pred_Y'])

test_output = test_output.merge(y_test_xgb, left_index = True, right_index = True)
test_output.head()

sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

Unnamed: 0,pred_Y,status
6510,0.0,0
10504,2.0,2
9117,2.0,2
3830,0.0,0
5674,0.0,0


0.9854817421909371

In [44]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.9833721423191834

0.9832219665061573

0.9832902514538499

## Tuning - XGBoost

In [45]:
# learning_rate_range = np.arange(0.01, 1, 0.05)
# test_XG = [] 
# train_XG = []
# for lr in learning_rate_range:
#     xgb_classifier = xgb.XGBClassifier(eta = lr)
#     xgb_classifier.fit(X_train, y_train_xgb)
#     train_XG.append(xgb_classifier.score(X_train, y_train_xgb))
#     test_XG.append(xgb_classifier.score(X_test, y_test_xgb))

In [46]:
# max(test_XG)

# Load predictions file

In [47]:
pred_df = pd.read_csv("prediction_part_1.csv")
pred_df.head()

Unnamed: 0,origin_airport,scheduled_departure_time,status,arrival_status_prev_flight_early,arrival_status_prev_flight_ontime,arrival_status_prev_flight_late,scheduled_elapsed_time,scheduled_arrival_time_period,peak_hour,month,...,precip_rate_y,rh_y,wind_spd_y,wind_gust_spd_y,wind_dir_y,weather_description_y,pres_y,slp_y,vis_y,snow_rate_y
0,MCO,2024-04-19 11:35:00,,,,,165,afternoon,0,April,...,0.0,50,2.0,2.8,290,few clouds,1014.0,1016.7,24.0,0
1,JFK,2024-04-19 13:34:00,,,,,77,afternoon,0,April,...,0.0,61,5.6,7.4,120,overcast clouds,1020.0,1020.1,24.0,0
2,JFK,2024-04-19 14:55:00,,,,,86,evening,1,April,...,0.0,63,5.2,7.3,130,overcast clouds,1020.0,1020.1,24.0,0
3,MCO,2024-04-19 13:35:00,,,,,170,evening,1,April,...,0.0,40,2.8,4.2,280,few clouds,1012.5,1015.2,24.0,0
4,ORD,2024-04-19 18:52:00,,,,,175,night,0,April,...,0.0,38,7.2,11.0,280,broken clouds,996.5,1021.1,24.0,0


## Data Cleaning - Prediction

In [48]:
cols

['weather_description_x', 'weather_description_y', 'day_of_week', 'month']

In [49]:
pred_df.columns

Index(['origin_airport', 'scheduled_departure_time', 'status',
       'arrival_status_prev_flight_early', 'arrival_status_prev_flight_ontime',
       'arrival_status_prev_flight_late', 'scheduled_elapsed_time',
       'scheduled_arrival_time_period', 'peak_hour', 'month', 'day_of_week',
       'weekend', 'snowy_month', 'is_fed_holiday', 'carrier_code',
       'datetime_x', 'temp_x', 'app_temp_x', 'clouds_x', 'precip_rate_x',
       'rh_x', 'wind_spd_x', 'wind_gust_spd_x', 'wind_dir_x',
       'weather_description_x', 'pres_x', 'slp_x', 'vis_x', 'snow_rate_x',
       'datetime_y', 'temp_y', 'app_temp_y', 'clouds_y', 'precip_rate_y',
       'rh_y', 'wind_spd_y', 'wind_gust_spd_y', 'wind_dir_y',
       'weather_description_y', 'pres_y', 'slp_y', 'vis_y', 'snow_rate_y'],
      dtype='object')

In [50]:
# One hot encoding, using the previously fit encoders
for col in cols:
    ohe = one_hots[col]
    pred_df[ohe.get_feature_names_out()] = ohe.transform(pred_df[[col]])
    pred_df.drop(columns=[col], inplace=True)



In [51]:
pred_df

Unnamed: 0,origin_airport,scheduled_departure_time,status,arrival_status_prev_flight_early,arrival_status_prev_flight_ontime,arrival_status_prev_flight_late,scheduled_elapsed_time,scheduled_arrival_time_period,peak_hour,weekend,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,MCO,2024-04-19 11:35:00,,,,,165,afternoon,0,0,...,0,0,0,0,0,0,0,0,0,0
1,JFK,2024-04-19 13:34:00,,,,,77,afternoon,0,0,...,0,0,0,0,0,0,0,0,0,0
2,JFK,2024-04-19 14:55:00,,,,,86,evening,1,0,...,0,0,0,0,0,0,0,0,0,0
3,MCO,2024-04-19 13:35:00,,,,,170,evening,1,0,...,0,0,0,0,0,0,0,0,0,0
4,ORD,2024-04-19 18:52:00,,,,,175,night,0,0,...,0,0,0,0,0,0,0,0,0,0
5,ORD,2024-04-19 19:59:00,,,,,173,night,1,0,...,0,0,0,0,0,0,0,0,0,0
6,JFK,2024-04-20 13:25:00,,,,,76,afternoon,0,1,...,0,0,0,0,0,0,0,0,0,0
7,JFK,2024-04-20 14:55:00,,,,,86,evening,1,1,...,0,0,0,0,0,0,0,0,0,0
8,MCO,2024-04-20 13:35:00,,,,,170,evening,1,1,...,0,0,0,0,0,0,0,0,0,0
9,ORD,2024-04-20 18:52:00,,,,,175,night,0,1,...,0,0,0,0,0,0,0,0,0,0


In [52]:
pred_df["carrier_code"].unique()

array(['WN', 'B6', 'DL', 'UA', 'AA'], dtype=object)

In [53]:
df_stats_carrier["carrier_code"].unique()

array(['9E', 'AA', 'B6', 'DL', 'EV', 'F9', 'G4', 'MQ', 'OH', 'OO', 'UA',
       'WN', 'YV', 'YX'], dtype=object)

In [54]:
# Merge Stats
pred_df = pd.merge(pred_df, df_stats_airport, on=["origin_airport"], how="inner")

pred_df = pd.merge(pred_df, df_stats_carrier, on=["carrier_code"], how="inner")

pred_df.head()

Unnamed: 0,origin_airport,scheduled_departure_time,status,arrival_status_prev_flight_early,arrival_status_prev_flight_ontime,arrival_status_prev_flight_late,scheduled_elapsed_time,scheduled_arrival_time_period,peak_hour,weekend,...,month_July,month_June,month_March,month_May,month_November,month_October,month_September,percentage_x,busiest_time,percentage_y
0,MCO,2024-04-19 11:35:00,,,,,165,afternoon,0,0,...,0,0,0,0,0,0,0,15.52,15:20:00,25.77
1,MCO,2024-04-21 11:05:00,,,,,165,afternoon,0,1,...,0,0,0,0,0,0,0,15.52,15:20:00,25.77
2,MCO,2024-04-22 11:35:00,,,,,165,afternoon,0,0,...,0,0,0,0,0,0,0,15.52,15:20:00,25.77
3,MCO,2024-04-19 13:35:00,,,,,170,evening,1,0,...,0,0,0,0,0,0,0,15.52,15:20:00,27.2
4,MCO,2024-04-20 13:35:00,,,,,170,evening,1,1,...,0,0,0,0,0,0,0,15.52,15:20:00,27.2


In [55]:
# Drop unnecessary columns
pred_df.drop(
    columns=["origin_airport", "status", "arrival_status_prev_flight_early", "arrival_status_prev_flight_ontime", 
             "arrival_status_prev_flight_late"], 
    inplace=True)

In [56]:
# Check busiest time
pred_df["busiest_time"] = abs(pd.to_datetime(pred_df["scheduled_departure_time"]) - pd.to_datetime(pred_df["busiest_time"])) <= pd.to_timedelta(0.25, unit="hour")
pred_df["busiest_time"] = pred_df["busiest_time"].astype(int)
pred_df.drop(columns=["scheduled_departure_time"], inplace=True)
pred_df.head()

  pred_df["busiest_time"] = abs(pd.to_datetime(pred_df["scheduled_departure_time"]) - pd.to_datetime(pred_df["busiest_time"])) <= pd.to_timedelta(0.25, unit="hour")


Unnamed: 0,scheduled_elapsed_time,scheduled_arrival_time_period,peak_hour,weekend,snowy_month,is_fed_holiday,carrier_code,datetime_x,temp_x,app_temp_x,...,month_July,month_June,month_March,month_May,month_November,month_October,month_September,percentage_x,busiest_time,percentage_y
0,165,afternoon,0,0,0,0,WN,2024-04-19 14:00:00,16.3,16.3,...,0,0,0,0,0,0,0,15.52,0,25.77
1,165,afternoon,0,1,0,0,WN,2024-04-21 14:00:00,10.8,10.8,...,0,0,0,0,0,0,0,15.52,0,25.77
2,165,afternoon,0,0,0,0,WN,2024-04-22 14:00:00,13.4,13.4,...,0,0,0,0,0,0,0,15.52,0,25.77
3,170,evening,1,0,0,0,B6,2024-04-19 16:00:00,15.9,15.9,...,0,0,0,0,0,0,0,15.52,0,27.2
4,170,evening,1,1,0,0,B6,2024-04-20 16:00:00,10.3,10.3,...,0,0,0,0,0,0,0,15.52,0,27.2


In [57]:
pred_df.columns.to_list()

['scheduled_elapsed_time',
 'scheduled_arrival_time_period',
 'peak_hour',
 'weekend',
 'snowy_month',
 'is_fed_holiday',
 'carrier_code',
 'datetime_x',
 'temp_x',
 'app_temp_x',
 'clouds_x',
 'precip_rate_x',
 'rh_x',
 'wind_spd_x',
 'wind_gust_spd_x',
 'wind_dir_x',
 'pres_x',
 'slp_x',
 'vis_x',
 'snow_rate_x',
 'datetime_y',
 'temp_y',
 'app_temp_y',
 'clouds_y',
 'precip_rate_y',
 'rh_y',
 'wind_spd_y',
 'wind_gust_spd_y',
 'wind_dir_y',
 'pres_y',
 'slp_y',
 'vis_y',
 'snow_rate_y',
 'weather_description_x_clear_sky',
 'weather_description_x_few_clouds',
 'weather_description_x_flurries',
 'weather_description_x_fog',
 'weather_description_x_freezing_rain',
 'weather_description_x_haze',
 'weather_description_x_heavy_rain',
 'weather_description_x_heavy_snow',
 'weather_description_x_light_rain',
 'weather_description_x_light_snow',
 'weather_description_x_mix_snow_rain',
 'weather_description_x_moderate_rain',
 'weather_description_x_overcast_clouds',
 'weather_description_x_sc

In [58]:
original_cols

Index(['scheduled_elapsed_time', 'weekend', 'snowy_month', 'is_fed_holiday',
       'peak_hour', 'temp_x', 'app_temp_x', 'clouds_x', 'precip_rate_x',
       'rh_x', 'wind_spd_x', 'wind_gust_spd_x', 'pres_x', 'slp_x', 'vis_x',
       'snow_rate_x', 'temp_y', 'app_temp_y', 'clouds_y', 'precip_rate_y',
       'rh_y', 'wind_spd_y', 'wind_gust_spd_y', 'pres_y', 'slp_y', 'vis_y',
       'snow_rate_y', 'percentage_x', 'busiest_time', 'percentage_y',
       'weather_description_x_clear_sky', 'weather_description_x_few_clouds',
       'weather_description_x_flurries', 'weather_description_x_fog',
       'weather_description_x_freezing_rain', 'weather_description_x_haze',
       'weather_description_x_heavy_rain', 'weather_description_x_heavy_snow',
       'weather_description_x_light_rain', 'weather_description_x_light_snow',
       'weather_description_x_mix_snow_rain',
       'weather_description_x_moderate_rain',
       'weather_description_x_overcast_clouds',
       'weather_description_x_s

In [59]:
# Check if any of the original training features were missing
for col in original_cols:
    if col not in pred_df.columns:
        print(col)

In [60]:
# Drop if any columns are extra here in predictions that aren't in the training features
extra_cols = list(set(pred_df.columns) - set(original_cols))
extra_cols

pred_df.drop(columns=extra_cols, inplace=True)

['wind_dir_x',
 'carrier_code',
 'scheduled_arrival_time_period',
 'datetime_y',
 'wind_dir_y',
 'datetime_x']

In [61]:
# Reorder the columns
pred_df = pred_df[original_cols]
pred_df.head()

Unnamed: 0,scheduled_elapsed_time,weekend,snowy_month,is_fed_holiday,peak_hour,temp_x,app_temp_x,clouds_x,precip_rate_x,rh_x,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,165,0,0,0,0,16.3,16.3,84,0.5,55,...,0,0,0,0,0,0,0,0,0,0
1,165,1,0,0,0,10.8,10.8,46,0.0,38,...,0,0,0,0,0,0,0,0,0,0
2,165,0,0,0,0,13.4,13.4,13,0.0,34,...,0,0,0,0,0,0,0,0,0,0
3,170,0,0,0,1,15.9,15.9,84,0.76,58,...,0,0,0,0,0,0,0,0,0,0
4,170,1,0,0,1,10.3,10.3,63,0.0,41,...,0,0,0,0,0,0,0,0,0,0


In [62]:
# PCA
x = pca.transform(pred_df.values)
# StandardScaler
x = sc.transform(x)



## Predictions - XGBoost

Since XGBoost gave better testing accuracy among all models, we can check the predictions using it. 
We also have better precision, recall, f1-score among all models for this.

In [63]:
d = {0: "early", 1: "on_time", 2: "late"}

for i, v in enumerate(model.inplace_predict(x)):
    # if i%2==0:
    print(i+2, d[v])

2 early
3 early
4 early
5 early
6 early
7 early
8 early
9 early
10 late
11 on_time
12 on_time
13 early
14 early
15 early
16 early
17 early
18 early
19 early
20 early
21 early
22 early
23 early
24 early


## Predictions - Gradient Boost, retrain with full data

In [65]:
gb = GradientBoostingClassifier(random_state=50, min_samples_split = 12, 
                                min_samples_leaf = 6, max_depth = 4, 
                                n_estimators = 100, learning_rate=0.1)

gb = gb.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test])) 
gb.score(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

test_output = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_Y'])

# test_output.head()
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
gb.score(X_test, y_test) 
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

0.7932400316873515

Unnamed: 0,pred_Y,status
6510,early,early
10504,early,late
9117,late,late
3830,early,early
5674,early,early


Fraction of correct classification 


0.7861856577210735

0.7861856577210735

In [66]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.8705313612521208

0.6579405102589587

0.706948943257105

In [67]:
for i, v in enumerate(gb.predict(x)):
    if i%2==0:
        print(i+2, v)

2 early
4 early
6 early
8 late
10 early
12 early
14 early
16 early
18 early
20 late
22 early
24 late


## Predictions - Random Forest, retrain with full data

In [68]:
rf = RandomForestClassifier(random_state=50, min_samples_split = 12, min_samples_leaf = 6, max_features = "sqrt", n_estimators = 100)
rf = rf.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test])) 
rf.score(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

test_output = pd.DataFrame(rf.predict(X_test), index = X_test.index, columns = ['pred_Y'])

# test_output.head()
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
rf.score(X_test, y_test) 
sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

0.998415632426723

Unnamed: 0,pred_Y,status
6510,early,early
10504,late,late
9117,late,late
3830,early,early
5674,early,early


Fraction of correct classification 


0.7861856577210735

0.9991201055873296

In [69]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

0.9995035989079176

0.9985152190051968

0.9990075668866653

In [70]:
for i, v in enumerate(rf.predict(x)):
    # if i%2==0:
    print(i+2, v)

2 early
3 early
4 early
5 early
6 early
7 early
8 early
9 early
10 early
11 early
12 early
13 early
14 early
15 early
16 early
17 early
18 early
19 early
20 early
21 early
22 early
23 early
24 early


## Predictions - XGBoost, retrain with full data

In [71]:
y_xgb = pd.concat([y_train, y_test]).map({"early": 0, "on_time": 1, "late": 2})

d_class = xgb.DMatrix(pd.concat([X_train, X_test]), y_xgb, enable_categorical=True)

params = {
    "objective": "multi:softmax", 
    "tree_method": "exact", 
    "max_depth" : 10, 
    "learning_rate" : 0.4,
    "num_class": 3,
    "max_leaves": 4
    } # use "tree_method" : "hist" if you need speed

n = 100

model = xgb.train(
   params = params,
   dtrain = d_class,
   num_boost_round = n,
)

test_output = pd.DataFrame(model.predict(d_class), index = pd.concat([X_train, X_test]).index, columns = ['pred_Y'])

test_output = test_output.merge(y_xgb, left_index = True, right_index = True)
test_output.head()

sum(test_output["pred_Y"] == test_output["status"])/len(test_output)

Unnamed: 0,pred_Y,status
8106,2.0,2
6990,0.0,0
53,0.0,0
8696,1.0,1
11080,0.0,0


1.0

In [72]:
precision_score(test_output["status"], test_output["pred_Y"], average="macro")
recall_score(test_output["status"], test_output["pred_Y"], average="macro")
f1_score(test_output["status"], test_output["pred_Y"], average="macro")

1.0

1.0

1.0

In [73]:
d = {0: "early", 1: "on_time", 2: "late"}

for i, v in enumerate(model.inplace_predict(x)):
    # if i%2==0:
    print(i+2, d[v])

2 early
3 early
4 early
5 early
6 early
7 early
8 early
9 on_time
10 early
11 early
12 early
13 early
14 early
15 early
16 early
17 early
18 early
19 early
20 early
21 early
22 early
23 early
24 early


We'll consider XGBoost as it has highest possible metrics.