In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import xgboost as xgb
# import lightgbm as lgb

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from modules.smoothed_target_encoder import SmoothedTargetEncoder
from modules.cyclical_encoder import CyclicalEncoder
from modules.flight_preprocessor import FlightPreprocessor

In [2]:
# Load the raw flights table, lower-case its column names and replace every
# missing value with 0.
df_flights = (
    pd.read_csv("./data/flights.csv", low_memory=False)
    .rename(columns=str.lower)
    .fillna(0)
)

# Binary target: 1 when the arrival delay is strictly greater than 15.
# NOTE(review): fillna(0) runs before the label is derived, so any row whose
# arrival_delay was missing ends up with is_delayed = 0 — confirm that this is
# the intended treatment for such rows.
df_flights['is_delayed'] = df_flights['arrival_delay'].gt(15).astype(int)

In [3]:
# Airport lookup table: lower-case the headers and keep only the IATA code
# (used as the join key) together with the state.
df_airports = (
    pd.read_csv("./data/airports.csv", low_memory=False)
    .rename(columns=str.lower)
    .loc[:, ['iata_code', 'state']]
)

In [4]:
# Run the project's FlightPreprocessor over the flights frame.
# NOTE(review): df_flights is rebound to the result, so re-running this cell
# feeds already-preprocessed data back into preprocess() — confirm that the
# preprocessor tolerates that, or restart the kernel before re-running.
processor = FlightPreprocessor()
df_flights = processor.preprocess(df_flights)

In [5]:
# Attach the origin and destination state to every flight.
#
# The airports table is turned into an IATA-code -> state lookup and applied
# with Series.map instead of two merge/rename/drop sequences. This keeps the
# row count fixed by construction and makes the cell safe to re-run: the
# merge-based version appended a second `state_origin` / `state_dest` column
# on every additional execution.
#
# A non-unique key would make the lookup ambiguous (and, with a merge, would
# silently duplicate flight rows), so fail loudly instead.
assert df_airports['iata_code'].is_unique, "duplicate iata_code in df_airports"

state_by_iata = df_airports.set_index('iata_code')['state']

df_flights = df_flights.assign(
    # Airports without a match in the lookup yield NaN, as a left join would.
    state_origin=df_flights['origin_airport'].map(state_by_iata),
    state_dest=df_flights['destination_airport'].map(state_by_iata),
).reset_index(drop=True)  # the merges returned a fresh RangeIndex; keep that

In [6]:
# Columns carried into feature engineering; the target rides along so the
# target encoders can read it from the same frame.
known_variables = [
    # carrier and route
    'airline', 'origin_airport', 'destination_airport',
    'state_origin', 'state_dest',
    # schedule
    'scheduled_departure', 'scheduled_arrival',
    # NOTE(review): late_aircraft_delay looks like a delay attribution rather
    # than something available before departure — confirm before it is ever
    # used as a model input.
    'late_aircraft_delay', 'distance',
    # target
    'is_delayed',
]
df_features = df_flights.loc[:, known_variables].copy()

In [7]:
# Preview the first rows of the selected columns.
df_features.head()

Unnamed: 0,airline,origin_airport,destination_airport,state_origin,state_dest,scheduled_departure,scheduled_arrival,late_aircraft_delay,distance,is_delayed
0,AS,ANC,SEA,AK,WA,2015-01-01 00:05:00,2015-01-01 04:30:00,0.0,1448,0
1,AA,LAX,PBI,CA,FL,2015-01-01 00:10:00,2015-01-01 07:50:00,0.0,2330,0
2,US,SFO,CLT,CA,NC,2015-01-01 00:20:00,2015-01-01 08:06:00,0.0,2296,0
3,AA,LAX,MIA,CA,FL,2015-01-01 00:20:00,2015-01-01 08:05:00,0.0,2342,0
4,AS,SEA,ANC,WA,AK,2015-01-01 00:25:00,2015-01-01 03:20:00,0.0,1448,0


In [9]:
# Quantile buckets of the raw distance (up to 10; duplicate edges are dropped).
# NOTE(review): distance_bin is created here, but the distance encoder below
# is keyed on the raw 'distance' column — confirm which one was intended.
df_features['distance_bin'] = pd.qcut(df_features['distance'], q=10, duplicates='drop')

# One target encoder per column, all keyed on the is_delayed label.
enc_distance = SmoothedTargetEncoder('distance', 'is_delayed')
enc_origin = SmoothedTargetEncoder('origin_airport', 'is_delayed')
enc_destination = SmoothedTargetEncoder('destination_airport', 'is_delayed')
enc_airline = SmoothedTargetEncoder('airline', 'is_delayed')
enc_state_origin = SmoothedTargetEncoder('state_origin', 'is_delayed')
enc_state_dest = SmoothedTargetEncoder('state_dest', 'is_delayed')

# (encoder, output column) pairs, applied strictly in this order. Three of
# them overwrite their own source column ('distance', 'state_origin',
# 'state_dest'), so this cell is not re-runnable without rebuilding
# df_features first.
# NOTE(review): every encoder is fitted on the full frame, labels included,
# before the train/test split further down — test-set labels may leak into
# these features; confirm how SmoothedTargetEncoder.fit_transform guards
# against that.
encoding_plan = [
    (enc_distance, 'distance'),
    (enc_origin, 'origin_airport_encoded'),
    (enc_destination, 'destination_airport_encoded'),
    (enc_airline, 'airline_encoded'),
    (enc_state_origin, 'state_origin'),
    (enc_state_dest, 'state_dest'),
]
for encoder, out_col in encoding_plan:
    df_features[out_col] = encoder.fit_transform(df_features)

In [12]:
# For both schedule timestamps: parse to datetime, then derive the fractional
# hour of day (hour + minute / 60) and the day of year.
for prefix, ts_col in (('dep', 'scheduled_departure'), ('arr', 'scheduled_arrival')):
    df_features[ts_col] = pd.to_datetime(df_features[ts_col])
    stamps = df_features[ts_col].dt
    df_features[f'{prefix}_hour'] = stamps.hour + stamps.minute / 60
    df_features[f'{prefix}_dayofyear'] = stamps.dayofyear

# Cyclical encoders: a 365-step cycle for day of year, a 24-step cycle for
# hour of day.
dep_enc_day = CyclicalEncoder('dep_dayofyear', period=365)
dep_enc_hour = CyclicalEncoder('dep_hour', period=24)
arr_enc_day = CyclicalEncoder('arr_dayofyear', period=365)
arr_enc_hour = CyclicalEncoder('arr_hour', period=24)

# Each fit_transform returns a frame, which replaces df_features; the order
# below is kept because it determines the order of the resulting columns.
for cyc_encoder in (dep_enc_day, dep_enc_hour, arr_enc_day, arr_enc_hour):
    df_features = cyc_encoder.fit_transform(df_features)

In [19]:
# List the columns present after the target and cyclical encoding steps.
df_features.columns

Index(['airline', 'origin_airport', 'destination_airport', 'state_origin',
       'state_dest', 'scheduled_departure', 'scheduled_arrival',
       'late_aircraft_delay', 'distance', 'is_delayed', 'distance_bin',
       'origin_airport_encoded', 'destination_airport_encoded',
       'airline_encoded', 'dep_dayofyear_sin', 'dep_dayofyear_cos',
       'dep_hour_sin', 'dep_hour_cos', 'arr_dayofyear_sin',
       'arr_dayofyear_cos', 'arr_hour_sin', 'arr_hour_cos'],
      dtype='object')

In [None]:
# TODO: check late_aircraft_delay later

In [21]:
# Target-encoded columns (these hold encoded values at this point).
encoded_columns = [
    'state_origin', 'state_dest', 'distance',
    'origin_airport_encoded', 'destination_airport_encoded', 'airline_encoded',
]

# sin/cos pairs: departure before arrival, day of year before hour.
cyclical_columns = [
    f'{prefix}_{unit}_{trig}'
    for prefix in ('dep', 'arr')
    for unit in ('dayofyear', 'hour')
    for trig in ('sin', 'cos')
]

# Model inputs followed by the target as the last column.
final_features = encoded_columns + cyclical_columns + ['is_delayed']

df_processed = df_features.loc[:, final_features].copy()

In [22]:
# Preview the first rows of the modelling frame.
df_processed.head()

Unnamed: 0,state_origin,state_dest,distance,origin_airport_encoded,destination_airport_encoded,airline_encoded,dep_dayofyear_sin,dep_dayofyear_cos,dep_hour_sin,dep_hour_cos,arr_dayofyear_sin,arr_dayofyear_cos,arr_hour_sin,arr_hour_cos,is_delayed
0,0.137861,0.156151,0.136895,0.117408,0.155344,0.122971,0.017213,0.999852,0.021815,0.999762,0.017213,0.999852,0.92388,0.382683,0
1,0.182288,0.193354,0.217552,0.19861,0.216695,0.172742,0.017213,0.999852,0.043619,0.999048,0.017213,0.999852,0.887011,-0.461749,0
2,0.181923,0.161174,0.171443,0.192121,0.148261,0.175863,0.017213,0.999852,0.087156,0.996195,0.017213,0.999852,0.85264,-0.522499,0
3,0.181923,0.193603,0.208143,0.197842,0.196137,0.172308,0.017213,0.999852,0.087156,0.996195,0.017213,0.999852,0.854912,-0.518773,0
4,0.1559,0.156529,0.134921,0.159041,0.166936,0.123137,0.017213,0.999852,0.108867,0.994056,0.017213,0.999852,0.766044,0.642788,0


In [25]:
# Class balance of the target, as proportions rather than counts.
df_processed['is_delayed'].value_counts(normalize=True)

is_delayed
0    0.824113
1    0.175887
Name: proportion, dtype: float64

In [26]:
# Separate the target vector from the feature matrix.
y = df_processed['is_delayed']
X = df_processed.drop('is_delayed', axis=1)

In [27]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so it is reproducible.
# train_test_split is imported in the notebook's top import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [28]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics enter the scaling.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [29]:
# Wrap the scaled values back into DataFrames so the model sees column names.
# The split's original index is passed through: without it the frames get a
# fresh 0..n-1 index while y_train / y_test keep the original row labels, and
# any index-aligned operation between them (concat, join, boolean masks)
# would silently pair the wrong rows.
feature_names = X_train.columns.tolist()
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=feature_names, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=feature_names, index=X_test.index
)

In [33]:
# Hyper-parameters for the gradient-boosted classifier, gathered in one place.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    # scale_pos_weight=2 is left disabled: no positive-class re-weighting.
    n_estimators=1000,
    learning_rate=0.01,
    subsample=0.6,
    max_depth=5,
    random_state=42,
)

xgb_model = xgb.XGBClassifier(**xgb_params)

In [34]:
# Train the classifier on the scaled training features and their labels.
xgb_model.fit(X_train_scaled_df, y_train)

In [32]:
# NOTE(review): in the saved notebook this cell's execution count (32) is
# lower than those of the model definition and fit cells (33, 34), so the
# stored output may come from an earlier model — re-run top to bottom on a
# fresh kernel before trusting the numbers.

# Positive-class probabilities (column 1 of predict_proba), used for ROC-AUC.
y_pred_proba = xgb_model.predict_proba(X_test_scaled_df)[:, 1]

# Hard labels derived from the probabilities above instead of a second
# predict() call. For a binary:logistic model predict() applies this same
# 0.5 cut-off, so the labels are unchanged while the test set is scored once.
y_pred = (y_pred_proba > 0.5).astype(int)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)
cm = confusion_matrix(y_test, y_pred)

# Accuracy of always predicting the majority class. The target is imbalanced,
# so raw accuracy is only meaningful next to this reference value.
positive_rate = y_test.mean()
baseline_accuracy = max(positive_rate, 1 - positive_rate)

print("\n### Model Evaluation Results (Test Set) ###")
print(f"Accuracy: {accuracy:.4f}")
print(f"Majority-class baseline accuracy: {baseline_accuracy:.4f}")
print(f"ROC-AUC Score: {auc_score:.4f}")
print("\nConfusion Matrix:")
print(cm)


### Model Evaluation Results (Test Set) ###
Accuracy: 0.8167
ROC-AUC Score: 0.6937

Confusion Matrix:
[[918099  41017]
 [172277  32423]]
