In [None]:
import pandas as pd
import xgboost as xgb
# import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

from modules.smoothed_target_encoder import SmoothedTargetEncoder
from modules.cyclical_encoder import CyclicalEncoder
from modules.flight_preprocessor import FlightPreprocessor

In [None]:
# Read the raw flight records in a single pass (low_memory=False).
df_flights = pd.read_csv("./data/flights.csv", low_memory=False)

# Normalise every column name to lower case.
df_flights.columns = df_flights.columns.map(str.lower)
# (disabled) df_flights = df_flights.fillna(0)

# Binary target: 1 when the arrival delay exceeds 15, otherwise 0.
# NOTE(review): rows with a missing 'arrival_delay' compare as False and are
# therefore labelled 0 — confirm that is intended for those flights.
df_flights['is_delayed'] = df_flights['arrival_delay'].gt(15).astype(int)

In [None]:
# Airport lookup table: lower-case the headers and keep only the IATA code
# and the state, which are the two columns the later joins use.
df_airports = (
    pd.read_csv("./data/airports.csv", low_memory=False)
    .rename(columns=str.lower)
    [['iata_code', 'state']]
)

In [None]:
# Run the project's FlightPreprocessor (imported from modules.flight_preprocessor)
# over the flights frame; what preprocess() does is not visible in this notebook.
# NOTE(review): df_flights is overwritten with the result, so re-running this
# cell feeds already-preprocessed data back in — confirm preprocess() tolerates that.
processor = FlightPreprocessor()
df_flights = processor.preprocess(df_flights)

In [None]:
def add_airport_state(flights, airports, airport_col, state_col):
    """Left-join the airport ``state`` onto ``flights`` under the name ``state_col``.

    Parameters
    ----------
    flights : pandas.DataFrame
        Flight rows; must contain ``airport_col``.
    airports : pandas.DataFrame
        Lookup table with ``iata_code`` and ``state`` columns.
    airport_col : str
        Column of ``flights`` holding the airport code to look up.
    state_col : str
        Name of the state column added to the result.

    Returns
    -------
    pandas.DataFrame
        ``flights`` with ``state_col`` appended (NaN where the code has no
        match in ``airports``). The number of rows is unchanged.

    Raises
    ------
    pandas.errors.MergeError
        If an airport code occurs more than once in ``airports``; without this
        check a duplicated code would silently multiply the matching flights.
    """
    # Rename the lookup columns *before* merging so the join can never collide
    # with a pre-existing 'state' or 'iata_code' column in `flights`.
    lookup = airports[['iata_code', 'state']].rename(
        columns={'iata_code': airport_col, 'state': state_col}
    )
    # Drop a stale copy of the output column so re-running the cell does not
    # produce suffixed or duplicated columns.
    flights = flights.drop(columns=[state_col], errors='ignore')
    return flights.merge(lookup, how='left', on=airport_col, validate='many_to_one')


# State of the departure airport, then state of the arrival airport.
df_flights = add_airport_state(df_flights, df_airports, 'origin_airport', 'state_origin')
df_flights = add_airport_state(df_flights, df_airports, 'destination_airport', 'state_dest')

In [None]:
# Variables known before the flight takes place, followed by the target.
known_variables = (
    ['airline', 'origin_airport', 'destination_airport']  # carrier and route
    + ['state_origin', 'state_dest']                      # states joined from the airport table
    + ['distance']
    + ['scheduled_departure', 'scheduled_arrival']        # schedule timestamps
    + ['day_of_week', 'month', 'year']                    # calendar fields
    + ['is_delayed']                                      # target
)
df_features = df_flights.loc[:, known_variables].copy()

In [None]:
# Preview the first rows of the selected feature frame.
df_features.head()

In [None]:
from sklearn.model_selection import train_test_split

# --- Missing values ----------------------------------------------------------
# NOTE(review): the median is taken over the full frame, i.e. before the
# train/test split below, so test rows contribute to the imputed value.
distance_median = df_features['distance'].median()
df_features['distance'] = df_features['distance'].fillna(distance_median)

# Gaps in the categorical columns become an explicit 'UNK' level.
categorical_cols = ['state_origin', 'state_dest', 'airline', 'origin_airport', 'destination_airport']
for col in categorical_cols:
    df_features[col] = df_features[col].fillna('UNK')

# Quantile buckets of distance (q=10; identical bin edges are merged).
# NOTE(review): no visible cell consumes 'distance_bin' — the 'distance'
# target encoder is built on the raw column. Confirm which one was intended.
df_features['distance_bin'] = pd.qcut(df_features['distance'], q=10, duplicates='drop')

# --- Stratified 80/20 hold-out split -----------------------------------------
y = df_features['is_delayed']
X_raw = df_features.drop(columns=['is_delayed'])

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# One SmoothedTargetEncoder per source column, all configured identically
# (target 'is_delayed', m=20, n_splits=5). Keys are the source column names.
encoders = {
    column: SmoothedTargetEncoder(column, 'is_delayed', m=20, n_splits=5)
    for column in (
        'distance',
        'origin_airport',
        'destination_airport',
        'airline',
        'state_origin',
        'state_dest',
    )
}

def apply_encoders(df_raw, target, fit=False, encoder_map=None, target_col='is_delayed'):
    """Return a copy of ``df_raw`` with one ``<key>_enc`` column per encoder.

    The target is attached temporarily under ``target_col`` because each encoder
    is called with a single frame; it is dropped again before returning, and
    ``df_raw`` itself is never modified.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Frame holding the columns the encoders read.
    target : pandas.Series or array-like
        Target values, one per row of ``df_raw`` in the same row order.
    fit : bool, default False
        If True call ``fit_transform`` on every encoder, otherwise ``transform``.
    encoder_map : dict, optional
        Mapping ``output prefix -> encoder``. Defaults to the notebook-level
        ``encoders`` dict, which keeps existing calls working unchanged.
    target_col : str, default 'is_delayed'
        Name of the temporary target column handed to the encoders.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_raw`` plus the ``<key>_enc`` columns, without ``target_col``.
    """
    active_encoders = encoders if encoder_map is None else encoder_map

    df = df_raw.copy()
    # Assign raw values (positional), so a differing index on `target` cannot
    # introduce NaNs through index alignment.
    df[target_col] = target.to_numpy() if hasattr(target, 'to_numpy') else target
    for col, enc in active_encoders.items():
        encode = enc.fit_transform if fit else enc.transform
        df[f'{col}_enc'] = encode(df)
    return df.drop(columns=[target_col])

# Fit the target encoders on the training split, then reuse them on the test split.
X_train_enc = apply_encoders(X_train_raw, y_train, fit=True)
X_test_enc = apply_encoders(X_test_raw, y_test, fit=False)

# Derive a fractional hour of day and a day of year from each schedule timestamp.
schedule_columns = (('dep', 'scheduled_departure'), ('arr', 'scheduled_arrival'))
for frame in (X_train_enc, X_test_enc):
    for prefix, ts_col in schedule_columns:
        frame[ts_col] = pd.to_datetime(frame[ts_col])
        stamps = frame[ts_col]
        frame[f'{prefix}_hour'] = stamps.dt.hour + stamps.dt.minute / 60
        frame[f'{prefix}_dayofyear'] = stamps.dt.dayofyear

# Cyclical encoding of the derived features: fit on train, apply to test.
cyclical_specs = [
    ('dep_dayofyear', 365),
    ('dep_hour', 24),
    ('arr_dayofyear', 365),
    ('arr_hour', 24),
]
for feature, period in cyclical_specs:
    cyclical_encoder = CyclicalEncoder(feature, period)
    X_train_enc = cyclical_encoder.fit_transform(X_train_enc)
    X_test_enc = cyclical_encoder.transform(X_test_enc)

# Model inputs: target-encoded columns, cyclical time features, raw calendar fields.
final_features = (
    # target-encoded categoricals / distance
    ['distance_enc', 'origin_airport_enc', 'destination_airport_enc', 'airline_enc']
    + ['state_origin_enc', 'state_dest_enc']
    # scheduled departure, cyclical
    + ['dep_dayofyear_sin', 'dep_dayofyear_cos', 'dep_hour_sin', 'dep_hour_cos']
    # scheduled arrival, cyclical
    + ['arr_dayofyear_sin', 'arr_dayofyear_cos', 'arr_hour_sin', 'arr_hour_cos']
    # calendar fields passed through as-is
    + ['day_of_week', 'month', 'year']
)

X_train = X_train_enc.loc[:, final_features].copy()
X_test = X_test_enc.loc[:, final_features].copy()


In [None]:
# Earlier encoding approach, kept only as an inert string literal: none of the
# statements below are executed. It fits every target encoder on the whole of
# df_features, and it is the only place that would create the '*_encoded'
# columns, so those columns do not exist while this stays disabled.
"""enc_distance = SmoothedTargetEncoder('distance', 'is_delayed')
enc_origin = SmoothedTargetEncoder('origin_airport', 'is_delayed')
enc_destination = SmoothedTargetEncoder('destination_airport', 'is_delayed')
enc_airline = SmoothedTargetEncoder('airline', 'is_delayed')
enc_state_origin = SmoothedTargetEncoder('state_origin', 'is_delayed')
enc_state_dest = SmoothedTargetEncoder('state_dest', 'is_delayed')

df_features['distance'] = enc_distance.fit_transform(df_features)
df_features['origin_airport_encoded'] = enc_origin.fit_transform(df_features)
df_features['destination_airport_encoded'] = enc_destination.fit_transform(df_features)
df_features['airline_encoded'] = enc_airline.fit_transform(df_features)
df_features['state_origin'] = enc_state_origin.fit_transform(df_features)
df_features['state_dest'] = enc_state_dest.fit_transform(df_features)"""

In [None]:
# Time features on the full (unsplit) feature frame: a fractional hour of day
# and a day of year for both the scheduled departure and the scheduled arrival.
for prefix, ts_col in (('dep', 'scheduled_departure'), ('arr', 'scheduled_arrival')):
    df_features[ts_col] = pd.to_datetime(df_features[ts_col])
    stamps = df_features[ts_col]
    df_features[f'{prefix}_hour'] = stamps.dt.hour + stamps.dt.minute / 60
    df_features[f'{prefix}_dayofyear'] = stamps.dt.dayofyear

# One cyclical encoder per derived feature (period 365 for days, 24 for hours).
dep_enc_day, dep_enc_hour, arr_enc_day, arr_enc_hour = (
    CyclicalEncoder(column, period=cycle)
    for column, cycle in (
        ('dep_dayofyear', 365),
        ('dep_hour', 24),
        ('arr_dayofyear', 365),
        ('arr_hour', 24),
    )
)

# Apply them in order; each call returns the frame that feeds the next one.
for legacy_encoder in (dep_enc_day, dep_enc_hour, arr_enc_day, arr_enc_hour):
    df_features = legacy_encoder.fit_transform(df_features)

In [None]:
# List the columns now present on df_features after the time-feature step.
df_features.columns

In [None]:
# TODO: check late_aircraft_delay later

# Internal validation split carved out of the training data; the held-out test
# set is only scored once, at the end of this cell.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Fit the scaler on the inner training fold only, then reuse it unchanged.
scaler = RobustScaler()
X_tr_s   = scaler.fit_transform(X_tr)
X_val_s  = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    # negatives / positives in the inner training fold
    scale_pos_weight=(len(y_tr)-y_tr.sum())/y_tr.sum(),
    eval_metric='auc',
    # Early stopping must be active: best_iteration / best_score (printed below)
    # are only defined by XGBoost when it is, and raise AttributeError otherwise.
    early_stopping_rounds=100,
    random_state=42
)

# Early stopping monitors the LAST eval_set entry, so validation must stay last.
xgb_model.fit(
    X_tr_s, y_tr,
    eval_set=[(X_tr_s, y_tr), (X_val_s, y_val)],
    verbose=50,
)

y_val_proba = xgb_model.predict_proba(X_val_s)[:,1]
y_test_proba = xgb_model.predict_proba(X_test_s)[:,1]

print("Best iteration:", xgb_model.best_iteration)
print("Best score:", xgb_model.best_score)
print("Val AUC:", roc_auc_score(y_val, y_val_proba))
print("Test AUC:", roc_auc_score(y_test, y_test_proba))

# Alternative training configuration kept as an inert string literal: nothing
# inside it is executed. Compared with the live code above it trains on the
# unscaled X_tr / X_val, uses n_estimators=5000, tree_method="hist", n_jobs=-1
# and enables early_stopping_rounds=100.
# NOTE(review): as the last expression of the cell, this string is presumably
# echoed as the cell's output when the notebook runs.
"""
# split de validação interno
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42
)

# balanceamento
pos = float(y_tr.sum())
neg = float(len(y_tr) - y_tr.sum())
scale_pos_weight = neg / pos

xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=5,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.0,
    scale_pos_weight=scale_pos_weight,
    eval_metric="auc",
    early_stopping_rounds=100,
    tree_method="hist",
    n_jobs=-1,
    random_state=42
)

xgb_model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    verbose=50
)

# (na interface sklearn, se treinou com early stopping, ele usa best_iteration automaticamente no predict/predict_proba)
y_val_proba  = xgb_model.predict_proba(X_val)[:, 1]
y_test_proba = xgb_model.predict_proba(X_test)[:, 1]

print("Best iteration:", xgb_model.best_iteration)
print("Best score:", xgb_model.best_score)
print("Val AUC:", roc_auc_score(y_val, y_val_proba))
print("Test AUC:", roc_auc_score(y_test, y_test_proba))"""

In [None]:
# Columns requested for the older baseline model (target included so it can be
# split off in a later cell).
requested_features = [
    'state_origin', 'state_dest', 'distance',
    'origin_airport_encoded', 'destination_airport_encoded', 'airline_encoded',
    'dep_dayofyear_sin', 'dep_dayofyear_cos',
    'dep_hour_sin', 'dep_hour_cos',
    'arr_dayofyear_sin', 'arr_dayofyear_cos',
    'arr_hour_sin', 'arr_hour_cos',
    'is_delayed'
]

# Keep only what can actually be used. The '*_encoded' columns are created
# solely by the disabled whole-frame encoding cell, so selecting them raises
# KeyError on a fresh run; the state columns hold raw strings (filled with
# 'UNK'), which the RobustScaler step further down cannot scale.
final_features = [
    name for name in requested_features
    if name in df_features.columns
    and pd.api.types.is_numeric_dtype(df_features[name])
]
skipped_features = [name for name in requested_features if name not in final_features]
if skipped_features:
    print(f"Skipping features that are missing or non-numeric: {skipped_features}")
if 'is_delayed' not in final_features:
    raise KeyError("'is_delayed' must be a numeric column of df_features")

df_processed = df_features[final_features].copy()

In [None]:
# Preview the first rows of the baseline modelling frame.
df_processed.head()

In [None]:
# Class balance of the target, as fractions of all rows (normalize=True).
df_processed['is_delayed'].value_counts(normalize=True)

In [None]:
# Separate the target from the predictors of the baseline frame.
y = df_processed['is_delayed']
X = df_processed.drop(columns='is_delayed')

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the baseline frame.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which the
# earlier encoder-based pipeline also defines — re-running earlier cells after
# this one will pick up these values instead.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = RobustScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [None]:
# Wrap the scaled arrays back into DataFrames so the feature names are kept.
feature_names = list(X_train.columns)
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(matrix, columns=feature_names)
    for matrix in (X_train_scaled, X_test_scaled)
)

In [None]:
# Baseline classifier settings, collected in one place before construction.
# ('scale_pos_weight': 2 was tried and is currently left out.)
baseline_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'n_estimators': 5000,
    'learning_rate': 0.01,
    'subsample': 0.6,
    'max_depth': 5,
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**baseline_params)

In [None]:
# Train the baseline model on the scaled training frame. No eval_set is passed,
# so the fit is not monitored against any validation data.
xgb_model.fit(X_train_scaled_df, y_train)

In [None]:
# Hard class labels and positive-class probabilities for the test set.
y_pred = xgb_model.predict(X_test_scaled_df)
y_pred_proba = xgb_model.predict_proba(X_test_scaled_df)[:, 1]

# Accuracy and the confusion matrix use the labels; ROC-AUC uses the probabilities.
cm = confusion_matrix(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)

# Report.
report_lines = [
    "\n### Model Evaluation Results (Test Set) ###",
    f"Accuracy: {accuracy:.4f}",
    f"ROC-AUC Score: {auc_score:.4f}",
    "\nConfusion Matrix:",
]
print("\n".join(report_lines))
print(cm)