In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    average_precision_score
)

# =========================
# Îç∞Ïù¥ÌÑ∞ Î°úÎìú
# =========================
df = pd.read_csv("new_flight_weather_merged.csv")
print("‚úÖ CSV Î°úÎìú:", len(df))

# =========================
# ÏãúÍ∞Ñ ÌååÏÉù Î≥ÄÏàò
# =========================
df["departure_datetime"] = pd.to_datetime(df["departure_datetime"])

df["dep_hour"] = df["departure_datetime"].dt.hour
df["dep_weekday"] = df["departure_datetime"].dt.weekday
df["is_weekend"] = df["dep_weekday"].isin([5, 6]).astype(int)


  df = pd.read_csv("new_flight_weather_merged.csv")


‚úÖ CSV Î°úÎìú: 2843934


In [2]:
# üî¢ ÏàòÏπòÌòï
num_cols = [
    "Í∏∞Ïò®(¬∞C)",
    "ÌíçÏÜç_ms",
    "dep_hour",
    "dep_weekday",
    "is_weekend"
]
num_cols = [c for c in num_cols if c in df.columns]

# üè∑ Î≤îÏ£ºÌòï
cat_cols = [
    "Í≥µÌï≠Î™Ö",
    "Ï∂úÎ∞úÏßÄ",
    "ÎèÑÏ∞©ÏßÄ",
    "flight_type"
]
cat_cols = [c for c in cat_cols if c in df.columns]

X_cols = num_cols + cat_cols


In [3]:
df = df.sort_values("departure_datetime")
split_date = df["departure_datetime"].quantile(0.8)

train_df = df[df["departure_datetime"] <= split_date]
test_df  = df[df["departure_datetime"] > split_date]

X_train = train_df[X_cols]
y_train = train_df["is_delay"]

X_test  = test_df[X_cols]
y_test  = test_df["is_delay"]

print("Train:", len(train_df), "Test:", len(test_df))


Train: 2275147 Test: 568787


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="UNKNOWN")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=True))
])

preprocessor = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols)
])

logistic = Pipeline([
    ("prep", preprocessor),
    ("clf", LogisticRegression(
        max_iter=3000,
        class_weight="balanced",
        n_jobs=-1
    ))
])

logistic.fit(X_train, y_train)

log_prob = logistic.predict_proba(X_test)[:, 1]
log_pred = (log_prob >= 0.4).astype(int)

print("\nüìä Logistic (threshold=0.4)")
print(classification_report(y_test, log_pred))
print("ROC-AUC:", roc_auc_score(y_test, log_prob))
print("PR-AUC :", average_precision_score(y_test, log_prob))





üìä Logistic (threshold=0.4)
              precision    recall  f1-score   support

           0       0.86      0.38      0.53    422243
           1       0.31      0.82      0.45    146544

    accuracy                           0.49    568787
   macro avg       0.59      0.60      0.49    568787
weighted avg       0.72      0.49      0.51    568787

ROC-AUC: 0.6474615621075573
PR-AUC : 0.3721759150947776


In [5]:
from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,

    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
    objective="binary:logistic",
    eval_metric="aucpr",
    tree_method="hist",
    random_state=42,
    n_jobs=-1
)

xgb_model = Pipeline([
    ("prep", preprocessor),
    ("xgb", xgb)
])

xgb_model.fit(X_train, y_train)

xgb_prob = xgb_model.predict_proba(X_test)[:, 1]
xgb_pred = (xgb_prob >= 0.4).astype(int)

print("\nüìä XGBoost (threshold=0.4)")
print(classification_report(y_test, xgb_pred))
print("ROC-AUC:", roc_auc_score(y_test, xgb_prob))
print("PR-AUC :", average_precision_score(y_test, xgb_prob))





üìä XGBoost (threshold=0.4)
              precision    recall  f1-score   support

           0       0.88      0.43      0.57    422243
           1       0.34      0.83      0.48    146544

    accuracy                           0.53    568787
   macro avg       0.61      0.63      0.53    568787
weighted avg       0.74      0.53      0.55    568787

ROC-AUC: 0.6854605778465136
PR-AUC : 0.4262482943255317


In [6]:
from lightgbm import LGBMClassifier

# LightGBMÏö© category Î≥ÄÌôò
for c in cat_cols:
    df[c] = df[c].astype("category")

train_df = df[df["departure_datetime"] <= split_date]
test_df  = df[df["departure_datetime"] > split_date]

X_train_lgb = train_df[X_cols]
y_train_lgb = train_df["is_delay"]

X_test_lgb  = test_df[X_cols]
y_test_lgb  = test_df["is_delay"]

lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,

    class_weight="balanced",
    objective="binary",
    metric="aucpr",
    random_state=42,
    n_jobs=-1
)

lgbm.fit(
    X_train_lgb,
    y_train_lgb,
    categorical_feature=cat_cols
)

lgb_prob = lgbm.predict_proba(X_test_lgb)[:, 1]
lgb_pred = (lgb_prob >= 0.4).astype(int)

print("\nüìä LightGBM (threshold=0.4)")
print(classification_report(y_test_lgb, lgb_pred))
print("ROC-AUC:", roc_auc_score(y_test_lgb, lgb_prob))
print("PR-AUC :", average_precision_score(y_test_lgb, lgb_prob))


[LightGBM] [Info] Number of positive: 336343, number of negative: 1938804
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 661
[LightGBM] [Info] Number of data points in the train set: 2275147, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

üìä LightGBM (threshold=0.4)
              precision    recall  f1-score   support

           0       0.88      0.48      0.62    422243
           1       0.35      0.80      0.49    146544

    accuracy                           0.56    568787
   macro avg       0.61      0.64      0.55    568787
weighted avg       0.74      0.56      0.58    568787

ROC-AUC: 0.7059227082554224
PR-AUC : 0.4578427366604109


In [7]:
summary = pd.DataFrame([
    ["Logistic", roc_auc_score(y_test, log_prob), average_precision_score(y_test, log_prob)],
    ["XGBoost",  roc_auc_score(y_test, xgb_prob), average_precision_score(y_test, xgb_prob)],
    ["LightGBM", roc_auc_score(y_test_lgb, lgb_prob), average_precision_score(y_test_lgb, lgb_prob)]
], columns=["Model", "ROC-AUC", "PR-AUC"])

summary


Unnamed: 0,Model,ROC-AUC,PR-AUC
0,Logistic,0.647462,0.372176
1,XGBoost,0.685461,0.426248
2,LightGBM,0.705923,0.457843
