In [1]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

# =====================================================
# 1Ô∏è‚É£ Îç∞Ïù¥ÌÑ∞ Î°úÎìú + ÏãúÍ∞Ñ ÌååÏÉù
# =====================================================
df = pd.read_csv("new_flight_weather_merged.csv")

df["departure_datetime"] = pd.to_datetime(df["departure_datetime"])
df["dep_hour"] = df["departure_datetime"].dt.hour
df["dep_weekday"] = df["departure_datetime"].dt.weekday
df["is_weekend"] = df["dep_weekday"].isin([5, 6]).astype(int)

print("Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞ Ïàò:", len(df))

# =====================================================
# 2Ô∏è‚É£ Ïª¨Îüº Ï†ïÏùò
# =====================================================
num_cols = ["Í∏∞Ïò®(¬∞C)", "ÌíçÏÜç_ms", "dep_hour", "dep_weekday", "is_weekend"]
num_cols = [c for c in num_cols if c in df.columns]

cat_cols = ["Ìï≠Í≥µÏÇ¨", "Ï∂úÎ∞úÏßÄ", "ÎèÑÏ∞©ÏßÄ", "flight_type"]
cat_cols = [c for c in cat_cols if c in df.columns]

# LightGBMÏö© category Î≥ÄÌôò
for c in cat_cols:
    df[c] = df[c].astype("category")

X_cols = num_cols + cat_cols

# =====================================================
# 3Ô∏è‚É£ Train / Test Î∂ÑÎ¶¨ (ÏãúÍ∞Ñ Í∏∞Ï§Ä)
# =====================================================
df = df.sort_values("departure_datetime")
split_date = df["departure_datetime"].quantile(0.8)

train_df = df[df["departure_datetime"] <= split_date]
test_df  = df[df["departure_datetime"] > split_date]

print("Train:", len(train_df), "Test:", len(test_df))

# =====================================================
# 4Ô∏è‚É£ üî• Îã§Ïö¥ÏÇ¨Ïù¥Ïßï (Train Îç∞Ïù¥ÌÑ∞Îßå)
# =====================================================
train_0 = train_df[train_df["is_delay"] == 0]
train_1 = train_df[train_df["is_delay"] == 1]

print("Before Downsampling")
print("Ï†ïÏÉÅ(0):", len(train_0), "ÏßÄÏó∞(1):", len(train_1))

# üëâ Ï†ïÏÉÅ(0)ÏùÑ ÏßÄÏó∞(1) Í∞úÏàòÎßåÌÅº ÎûúÎç§ ÏÉòÌîåÎßÅ
train_0_down = train_0.sample(
    n=len(train_1),
    random_state=42
)

train_down = (
    pd.concat([train_0_down, train_1])
    .sample(frac=1, random_state=42)
)

print("After Downsampling")
print(train_down["is_delay"].value_counts())

# =====================================================
# 5Ô∏è‚É£ X / y Î∂ÑÎ¶¨
# =====================================================
X_train = train_down[X_cols]
y_train = train_down["is_delay"]

X_test  = test_df[X_cols]
y_test  = test_df["is_delay"]

# =====================================================
# 6Ô∏è‚É£ LightGBM Î™®Îç∏ (‚ùå class_weight Ï†úÍ±∞)
# =====================================================
lgbm = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,

    objective="binary",
    metric="aucpr",
    random_state=42,
    n_jobs=-1
)

# categorical_feature ÏßÄÏ†ï
lgbm.fit(
    X_train,
    y_train,
    categorical_feature=cat_cols
)

print("‚úÖ LightGBM (Downsampling) ÌïôÏäµ ÏôÑÎ£å")

# =====================================================
# 7Ô∏è‚É£ ÌèâÍ∞Ä
# =====================================================
y_prob = lgbm.predict_proba(X_test)[:, 1]

for t in [0.3, 0.35, 0.4, 0.45, 0.5]:
    print(f"\n===== Threshold = {t} =====")
    y_pred = (y_prob >= t).astype(int)
    print(classification_report(y_test, y_pred))

print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("PR-AUC :", average_precision_score(y_test, y_prob))


  df = pd.read_csv("new_flight_weather_merged.csv")


Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞ Ïàò: 2843934
Train: 2275147 Test: 568787
Before Downsampling
Ï†ïÏÉÅ(0): 1938804 ÏßÄÏó∞(1): 336343
After Downsampling
0    336343
1    336343
Name: is_delay, dtype: int64
[LightGBM] [Info] Number of positive: 336343, number of negative: 336343
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 664
[LightGBM] [Info] Number of data points in the train set: 672686, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
‚úÖ LightGBM (Downsampling) ÌïôÏäµ ÏôÑÎ£å

===== Threshold = 0.3 =====
              precision    recall  f1-score   support

           0       0.90      0.33      0.48    422243
           1       0.32      0.90      0.47    146544

    accuracy                           0.47    568787
   macro avg       0