In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    precision_score,
    recall_score,
    f1_score
)


In [2]:
df = pd.read_csv("ready_for_modeling.csv")
print(df.shape)
df.head()

(1500000, 20)


Unnamed: 0,DOT_CODE,FL_NUMBER,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,DELAYED,MONTH,DAY_OF_WEEK,IS_WEEKEND,DEP_HOUR,AIR_TIME_MISSING,CRS_ELAPSED_TIME_MISSING,ORIGIN_FREQ,DEST_FREQ,AIRLINE_FREQ,AIRLINE_DOT_FREQ,AIRLINE_CODE_FREQ,ORIGIN_CITY_FREQ,DEST_CITY_FREQ
0,19977,1562,1155,1501,186.0,1065.0,0,1,2,0,11.0,0,0,0.013444,0.017665,0.084963,0.084963,0.084963,0.013444,0.017665
1,19790,1149,2120,2315,235.0,1399.0,0,11,5,1,21.0,0,0,0.020046,0.023635,0.131689,0.131689,0.131689,0.020046,0.023635
2,19977,459,954,1252,118.0,680.0,0,7,4,0,10.0,0,0,0.040012,0.019821,0.084963,0.084963,0.084963,0.040012,0.019821
3,19790,2295,1609,1829,260.0,1589.0,1,3,0,0,16.0,0,0,0.020046,0.019668,0.131689,0.131689,0.131689,0.020046,0.019668
4,20416,407,1840,2041,181.0,985.0,0,2,6,1,18.0,0,0,0.021439,0.043451,0.031889,0.031889,0.031889,0.021439,0.043451


In [9]:

from lightgbm import LGBMClassifier, early_stopping, log_evaluation


TARGET = "DELAYED"

X = df.drop(columns=[TARGET])
y = df[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Model
lgb_model = LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    learning_rate=0.05,
    n_estimators=200,
    num_leaves=31,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# Train (version-safe)
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
    callbacks=[
        early_stopping(stopping_rounds=30),
        log_evaluation(50)
    ]
)

# Predict
y_prob_lgb = lgb_model.predict_proba(X_val)[:, 1]

# ROC-AUC
roc_auc_lgb = roc_auc_score(y_val, y_prob_lgb)
print(f"LightGBM ROC-AUC: {roc_auc_lgb:.4f}")

# Threshold evaluation
threshold = 0.7
y_pred_lgb = (y_prob_lgb >= threshold).astype(int)

print(classification_report(y_val, y_pred_lgb))

print("Precision:", precision_score(y_val, y_pred_lgb))
print("Recall   :", recall_score(y_val, y_pred_lgb))
print("F1-score :", f1_score(y_val, y_pred_lgb))

# Feature importance
importance_df = pd.DataFrame({
    "feature": X.columns,
    "importance": lgb_model.feature_importances_
}).sort_values(by="importance", ascending=False)

importance_df.head(15)

[LightGBM] [Info] Number of positive: 214164, number of negative: 985836
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054659 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2079
[LightGBM] [Info] Number of data points in the train set: 1200000, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.178470 -> initscore=-1.526748
[LightGBM] [Info] Start training from score -1.526748
Training until validation scores don't improve for 30 rounds
[50]	valid_0's auc: 0.860161	valid_0's binary_logloss: 0.309044
[100]	valid_0's auc: 0.872014	valid_0's binary_logloss: 0.284269
[150]	valid_0's auc: 0.875669	valid_0's binary_logloss: 0.274629
[200]	valid_0's auc: 0.87817	valid_0's binary_logloss: 0.269607
Did not meet early stopping. Best iteration is:
[200]	valid_0's auc: 0.87817	valid_0's binary_logloss: 0.269607
LightGBM ROC-AUC: 0.8782
              precision    recall  f1-score   

Unnamed: 0,feature,importance
2,CRS_DEP_TIME,1987
9,DEP_HOUR,1724
3,CRS_ARR_TIME,418
6,MONTH,304
0,DOT_CODE,230
1,FL_NUMBER,160
13,DEST_FREQ,155
5,DISTANCE,154
10,AIR_TIME_MISSING,143
12,ORIGIN_FREQ,143
