In [1]:
import pandas as pd

# Path of the flight-status CSV, resolved relative to the notebook's working directory.
csv_path = "flight_status_final_filtered.csv"

# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# looks like a pandas warning (presumably DtypeWarning about mixed types) —
# confirm against a fresh run.
df = pd.read_csv(csv_path, low_memory=False)

# Quick sanity check: confirm the load and report the row count.
print("✅ CSV 로드 완료")
print("행 수:", len(df))
df.head()


  df = pd.read_csv(csv_path)


✅ CSV 로드 완료
행 수: 1500000


Unnamed: 0,AIRLINE_ENGLISH,AIRLINE_KOREAN,AIRPORT,AIR_FLN,ARRIVED_ENG,ARRIVED_KOR,BAGGAGE_CLAIM,BOARDING_ENG,BOARDING_KOR,CITY,...,FLIGHT_DATE,GATE,IO,LINE,LINE_CODE,RMK_ENG,RMK_KOR,STD,UFID,flight_date
0,JEJU AIR,제주항공,CJJ,7C802,CHEONGJU,청주,1.0,JEJU,제주,CJU,...,20230430,,I,국내,D,ARRIVED,도착,1040,20230430CJJI7C802,2023-04-30
1,JEJU AIR,제주항공,CJJ,7C850,CHEONGJU,청주,1.0,JEJU,제주,CJU,...,20230430,,I,국내,D,DELAY,지연,2320,20230430CJJI7C850,2023-04-30
2,JEJU AIR,제주항공,CJJ,7C852,CHEONGJU,청주,1.0,JEJU,제주,CJU,...,20230430,,I,국내,D,ARRIVED,도착,920,20230430CJJI7C852,2023-04-30
3,JEJU AIR,제주항공,CJJ,7C854,CHEONGJU,청주,1.0,JEJU,제주,CJU,...,20230430,,I,국내,D,ARRIVED,도착,1840,20230430CJJI7C854,2023-04-30
4,JEJU AIR,제주항공,CJJ,7C856,CHEONGJU,청주,1.0,JEJU,제주,CJU,...,20230430,,I,국내,D,ARRIVED,도착,1810,20230430CJJI7C856,2023-04-30


In [2]:
# Create the target variable (is_delay)
def make_delay_label(x):
    """Return 1 when a Korean remark text marks a disrupted flight, else 0.

    A remark counts as disrupted when it contains any of the substrings
    "지연", "결항" or "회항".

    Parameters
    ----------
    x : object
        A single remark value. Missing values (NaN/None) and any other
        non-string value are treated as "no disruption" instead of raising
        a TypeError on the substring check.

    Returns
    -------
    int
        1 if one of the keywords occurs in ``x``, otherwise 0.
    """
    # Only real strings can be searched; NaN, None and stray numbers all map to 0.
    if not isinstance(x, str):
        return 0
    return int(any(keyword in x for keyword in ("지연", "결항", "회항")))

# Label every row by running make_delay_label over its Korean remark text.
df["is_delay"] = df["RMK_KOR"].map(make_delay_label)


In [3]:
# Class balance of the target: number of rows per is_delay value.
df.loc[:, "is_delay"].value_counts()

0    1487392
1      12608
Name: is_delay, dtype: int64

In [4]:
# Date-derived features: parse the YYYYMMDD value, then take weekday and month.
df["flight_date"] = pd.to_datetime(df["FLIGHT_DATE"], format="%Y%m%d")
df["weekday"] = df["flight_date"].dt.weekday
df["month"] = df["flight_date"].dt.month

# Time-derived feature: the hour part of STD.
# The hour is computed numerically (value // 100) instead of slicing the
# zero-padded string. For integer values such as 920 or 1040 the result is the
# same (9, 10), but slicing breaks if the column is parsed as float:
# "920.0" would yield hour 92 and "nan" would raise on the int cast.
# Unparseable or missing values become NaN here.
# NOTE(review): assumes STD is an HHMM number, as the original zfill(4)[:2]
# logic implies — confirm no rows use another format such as "09:20".
df["hour"] = pd.to_numeric(df["STD"], errors="coerce") // 100

# STD itself is still stored as a string, as before.
df["STD"] = df["STD"].astype(str)


In [5]:
# Train / test split by date

# Order rows chronologically before splitting (the most important step).
df = df.sort_values(by="flight_date")

# Cut-off: the 0.8 quantile of flight_date. Rows up to and including it are
# used for training, later rows for testing.
split_date = df["flight_date"].quantile(0.8)

# Training data (the past)
train_df = df.loc[df["flight_date"].le(split_date)]

# Test data (the most recent period)
test_df = df.loc[df["flight_date"].gt(split_date)]

In [6]:
# Separate the independent variables (X) from the dependent variable (y)

X_columns = [
    # derived calendar / time features
    "weekday",
    "month",
    "hour",
    # airline and route
    "AIRLINE_KOREAN",
    "BOARDING_KOR",
    "ARRIVED_KOR",
    # line type and IO flag
    "LINE",
    "IO",
]

# Same feature columns and the same target column for both partitions.
X_train, y_train = train_df[X_columns], train_df["is_delay"]
X_test, y_test = test_df[X_columns], test_df["is_delay"]


In [7]:
# Give the minority class (1) a larger weight to counter class imbalance.
# NOTE(review): the RandomForest cell below passes a literal {0: 1, 1: 30}
# instead of this dict, so this value is not used by that model — confirm
# which weight is intended.
class_weight = {
    0: 1,
    1: 70,
}


In [8]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Feature columns routed through the categorical branch.
cat_cols = ["AIRLINE_KOREAN", "BOARDING_KOR", "ARRIVED_KOR", "LINE", "IO"]

# Feature columns routed through the numeric branch.
num_cols = ["weekday", "month", "hour"]

# Categorical pipeline: fill gaps with the literal "missing", then map each
# category to an integer code. Categories not seen during fit are encoded as -1.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Numeric pipeline: fill gaps with the column median.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Combined preprocessing: categorical branch first, numeric branch second.
preprocessor = ColumnTransformer(transformers=[
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols),
])

# RandomForest model on top of the preprocessing step.
# NOTE(review): class_weight is a literal {0: 1, 1: 30} here, while the
# notebook earlier defines class_weight = {0: 1, 1: 70} — confirm which one is
# intended.
rf_model_label = Pipeline(steps=[
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(
        n_estimators=300,
        min_samples_leaf=20,
        class_weight={0: 1, 1: 30},
        random_state=42,  # fixed seed so repeated fits are reproducible
        n_jobs=-1,
    )),
])

# Fit preprocessing and classifier on the training partition.
rf_model_label.fit(X_train, y_train)


In [9]:
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score
import numpy as np

# Score for the second class column of predict_proba (index 1), which is
# label 1 when the targets are 0/1; used for the AUC metrics and threshold sweep.
y_prob_rf = rf_model_label.predict_proba(X_test)[:, 1]

# Hard class predictions from the fitted pipeline on the test partition.
y_pred_rf = rf_model_label.predict(X_test)

In [11]:
# Per-class precision / recall / F1 for the model's default hard predictions.
print("===== RandomForest (Label Encoding) | threshold=0.5 =====")
print(classification_report(y_test, y_pred_rf))

# Threshold-independent ranking metrics, computed from the class-1 scores.
for metric_label, metric_fn in (
    ("ROC-AUC:", roc_auc_score),
    ("PR-AUC:", average_precision_score),
):
    print(metric_label, metric_fn(y_test, y_prob_rf))


===== RandomForest (Label Encoding) | threshold=0.5 =====
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    298339
           1       0.02      0.12      0.03      1079

    accuracy                           0.98    299418
   macro avg       0.51      0.55      0.51    299418
weighted avg       0.99      0.98      0.98    299418

ROC-AUC: 0.7874122309581576
PR-AUC: 0.01256038653552627


In [12]:
# Sweep several decision thresholds over the class-1 scores and print a
# classification report for each.
for t in (0.3, 0.4, 0.5, 0.6, 0.7):
    # Predict 1 when the score reaches the threshold, otherwise 0.
    y_pred_t = np.where(y_prob_rf >= t, 1, 0)
    print(f"\n===== Threshold = {t} =====")
    print(classification_report(y_test, y_pred_t))



===== Threshold = 0.3 =====
              precision    recall  f1-score   support

           0       1.00      0.94      0.97    298339
           1       0.02      0.25      0.03      1079

    accuracy                           0.94    299418
   macro avg       0.51      0.60      0.50    299418
weighted avg       0.99      0.94      0.97    299418


===== Threshold = 0.4 =====
              precision    recall  f1-score   support

           0       1.00      0.96      0.98    298339
           1       0.02      0.17      0.03      1079

    accuracy                           0.96    299418
   macro avg       0.51      0.56      0.50    299418
weighted avg       0.99      0.96      0.98    299418


===== Threshold = 0.5 =====
              precision    recall  f1-score   support

           0       1.00      0.98      0.99    298339
           1       0.02      0.12      0.03      1079

    accuracy                           0.98    299418
   macro avg       0.51      0.55      0.