# 항공사 데이터_회귀분석_Linear_Ridge_LogRidge

날씨 데이터 없이 **항공사 운항 정보만**으로 지연 시간(분)을 예측하고, Linear · Ridge · Log-Ridge 회귀 모델의 성능(MAE, RMSE, R²)을 비교한다.

In [None]:
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# Feature definitions
# ------------------------------------------------------------------
# Numeric predictors, fed to the model as-is.
num_cols = ["dep_hour", "dep_min", "dep_weekday", "is_weekend"]  # "dep_min" follows the CSV column name

# Categorical predictors (airline, origin, arrival code, flight type).
cat_cols = ["항공사", "출발지", "arrival_code", "flight_type"]

# ------------------------------------------------------------------
# Load the airline data (variant without weather columns) and keep
# only the rows that have a regression target (delay in minutes).
# ------------------------------------------------------------------
df = pd.read_csv(
    "new_flight_analysis_summary.csv",
    encoding="utf-8-sig",
).dropna(subset=["지연_분"])

print("✅ 회귀 대상 데이터 수:", len(df))

# ------------------------------------------------------------------
# Design matrix (numeric columns first, then categorical) and target
# ------------------------------------------------------------------
X = df.loc[:, [*num_cols, *cat_cols]]
y = df["지연_분"]


In [None]:

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns pass through unchanged; categorical columns are one-hot
# encoded. handle_unknown="ignore" makes categories that appear only in the
# test split encode as all zeros instead of raising at predict time.
prep = ColumnTransformer([
    ("num", "passthrough", num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
])

# "Log-Ridge" uses the same estimator settings as "Ridge"; it differs only in
# being trained on log1p(y) inside the loop below.
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Log-Ridge": Ridge(alpha=1.0)
}

# One dict of test-set metrics per model.
results = []

for name, model in models.items():
    pipe = Pipeline([("prep", prep), ("model", model)])
    if name == "Log-Ridge":
        # Fit on the log1p-transformed target, then map predictions back to
        # minutes with expm1 so every model is scored on the same scale.
        # NOTE(review): log1p is NaN/-inf for targets <= -1 (e.g. early
        # departures recorded as negative delays), which would make fit()
        # fail -- confirm the target is non-negative.
        pipe.fit(X_train, np.log1p(y_train))
        pred = np.expm1(pipe.predict(X_test))
    else:
        pipe.fit(X_train, y_train)
        pred = pipe.predict(X_test)

    results.append({
        "model": name,
        "MAE": mean_absolute_error(y_test, pred),
        # RMSE computed as sqrt(MSE): the `squared=False` keyword was
        # deprecated in scikit-learn 1.4 and later removed, so passing it
        # raises TypeError on current releases.
        "RMSE": np.sqrt(mean_squared_error(y_test, pred)),
        "R2": r2_score(y_test, pred)
    })

# Last expression of the cell: the notebook renders the comparison table.
pd.DataFrame(results)
