In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import shap

In [None]:
# -------------------------------------------------------------------
# 1. Sample data generation
# -------------------------------------------------------------------
def generate_sample_data(n=1200, seed=42):
    """Build a synthetic daily dataset of planned vs. actual progress.

    Parameters
    ----------
    n : int, default 1200
        Number of consecutive daily rows, starting at 2021-01-01.
    seed : int, default 42
        Seed for the private random generator used for every random column.

    Returns
    -------
    pandas.DataFrame
        One row per day with the columns ``Date``, ``PlannedProgress``,
        ``ActualProgress``, ``LaborCnt``, ``EquipHours``, ``Rain``, ``Wind``,
        ``Temp`` and ``DelayDays`` (clipped so it is never negative).
    """
    # A private legacy RandomState produces the same stream that
    # np.random.seed(seed) followed by np.random.* calls would, but it does
    # not reseed the process-wide generator that other cells may rely on.
    # The draw order below is kept fixed so the generated values stay stable.
    rng = np.random.RandomState(seed)
    dates = pd.date_range("2021-01-01", periods=n)

    planned = np.linspace(1, 100, n)
    # Plan minus zero-mean Gaussian noise minus a uniform [0, 3) shortfall.
    actual = planned - rng.normal(0, 5, n) - rng.uniform(0, 3, n)
    # Delay scales with the progress gap, plus noise, floored at zero.
    delay_days = np.maximum(0, (planned - actual) * 0.2 + rng.normal(0, 1, n))

    return pd.DataFrame({
        "Date": dates,
        "PlannedProgress": planned,
        "ActualProgress": actual,
        "LaborCnt": rng.randint(10, 60, n),
        "EquipHours": rng.uniform(5, 20, n),
        "Rain": rng.exponential(1, n),
        "Wind": rng.uniform(0, 10, n),
        "Temp": rng.uniform(-5, 35, n),
        "DelayDays": delay_days
    })


In [None]:
# -------------------------------------------------------------------
# 2. Preprocessing & derived features
# -------------------------------------------------------------------
def preprocess(df):
    """Return a copy of ``df`` with calendar and derived feature columns added.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``PlannedProgress``, ``ActualProgress``,
        ``Rain``, ``Wind``, ``EquipHours`` and ``LaborCnt``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Date`` parsed to datetime and the added columns
        ``Month``, ``IsWeekend`` (0/1), ``DelayGap``, ``WeatherIndex`` and
        ``ResourceRatio``.

    Raises
    ------
    KeyError
        If any required input column is missing.
    """
    required = [
        "Date", "PlannedProgress", "ActualProgress",
        "Rain", "Wind", "EquipHours", "LaborCnt",
    ]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"preprocess: missing required column(s): {missing}")

    # Work on a copy so the caller's frame is never mutated.
    out = df.copy()

    # Calendar features
    out["Date"] = pd.to_datetime(out["Date"])
    out["Month"] = out["Date"].dt.month
    # weekday() is 5 for Saturday and 6 for Sunday; stored as 0/1.
    out["IsWeekend"] = (out["Date"].dt.weekday >= 5).astype(int)

    # Derived: gap between planned and actual progress
    out["DelayGap"] = out["PlannedProgress"] - out["ActualProgress"]

    # Weather index (as defined in the PDF)
    out["WeatherIndex"] = out["Rain"] + out["Wind"] * 0.5

    # Resource ratio; the +1 presumably guards against a zero labor count
    out["ResourceRatio"] = out["EquipHours"] / (out["LaborCnt"] + 1)

    return out


In [None]:
# -------------------------------------------------------------------
# 3. Baseline: Linear Regression
# -------------------------------------------------------------------
def baseline_linear_regression(X_train, X_test, y_train, y_test):
    """Fit a ``LinearRegression`` baseline and score it on the test split.

    Returns
    -------
    tuple
        ``(model, rmse, r2, mae)`` where the three metrics are computed from
        the model's predictions on ``X_test`` against ``y_test``.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_hat = regressor.predict(X_test)

    scores = (
        np.sqrt(mean_squared_error(y_test, y_hat)),  # RMSE
        r2_score(y_test, y_hat),
        mean_absolute_error(y_test, y_hat),
    )
    return (regressor, *scores)


In [None]:
# -------------------------------------------------------------------
# 4. RandomForest model
# -------------------------------------------------------------------
def random_forest_model(X_train, X_test, y_train, y_test):
    """Fit a ``RandomForestRegressor`` and score it on the test split.

    The forest uses 300 trees, a maximum depth of 10 and ``random_state=42``.

    Returns
    -------
    tuple
        ``(model, rmse, r2, mae)`` computed from predictions on ``X_test``.
    """
    forest_params = {"n_estimators": 300, "max_depth": 10, "random_state": 42}
    forest = RandomForestRegressor(**forest_params)
    forest.fit(X_train, y_train)
    y_hat = forest.predict(X_test)

    root_mse = np.sqrt(mean_squared_error(y_test, y_hat))
    return forest, root_mse, r2_score(y_test, y_hat), mean_absolute_error(y_test, y_hat)

In [None]:

# -------------------------------------------------------------------
# 5. XGBoost (best model according to the PDF)
# -------------------------------------------------------------------
def xgboost_model(X_train, X_test, y_train, y_test):
    """Fit an ``XGBRegressor`` and score it on the test split.

    Returns
    -------
    tuple
        ``(model, rmse, r2, mae)`` computed from predictions on ``X_test``.
    """
    hyperparams = dict(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    )
    regressor = XGBRegressor(**hyperparams)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)

    # Evaluate MSE, R2 and MAE in one pass; RMSE is the square root of the MSE.
    metric_fns = (mean_squared_error, r2_score, mean_absolute_error)
    mse, r2, mae = (fn(y_test, predictions) for fn in metric_fns)

    return regressor, np.sqrt(mse), r2, mae


In [None]:
# -------------------------------------------------------------------
# 6. Full pipeline run
# -------------------------------------------------------------------
def run_pipeline(csv_path=None):
    """Train and compare three regressors, then explain XGBoost with SHAP.

    The whole function lives in a single cell: splitting the body across
    cells leaves the first cell with a truncated function and makes the
    second cell fail with an indentation / ``return``-outside-function error.

    Parameters
    ----------
    csv_path : str or path-like, optional
        CSV file passed to ``pd.read_csv``. When falsy, sample data from
        ``generate_sample_data()`` is used instead.

    Returns
    -------
    dict
        ``"df"``: the preprocessed frame; ``"models"``: the fitted estimators
        under ``"LR"``, ``"RF"`` and ``"XGB"``; ``"metrics"``: per-model
        ``rmse`` / ``r2`` / ``mae`` values under the same three keys.

    Raises
    ------
    KeyError
        If a feature column or the ``DelayDays`` target is missing after
        preprocessing.

    Side effects
    ------------
    Prints a metric comparison and displays a SHAP summary plot.
    """
    # 1) Load the CSV when a path is given; otherwise generate sample data
    df = pd.read_csv(csv_path) if csv_path else generate_sample_data()

    # 2) Preprocess
    df = preprocess(df)

    # 3) Feature selection (per the PDF)
    features = [
        "DelayGap", "Rain", "Wind", "EquipHours", "LaborCnt",
        "WeatherIndex", "Month", "ResourceRatio", "IsWeekend"
    ]
    target = "DelayDays"

    # Fail early with a readable message when a user-supplied CSV lacks columns.
    missing = [col for col in features + [target] if col not in df.columns]
    if missing:
        raise KeyError(f"run_pipeline: missing required column(s): {missing}")

    X = df[features]
    y = df[target]

    # Train / test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Baseline
    lr, lr_rmse, lr_r2, lr_mae = baseline_linear_regression(X_train, X_test, y_train, y_test)

    # RandomForest
    rf, rf_rmse, rf_r2, rf_mae = random_forest_model(X_train, X_test, y_train, y_test)

    # XGBoost
    xgb, xgb_rmse, xgb_r2, xgb_mae = xgboost_model(X_train, X_test, y_train, y_test)

    # Print the performance summary
    print("\n=== 모델 성능 비교 ===")
    print(f"Linear Regression: RMSE={lr_rmse:.2f}, R2={lr_r2:.2f}, MAE={lr_mae:.2f}")
    print(f"RandomForest:      RMSE={rf_rmse:.2f}, R2={rf_r2:.2f}, MAE={rf_mae:.2f}")
    print(f"XGBoost (Best):    RMSE={xgb_rmse:.2f}, R2={xgb_r2:.2f}, MAE={xgb_mae:.2f}")

    # ---------------------------------------------------------------
    # 7. SHAP-based XAI (PDF section 7)
    # ---------------------------------------------------------------
    explainer = shap.TreeExplainer(xgb)
    shap_values = explainer.shap_values(X_test)

    print("\nSHAP Summary Plot 생성중...")
    # show=False defers rendering so the layout can be tightened first.
    shap.summary_plot(shap_values, X_test, show=False)
    plt.tight_layout()
    plt.show()

    return {
        "df": df,
        "models": {
            "LR": lr, "RF": rf, "XGB": xgb
        },
        # Numeric metrics, so callers do not have to parse the printed summary.
        "metrics": {
            "LR": {"rmse": lr_rmse, "r2": lr_r2, "mae": lr_mae},
            "RF": {"rmse": rf_rmse, "r2": rf_r2, "mae": rf_mae},
            "XGB": {"rmse": xgb_rmse, "r2": xgb_r2, "mae": xgb_mae},
        },
    }


# ----------------------------------------------------------
# Run
# ----------------------------------------------------------
if __name__ == "__main__":
    # Keep whatever run_pipeline() returns so later cells can inspect it
    # instead of discarding it.
    results = run_pipeline()

