<a href="https://colab.research.google.com/github/Iktaik-Kim/MCP/blob/main/AI_%EB%B9%84%EA%B5%90.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 필요한 패키지: pandas, numpy, scikit-learn, (가능하면 xgboost)
import pandas as pd, numpy as np
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor

# 1) Load the data
# The workbook is read exactly once. Previously it was read a second time from
# a hard-coded Windows desktop path, which raised FileNotFoundError in Colab
# and threw away the file that had just been uploaded.
from pathlib import Path

import pandas as pd

DATA_FILE = "AI comparison.xlsx"
LOCAL_FALLBACK = Path("C:/Users/user/Desktop") / DATA_FILE

try:
    from google.colab import files
except ImportError:
    # Not running in Colab: there is no upload widget to use.
    uploaded = {}
else:
    uploaded = files.upload()

# files.upload() returns a dict keyed by file name; prefer the name it reports
# over the expected one so a differently named upload is still picked up.
excel_uploads = [
    name for name in uploaded if name.lower().endswith((".xlsx", ".xls"))
]
if excel_uploads:
    data_path = Path(excel_uploads[0])
elif Path(DATA_FILE).exists():
    data_path = Path(DATA_FILE)
else:
    # Last resort: the original local (non-Colab) location.
    data_path = LOCAL_FALLBACK

df = pd.read_excel(data_path)
# Normalise headers to stripped strings so the substring matching below is safe.
df.columns = [str(c).strip() for c in df.columns]
df.head()

# Column mapping (tolerates case and parenthesis variations in the headers)
def find_col(keys):
    """Return the first column of the global ``df`` whose lower-cased name
    contains any of the substrings in ``keys``, or ``None`` if none matches.

    Columns are scanned in their DataFrame order; ``keys`` are compared as
    given, so callers pass them already lower-cased.
    """
    matches = (
        name
        for name in df.columns
        if any(key in name.lower() for key in keys)
    )
    return next(matches, None)

# Resolve the three columns the analysis needs.
col_no = find_col(["no"])
col_a = find_col(["price", "basis"])
# Search the descriptive keys first. A loose key such as " b" is also a
# substring of names like "price basis", so testing every key in one pass can
# return column A whenever it appears before column B.
col_b = (
    find_col(["present", "presenting"])
    or find_col([" b", ")b", "(b", " b)"])
    or "B"
)

# Validate with an explicit exception: asserts are stripped under `python -O`,
# and the "B" fallback made the previous truthiness check always pass for
# col_b even when no such column exists.
missing = [
    label
    for label, col in (("No", col_no), ("A", col_a), ("B", col_b))
    if col not in df.columns
]
if missing or col_a == col_b:
    raise ValueError(f"컬럼 확인: {df.columns}")

def to_num(s):
    """Convert a Series of formatted numbers to numeric values.

    Every value is rendered as text, the characters ",", " ", "₩" and "%" are
    removed, and the remainder is parsed with ``pd.to_numeric``. Anything that
    still fails to parse becomes NaN.
    """
    text = s.astype(str)
    for symbol in (",", " ", "₩", "%"):
        # Literal (non-regex) removal of one formatting character at a time.
        text = text.str.replace(symbol, "", regex=False)
    return pd.to_numeric(text, errors="coerce")

# Parse the feature (A) and target (B) columns into numbers.
A, B = (to_num(df[name]) for name in (col_a, col_b))

# The "No" column may hold labels instead of numbers. If not a single value
# parses, fall back to integer codes produced by pd.factorize.
no_raw = df[col_no].astype(str)
no_num = pd.to_numeric(no_raw, errors="coerce")
if no_num.isna().all():
    no_num = pd.Series(pd.factorize(no_raw)[0], index=df.index)

# Keep only the rows where all three values are present.
data = pd.DataFrame({"A": A, "No": no_num, "B": B})
data = data.dropna()

X = data.loc[:, ["A", "No"]].copy()
y = data.loc[:, "B"].copy()

# 2) Shared validation setup: one shuffled 5-fold splitter with a fixed seed,
#    reused by every model below.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def report(y_true, y_pred):
    """Return a dict with MAE, RMSE, MAPE (in percent) and R2.

    Args:
        y_true: observed target values (array-like or pandas Series).
        y_pred: predicted values aligned with ``y_true``.

    Returns:
        dict with the keys ``MAE``, ``RMSE``, ``MAPE`` and ``R2``.
    """
    # Observations whose true value is 0 have no defined percentage error;
    # mask them as NaN and average over the rest. np.nanmean is required
    # because np.mean propagates the NaN for plain ndarray inputs.
    abs_pct_err = np.abs(
        (y_true - y_pred) / np.where(y_true == 0, np.nan, y_true)
    )
    return dict(
        MAE=mean_absolute_error(y_true, y_pred),
        # Take the square root explicitly: the `squared=False` keyword of
        # mean_squared_error was deprecated and later removed in scikit-learn,
        # while sqrt(MSE) works on every version.
        RMSE=float(np.sqrt(mean_squared_error(y_true, y_pred))),
        MAPE=np.nanmean(abs_pct_err) * 100,
        R2=r2_score(y_true, y_pred),
    )

# 3) OLS
ols = LinearRegression()
# Metrics come from the cross-validated predictions; the fit on the full data
# is only used to read off the coefficients for the equation string.
ols_pred = cross_val_predict(ols, X, y, cv=cv, n_jobs=-1)
ols.fit(X, y)
ols_metrics = report(y, ols_pred)
# Carry the sign in the operator so a negative coefficient prints as
# "- 0.5·A" rather than "+ -0.5·A" (same convention as surrogate_equation).
ols_eq = f"B = {ols.intercept_:,.6f}" + "".join(
    (" + " if coef >= 0 else " - ") + f"{abs(coef):,.6f}·{term}"
    for coef, term in zip(ols.coef_, ("A", "No"))
)

# 4) Random Forest
rf = RandomForestRegressor(
    n_estimators=400,
    random_state=42,
    n_jobs=-1,
)
# Score the model on cross-validated predictions.
rf_pred = cross_val_predict(rf, X, y, cv=cv, n_jobs=-1)
rf_metrics = report(y, rf_pred)

# Fit on the full data to obtain the per-feature importances.
rf.fit(X, y)
rf_importance = {
    feature: weight
    for feature, weight in zip(X.columns, rf.feature_importances_)
}

# 5) XGBoost (falls back to scikit-learn's HistGradientBoosting if unavailable)
try:
    from xgboost import XGBRegressor

    booster_params = {
        "n_estimators": 800,
        "learning_rate": 0.05,
        "max_depth": 4,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "reg_lambda": 1.0,
        "random_state": 42,
        "objective": "reg:squarederror",
        "tree_method": "hist",
    }
    xgb = XGBRegressor(**booster_params)
    backend = "xgboost"
except Exception:
    # Best-effort fallback: any failure to import or construct XGBoost
    # switches to the scikit-learn implementation instead of stopping the run.
    xgb = HistGradientBoostingRegressor(
        max_depth=4,
        learning_rate=0.05,
        max_iter=800,
        random_state=42,
    )
    backend = "sklearn_HistGradientBoosting"

# Score on cross-validated predictions, then fit on the full data.
xgb_pred = cross_val_predict(xgb, X, y, cv=cv, n_jobs=-1)
xgb_metrics = report(y, xgb_pred)
xgb.fit(X, y)

# 6) Surrogate regression equations (Lasso on degree-2 polynomial features)
poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
Phi = poly.transform(X)
# Term names in column order: ['1', 'A', 'No', 'A^2', 'A No', 'No^2']
terms = poly.get_feature_names_out(["A", "No"])

def surrogate_equation(fitted_model, name=""):
    """Describe a fitted model with a sparse degree-2 polynomial equation.

    A LassoCV is fitted on the global polynomial features ``Phi`` to reproduce
    ``fitted_model.predict(X)``; terms whose coefficient magnitude is at most
    1e-8 are left out of the result.

    Args:
        fitted_model: an estimator already fitted on the global ``X``.
        name: label used as the subscript of ŷ in the returned string.

    Returns:
        The equation as a string, e.g. "ŷ_RF = 1.000000 + 2.000000·A".
    """
    canonical = ("1", "A", "No", "A^2", "A No", "No^2")

    surrogate_target = fitted_model.predict(X)
    lasso = LassoCV(
        cv=5,
        alphas=np.logspace(-6, 2, 60),
        random_state=42,
        max_iter=10000,
        n_jobs=-1,
    )
    lasso.fit(Phi, surrogate_target)

    kept = {
        term: weight
        for term, weight in zip(terms, lasso.coef_)
        if abs(weight) > 1e-8
    }

    # Canonical terms first, in a fixed readable order, then any others.
    ordered = [term for term in canonical if term in kept]
    ordered += [term for term in kept if term not in canonical]

    pieces = [f"ŷ_{name} = {lasso.intercept_:,.6f}"]
    for term in ordered:
        sign = " + " if kept[term] >= 0 else " - "
        pieces.append(f"{sign}{abs(kept[term]):,.6f}·{term}")

    # A surviving bias term would read "c·1"; drop the redundant "·1".
    return "".join(pieces).replace("·1", "")

# Build the surrogate equation strings for both tree-based models.
rf_eq, xgb_eq = (
    surrogate_equation(model, label)
    for model, label in ((rf, "RF"), (xgb, "XGB"))
)

# 7) Summary table
import pandas as pd

# One row per model: its display label followed by its metric dict.
scoreboard = (
    ("OLS (Linear)", ols_metrics),
    ("Random Forest", rf_metrics),
    (f"XGBoost ({backend})", xgb_metrics),
)
summary = pd.DataFrame(
    [{"Model": label, **scores} for label, scores in scoreboard]
)
print(summary.round(4))

# Equations and importances, each printed under its own heading.
for heading, body in (
    ("\n[OLS 식]\n", ols_eq),
    ("\n[RF 근사회귀식]\n", rf_eq),
    ("\n[XGB 근사회귀식]\n", xgb_eq),
    ("\n[RF 중요도]\n", rf_importance),
):
    print(heading, body)
