In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# ---------- 분류용 모델 (Classification) ----------
# from sklearn.tree import DecisionTreeClassifier        # 분류
# from sklearn.ensemble import RandomForestClassifier    # 분류
# from sklearn.linear_model import LogisticRegression    # 분류
# from sklearn.metrics import accuracy_score             # 분류 평가용

# ---------- 회귀용 모델 (Regression) ----------
from sklearn.tree import DecisionTreeRegressor          # 회귀
from sklearn.ensemble import RandomForestRegressor      # 회귀
from sklearn.linear_model import LinearRegression       # 회귀
from sklearn.metrics import mean_squared_error          # 회귀 평가용

# -----------------------------
# 1) Data preparation
# -----------------------------
# Read the CSV and discard every row that contains a missing value (NaN).
# NOTE(review): dropna() only removes NaN; if this file encodes missing
# measurements as 0, those rows are kept — verify against the data.
csv_path = "/content/drive/MyDrive/Colab Notebooks/diabetes.csv"
df = pd.read_csv(csv_path).dropna()

# BMI is a continuous value, so predicting it is a regression problem.
target_col = "BMI"
y = df[target_col]

# Features: every column except the target and "Outcome".
X = df.drop(columns=["Outcome", target_col])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
# `stratify=y` is intentionally not passed: stratification is meant for class
# labels and is not applicable to a continuous regression target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# -----------------------------
# 2) Model setup
# -----------------------------
# Classification counterparts, kept for reference (they need the
# commented-out classifier imports at the top of the cell):
# dt = DecisionTreeClassifier(random_state=42)
# rf = RandomForestClassifier(n_estimators=200, random_state=42)
# lr = LogisticRegression(max_iter=500)

# Regression estimators. The two tree-based models share one seed so that
# repeated runs build the same trees; LinearRegression takes no seed.
MODEL_SEED = 42
dt = DecisionTreeRegressor(random_state=MODEL_SEED)
rf = RandomForestRegressor(random_state=MODEL_SEED, n_estimators=200)
lr = LinearRegression()

# -----------------------------
# 3) Model training
# -----------------------------
# Fit each estimator on the same training split, in the order dt -> rf -> lr.
for estimator in (dt, rf, lr):
    estimator.fit(X_train, y_train)

# -----------------------------
# 4) Model evaluation
# -----------------------------
# Classification counterparts, kept for reference:
# dt_acc = accuracy_score(y_test, dt.predict(X_test))
# rf_acc = accuracy_score(y_test, rf.predict(X_test))
# lr_acc = accuracy_score(y_test, lr.predict(X_test))

# Regression: mean squared error of each fitted model on the held-out
# test split (lower is better). Evaluated in the order dt -> rf -> lr.
dt_mse, rf_mse, lr_mse = (
    mean_squared_error(y_test, fitted.predict(X_test))
    for fitted in (dt, rf, lr)
)

# One print call; sep="\n" puts the header and each score on its own line.
print(
    "=== Test MSE ===",
    f"Decision Tree : {dt_mse:.4f}",
    f"Random Forest : {rf_mse:.4f}",
    f"Linear Reg.   : {lr_mse:.4f}",
    sep="\n",
)

=== Test MSE ===
Decision Tree : 104.1275
Random Forest : 46.7562
Linear Reg.   : 52.2406


In [None]:
# Display the column labels of the `df` currently bound in the kernel.
# NOTE(review): the output saved with this cell lists columns such as 'MedInc'
# and 'PRICE' and contains neither "BMI" nor "Outcome", both of which the
# first cell reads from `df` — it presumably came from a different DataFrame
# in an earlier session. Restart the kernel and re-run to refresh it.
df.columns

Index(['Unnamed: 0', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
       'Population', 'AveOccup', 'Latitude', 'Longitude', 'PRICE'],
      dtype='object')