In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Single seed shared by every split and model below so reruns are reproducible.
RANDOM_STATE = 42

# ===== 1. Data collection =====
iris = load_iris(as_frame=True)
# iris.frame holds the feature columns plus a "target" column. Calling rename()
# without inplace=True returns a new DataFrame, so the frame stored on the
# `iris` Bunch is left untouched instead of being silently mutated.
df = iris.frame.rename(columns={"target": "label"})

# (Optional) Real-world data may contain NaNs/outliers, so check up front.
print("원본 데이터 형태:", df.shape)
print("결측치 개수:\n", df.isna().sum())

# ===== 2. Preprocessing (basic) =====
# Every feature in this example is numeric, so no one-hot encoding is needed.
X = df.drop(columns=["label"]).copy()
y = df["label"].copy().astype(int)

# ===== 3. Three-way split (Train / Val / Test) =====
def _stratified_split(features, labels, holdout_fraction):
    """Split features/labels into a kept part and a held-out part.

    The split is stratified on ``labels`` and seeded with RANDOM_STATE.
    Returns the 4-tuple produced by train_test_split:
    (features_kept, features_held_out, labels_kept, labels_held_out).
    """
    return train_test_split(
        features,
        labels,
        test_size=holdout_fraction,
        random_state=RANDOM_STATE,
        stratify=labels,
    )


# Step 1: separate Train+Val from Test (20% of all rows go to Test).
X_temp, X_test, y_temp, y_test = _stratified_split(X, y, 0.2)
# Step 2: Train vs Val (25% of the remaining 80% -> 20% of the whole).
X_train, X_val, y_train, y_val = _stratified_split(X_temp, y_temp, 0.25)
print("분할:", X_train.shape, X_val.shape, X_test.shape)

# ===== 4. Preprocessing (fitted on the training set only) =====
# (1) Missing-value imputation: the medians are learned from X_train and then
#     applied unchanged to the validation and test splits.
imputer = SimpleImputer(strategy="median")
X_train_imp = imputer.fit_transform(X_train)
X_val_imp, X_test_imp = (imputer.transform(part) for part in (X_val, X_test))

# (2) Scaling (not required for tree models; included as an example).
#     The scaler is likewise fitted on the imputed training data only.
scaler = StandardScaler()
X_train_ready = scaler.fit_transform(X_train_imp)
X_val_ready, X_test_ready = (
    scaler.transform(part) for part in (X_val_imp, X_test_imp)
)

# ===== 5-6. Create and train the baseline model (Train) =====
# Decision tree with default hyperparameters; fit() returns the estimator,
# so construction and training are chained into one statement.
clf = DecisionTreeClassifier(random_state=RANDOM_STATE).fit(X_train_ready, y_train)

# ===== 7. Validation =====
# Score the baseline on the held-out validation split.
val_pred = clf.predict(X_val_ready)
val_acc = accuracy_score(y_val, val_pred)
print(f"[기준모델] Validation Accuracy: {val_acc:.4f}")

# ===== 8. Improvement (simple hyperparameter search) =====
param_grid = {
    "max_depth": [None, 2, 3, 4, 5, 8],
    "min_samples_split": [2, 4, 8, 16]
}

# Flatten the grid into one list of keyword dicts. max_depth varies slowest
# and min_samples_split fastest, i.e. the order of a nested for-loop.
candidates = [
    {"max_depth": depth, "min_samples_split": min_split}
    for depth in param_grid["max_depth"]
    for min_split in param_grid["min_samples_split"]
]

best_params, best_score = None, -1
for params in candidates:
    model = DecisionTreeClassifier(random_state=RANDOM_STATE, **params)
    model.fit(X_train_ready, y_train)
    score = accuracy_score(y_val, model.predict(X_val_ready))
    # Strict ">" keeps the earliest candidate when validation scores tie.
    if score > best_score:
        best_params, best_score = params, score

print("[탐색] Best params:", best_params, "Val Acc:", round(best_score, 4))

# ===== 9. Retrain on Train+Val, then evaluate on Test =====
# Stack the already-preprocessed train and validation rows; the labels are
# concatenated in the same order so rows and labels stay aligned.
X_trainval_ready = np.concatenate((X_train_ready, X_val_ready), axis=0)
y_trainval = pd.concat([y_train, y_val])

# Final model uses the hyperparameters selected on the validation split.
final_clf = DecisionTreeClassifier(random_state=RANDOM_STATE, **best_params)
final_clf.fit(X_trainval_ready, y_trainval)

# The test split is used only here, for the final report.
test_pred = final_clf.predict(X_test_ready)
test_acc = accuracy_score(y_test, test_pred)
test_cm = confusion_matrix(y_test, test_pred)
test_report = classification_report(
    y_test, test_pred, target_names=iris.target_names
)
print(f"[최종모델] Test Accuracy: {test_acc:.4f}")
print("Confusion Matrix:\n", test_cm)
print("Classification Report:\n", test_report)

원본 데이터 형태: (150, 5)
결측치 개수:
 sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
label                0
dtype: int64
분할: (90, 4) (30, 4) (30, 4)
[기준모델] Validation Accuracy: 0.9333
[탐색] Best params: {'max_depth': None, 'min_samples_split': 2} Val Acc: 0.9333
[최종모델] Test Accuracy: 0.9333
Confusion Matrix:
 [[10  0  0]
 [ 0  9  1]
 [ 0  1  9]]
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.90      0.90      0.90        10
   virginica       0.90      0.90      0.90        10

    accuracy                           0.93        30
   macro avg       0.93      0.93      0.93        30
weighted avg       0.93      0.93      0.93        30



In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler






from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

