# 02_train
- Author: 
- Date: 2025-10-23
- Goal: 모델링/학습/통계/시각화
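- Input: data/processed/colored_mnist/colored_mnist.npz (produced by 01_preprocessing_colored_mnist.ipynb)
- Output: results/models/*.joblib, results/metrics/classical_ml_summary.csv, results/figures/*.png
- Metrics: accuracy, macro precision / recall / F1 on train_sub, val, and test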
- Repro: seed=42, device=auto, config=../configs/


In [24]:
# ============================================================
# 02_train_classical_ml.ipynb
#
# Classical ML on Colored MNIST  (NO Neural Networks)
#
# Models (ALL REQUIRED):
#   - KNN
#   - Decision Tree
#   - Random Forest
#   - XGBoost
#
# Objectives:
#   1) Use preprocessed Colored MNIST from 01_preprocessing_colored_mnist.ipynb
#   2) Clear split:
#        - Train: 모델 학습
#        - Val  : 학습 상태/튜닝/학습곡선용 내부 검증
#        - Test : 최종 일반화 성능 평가
#   3) Save trained models (.joblib) so we don't retrain every run
#   4) Generate:
#        - Metrics (train/val/test)
#        - Confusion matrices (val/test)
#        - Learning curves (train vs val)
#        - Feature importance maps (RF, XGB)
#
# Notes:
#   - No Neural Networks, No SVM, No Logistic Regression
#   - KNN uses scaled features
#   - Tree / RF / XGB use raw features
#   - All plots use English labels only
# ============================================================

In [25]:
# ------------------------------------------------------------
# [Cell 1] Imports & environment setup
# ------------------------------------------------------------

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

from sklearn.model_selection import train_test_split

import joblib  # for saving/loading trained models

# XGBoost is required: if the import fails, install it with pip and retry.
try:
    from xgboost import XGBClassifier
    print("[OK] XGBoost is already installed.")
except ImportError:
    print("[INFO] xgboost not found. Installing...")
    import subprocess, sys
    # Invoke pip through the interpreter that is running this code
    # (sys.executable); check_call raises if the install command fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    from xgboost import XGBClassifier
    print("[OK] XGBoost installed.")

# Plot settings (English labels only)
plt.rcParams["font.family"] = "DejaVu Sans"
plt.rcParams["axes.unicode_minus"] = False

# Single seed reused for NumPy's global RNG and for the estimators'
# random_state arguments in later cells.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("[OK] Environment ready.")

[OK] XGBoost is already installed.
[OK] Environment ready.


In [26]:
# ------------------------------------------------------------
# [Cell 2] Path configuration
#   - Works whether the kernel starts in notebooks/ or in the repo root
#   - Prepares the results/ directories (best kept out of Git tracking)
# ------------------------------------------------------------

cwd = os.getcwd()

# When launched from notebooks/, the project root is the parent directory.
BASE_DIR = os.path.dirname(cwd) if os.path.basename(cwd) == "notebooks" else cwd

NPZ_PATH = os.path.join(BASE_DIR, "data", "processed", "colored_mnist", "colored_mnist.npz")

RESULTS_DIR = os.path.join(BASE_DIR, "results")
RESULTS_METRICS_DIR, RESULTS_FIGURES_DIR, RESULTS_MODELS_DIR = (
    os.path.join(RESULTS_DIR, leaf) for leaf in ("metrics", "figures", "models")
)

# Create every output directory up front; existing directories are left alone.
for _out_dir in (RESULTS_METRICS_DIR, RESULTS_FIGURES_DIR, RESULTS_MODELS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

for _label, _path in (
    ("BASE_DIR        ", BASE_DIR),
    ("NPZ_PATH        ", NPZ_PATH),
    ("RESULTS_DIR     ", RESULTS_DIR),
    ("METRICS_DIR     ", RESULTS_METRICS_DIR),
    ("FIGURES_DIR     ", RESULTS_FIGURES_DIR),
    ("MODELS_DIR      ", RESULTS_MODELS_DIR),
):
    print(f"[INFO] {_label}:", _path)

[INFO] BASE_DIR        : /Users/jaehun_jung/colored-mnist-classification
[INFO] NPZ_PATH        : /Users/jaehun_jung/colored-mnist-classification/data/processed/colored_mnist/colored_mnist.npz
[INFO] RESULTS_DIR     : /Users/jaehun_jung/colored-mnist-classification/results
[INFO] METRICS_DIR     : /Users/jaehun_jung/colored-mnist-classification/results/metrics
[INFO] FIGURES_DIR     : /Users/jaehun_jung/colored-mnist-classification/results/figures
[INFO] MODELS_DIR      : /Users/jaehun_jung/colored-mnist-classification/results/models


In [27]:
# ------------------------------------------------------------
# [Cell 3] Load preprocessed dataset
#   - Uses the output of 01_preprocessing_colored_mnist.ipynb
#   - The first-level train/test split has already been done there
# ------------------------------------------------------------

if not os.path.exists(NPZ_PATH):
    raise FileNotFoundError(
        f"[ERROR] {NPZ_PATH} not found.\n"
        "Run 01_preprocessing_colored_mnist.ipynb first."
    )

data = np.load(NPZ_PATH)

# Scaled features: for distance-based models such as KNN
X_train = data["X_train"]
X_test = data["X_test"]

# Raw features: for Tree / RF / XGB
X_train_raw = data["X_train_raw"]
X_test_raw = data["X_test_raw"]

# Labels
y_digit_train = data["y_digit_train"]
y_digit_test = data["y_digit_test"]
y_fg_train = data["y_fg_train"]
y_fg_test = data["y_fg_test"]
y_bg_train = data["y_bg_train"]
y_bg_test = data["y_bg_test"]

# Consistency checks: each label vector must have one entry per sample.
assert X_train.shape[0] == y_digit_train.shape[0] == y_fg_train.shape[0] == y_bg_train.shape[0]
assert X_test.shape[0] == y_digit_test.shape[0] == y_fg_test.shape[0] == y_bg_test.shape[0]
# The training cell indexes the scaled and raw matrices with the same row
# indices, so the two representations must have the same number of rows.
assert X_train_raw.shape[0] == X_train.shape[0], "X_train_raw / X_train row count mismatch"
assert X_test_raw.shape[0] == X_test.shape[0], "X_test_raw / X_test row count mismatch"

print("[OK] Loaded colored_mnist.npz")
print("  X_train     :", X_train.shape)
print("  X_test      :", X_test.shape)
print("  X_train_raw :", X_train_raw.shape)
print("  X_test_raw  :", X_test_raw.shape)
print("  y_digit     :", y_digit_train.shape, y_digit_test.shape)
print("  y_fg        :", y_fg_train.shape, y_fg_test.shape)
print("  y_bg        :", y_bg_train.shape, y_bg_test.shape)

[OK] Loaded colored_mnist.npz
  X_train     : (67093, 2352)
  X_test      : (16774, 2352)
  X_train_raw : (67093, 2352)
  X_test_raw  : (16774, 2352)
  y_digit     : (67093,) (16774,)
  y_fg        : (67093,) (16774,)
  y_bg        : (67093,) (16774,)


In [28]:
# ------------------------------------------------------------
# [Cell 4] Task configuration
#   - One dict entry per task so all three tasks share a common structure
# ------------------------------------------------------------

COLOR_NAMES = ["RED", "ORANGE", "YELLOW", "GREEN", "BLUE", "INDIGO", "VIOLET"]

# (task name, train labels, test labels, class names), in processing order.
# Both colour tasks share the same COLOR_NAMES list.
_task_specs = (
    ("digit", y_digit_train, y_digit_test, list(map(str, range(10)))),
    ("fg_color", y_fg_train, y_fg_test, COLOR_NAMES),
    ("bg_color", y_bg_train, y_bg_test, COLOR_NAMES),
)

tasks = {
    name: {"y_train": labels_train, "y_test": labels_test, "class_names": names}
    for name, labels_train, labels_test, names in _task_specs
}

print("[OK] Tasks:", [*tasks])

[OK] Tasks: ['digit', 'fg_color', 'bg_color']


In [29]:
# ------------------------------------------------------------
# [Cell 5] Model factory
#   - Shared hyperparameter settings
#   - use_raw_features: states explicitly which X each model consumes
# ------------------------------------------------------------

def get_models(random_state: int = RANDOM_SEED):
    """
    Build one unfitted estimator per model family.

    Returns:
      dict: models[model_name] = (model_instance, use_raw_features_flag).
      The flag is False for KNN (scaled features) and True for
      DecisionTree / RandomForest / XGBoost (raw features).
    """
    knn = KNeighborsClassifier(n_neighbors=7, weights="distance", n_jobs=-1)

    tree = DecisionTreeClassifier(
        max_depth=25,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=random_state,
    )

    forest = RandomForestClassifier(
        n_estimators=300,
        max_depth=30,
        min_samples_split=4,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=random_state,
    )

    # num_class is not set here; it is supplied per task when the
    # estimator is rebuilt for training.
    booster = XGBClassifier(
        objective="multi:softprob",
        n_estimators=400,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.9,
        colsample_bytree=0.9,
        tree_method="hist",
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=random_state,
    )

    return {
        "KNN": (knn, False),
        "DecisionTree": (tree, True),
        "RandomForest": (forest, True),
        "XGBoost": (booster, True),
    }

print("[OK] Models defined:", list(get_models().keys()))

[OK] Models defined: ['KNN', 'DecisionTree', 'RandomForest', 'XGBoost']


In [30]:
# ------------------------------------------------------------
# [Cell 6] Config: model saving & control
#   - USE_SAVED_MODELS:
#       True  -> load the saved final model when one exists
#                (only the test evaluation is redone for it)
#       False -> always train from scratch
#   - FORCE_RETRAIN:
#       True  -> ignore saved models, always retrain and overwrite
# ------------------------------------------------------------

USE_SAVED_MODELS = True
FORCE_RETRAIN = False

print(f"[CONFIG] USE_SAVED_MODELS: {USE_SAVED_MODELS}")
print(f"[CONFIG] FORCE_RETRAIN   : {FORCE_RETRAIN}")

[CONFIG] USE_SAVED_MODELS: True
[CONFIG] FORCE_RETRAIN   : False


In [31]:
# ------------------------------------------------------------
# [Cell 7] Utility: metrics, confusion matrix, learning curve
# ------------------------------------------------------------

def compute_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, F1); the last three are macro-averaged."""
    # zero_division=0 scores a class with no predictions as 0 instead of warning.
    macro_kwargs = {"average": "macro", "zero_division": 0}
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, **macro_kwargs),
        recall_score(y_true, y_pred, **macro_kwargs),
        f1_score(y_true, y_pred, **macro_kwargs),
    )


def plot_confusion_matrix(cm, class_names, title, save_path):
    """Render `cm` as an annotated heatmap and write it to `save_path`.

    `class_names` labels both axes (rows = true class, columns = predicted
    class). The figure is closed after saving, so nothing is shown inline.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    heatmap_kwargs = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        cbar=False,
    )
    sns.heatmap(cm, ax=ax, **heatmap_kwargs)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(save_path, dpi=200)
    plt.close(fig)
    print(f"[OK] Saved CM → {save_path}")


def _lc_new_estimator(model_name, base_model, num_class, random_state):
    """Return a fresh, unfitted estimator that copies base_model's hyperparameters."""
    if model_name == "KNN":
        return KNeighborsClassifier(
            n_neighbors=base_model.n_neighbors,
            weights=base_model.weights,
            n_jobs=-1,
        )
    if model_name == "DecisionTree":
        return DecisionTreeClassifier(
            max_depth=base_model.max_depth,
            min_samples_split=base_model.min_samples_split,
            min_samples_leaf=base_model.min_samples_leaf,
            random_state=random_state,
        )
    if model_name == "RandomForest":
        return RandomForestClassifier(
            n_estimators=base_model.n_estimators,
            max_depth=base_model.max_depth,
            min_samples_split=base_model.min_samples_split,
            min_samples_leaf=base_model.min_samples_leaf,
            n_jobs=-1,
            random_state=random_state,
        )
    if model_name == "XGBoost":
        return XGBClassifier(
            objective="multi:softprob",
            num_class=num_class,
            n_estimators=base_model.n_estimators,
            max_depth=base_model.max_depth,
            learning_rate=base_model.learning_rate,
            subsample=base_model.subsample,
            colsample_bytree=base_model.colsample_bytree,
            tree_method=base_model.tree_method,
            eval_metric=base_model.eval_metric,
            n_jobs=-1,
            random_state=random_state,
        )
    raise ValueError(f"Unknown model: {model_name}")


def _lc_predict_labels(model_name, model, X):
    """Predict class indices; XGBoost goes through argmax over predict_proba."""
    if model_name == "XGBoost":
        return np.argmax(model.predict_proba(X), axis=1)
    return model.predict(X)


def plot_learning_curve_sizes(
    model_name,
    base_model,
    use_raw,
    X_train_scaled,
    X_train_raw,
    y_train,
    X_val_scaled,
    X_val_raw,
    y_val,
    task_name,
    save_dir,
    random_state=RANDOM_SEED,
):
    """
    Build and save a simple learning curve (train vs. val macro F1).

    For each train-size fraction in [0.2, 0.4, 0.6, 0.8, 1.0]:
      1) draw a random subset of the training rows (without replacement),
      2) fit a fresh copy of the model on that subset,
      3) record macro F1 on the subset itself and on the validation set.
    Both curves go into one plot saved as
    `<save_dir>/lc_<task_name>_<model_name>.png`.

    Args:
      model_name: "KNN", "DecisionTree", "RandomForest" or "XGBoost".
      base_model: estimator whose hyperparameters are copied.
      use_raw: if True use the *_raw matrices, otherwise the *_scaled ones.
      X_train_scaled, X_train_raw, y_train: training features / labels.
      X_val_scaled, X_val_raw, y_val: validation features / labels.
      task_name: used in the plot title and the file name.
      save_dir: directory that receives the PNG.
      random_state: seeds both the subset sampling and the estimators, so
        the curve does not depend on the global NumPy RNG state.

    Raises:
      ValueError: if `model_name` is not one of the supported models.
    """
    train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
    train_scores = []
    val_scores = []

    # Pick the feature representation this model is trained on.
    X_full = X_train_raw if use_raw else X_train_scaled
    X_val = X_val_raw if use_raw else X_val_scaled

    n_samples = X_full.shape[0]
    # Loop-invariant: class count of the full training labels.
    num_class = len(np.unique(y_train))
    # Local generator so sampling is controlled by `random_state`.
    rng = np.random.RandomState(random_state)

    for frac in train_sizes:
        # Use at least 100 rows, but never more than exist (sampling is
        # without replacement and would fail otherwise).
        size = min(n_samples, max(100, int(n_samples * frac)))
        idx = rng.choice(n_samples, size=size, replace=False)

        X_sub = X_full[idx]
        y_sub = y_train[idx]

        model = _lc_new_estimator(model_name, base_model, num_class, random_state)
        model.fit(X_sub, y_sub)

        # Score on the training subset and on the validation set.
        _, _, _, f1_tr = compute_metrics(y_sub, _lc_predict_labels(model_name, model, X_sub))
        _, _, _, f1_val = compute_metrics(y_val, _lc_predict_labels(model_name, model, X_val))

        train_scores.append(f1_tr)
        val_scores.append(f1_val)

    # Plot
    plt.figure(figsize=(6, 4))
    plt.plot(train_sizes, train_scores, marker="o", label="Train F1 (subset)")
    plt.plot(train_sizes, val_scores, marker="o", label="Val F1")
    plt.xlabel("Train size fraction")
    plt.ylabel("Macro F1")
    plt.ylim(0.0, 1.05)
    plt.title(f"Learning Curve - {task_name} - {model_name}")
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    save_path = os.path.join(save_dir, f"lc_{task_name}_{model_name}.png")
    plt.savefig(save_path, dpi=200)
    plt.close()
    print(f"[OK] Saved learning curve → {save_path}")

In [32]:
# ------------------------------------------------------------
# [Cell 8] Train/Val/Test workflow with model saving
#
# For each task & model:
#   1) Split TRAIN into (train_sub, val)
#   2) Fit model on train_sub → eval on train_sub & val
#      - store metrics (set=train, val)
#      - save CM for val
#      - generate learning curve (train vs val)
#   3) Train or load FINAL model on FULL train → eval on test
#      - save model (.joblib)
#      - store metrics (set=test)
#      - save CM for test
# ------------------------------------------------------------

def _build_estimator(model_name, base_model, num_classes):
    """Return a fresh, unfitted estimator that copies base_model's hyperparameters."""
    if model_name == "KNN":
        return KNeighborsClassifier(
            n_neighbors=base_model.n_neighbors,
            weights=base_model.weights,
            n_jobs=-1,
        )
    if model_name == "DecisionTree":
        return DecisionTreeClassifier(
            max_depth=base_model.max_depth,
            min_samples_split=base_model.min_samples_split,
            min_samples_leaf=base_model.min_samples_leaf,
            random_state=RANDOM_SEED,
        )
    if model_name == "RandomForest":
        return RandomForestClassifier(
            n_estimators=base_model.n_estimators,
            max_depth=base_model.max_depth,
            min_samples_split=base_model.min_samples_split,
            min_samples_leaf=base_model.min_samples_leaf,
            n_jobs=-1,
            random_state=RANDOM_SEED,
        )
    if model_name == "XGBoost":
        return XGBClassifier(
            objective="multi:softprob",
            num_class=num_classes,
            n_estimators=base_model.n_estimators,
            max_depth=base_model.max_depth,
            learning_rate=base_model.learning_rate,
            subsample=base_model.subsample,
            colsample_bytree=base_model.colsample_bytree,
            tree_method=base_model.tree_method,
            eval_metric=base_model.eval_metric,
            n_jobs=-1,
            random_state=RANDOM_SEED,
        )
    raise ValueError(f"Unknown model: {model_name}")


def _predict_labels(model_name, model, X):
    """Predict class indices; XGBoost goes through argmax over predict_proba."""
    if model_name == "XGBoost":
        return np.argmax(model.predict_proba(X), axis=1)
    return model.predict(X)


def _metrics_row(task_name, model_name, set_name, scores):
    """Pack a (accuracy, precision, recall, f1) tuple into one summary record."""
    acc, prec, rec, f1 = scores
    return {
        "task": task_name,
        "model": model_name,
        "set": set_name,
        "accuracy": acc,
        "precision_macro": prec,
        "recall_macro": rec,
        "f1_macro": f1,
    }


models_template = get_models()
all_results = []
trained_models = {task: {} for task in tasks.keys()}

VAL_RATIO = 0.2  # train -> (train_sub 80%, val 20%)

# Row indices of the full training set; the same indices address both the
# scaled and the raw feature matrices.
n_train = X_train.shape[0]
indices = np.arange(n_train)

for task_name, tinfo in tasks.items():
    print(f"\n==================== Task: {task_name} ====================")

    y_train_all = tinfo["y_train"]
    y_test = tinfo["y_test"]
    class_names = tinfo["class_names"]
    num_classes = len(class_names)
    # Fix the confusion-matrix axes to every class so the matrix always
    # matches class_names, even if a class is missing from a split.
    # NOTE(review): assumes labels are the integers 0..num_classes-1, as the
    # argmax-based XGBoost predictions already imply — confirm.
    cm_labels = np.arange(num_classes)

    # --- Stratified train/val split indices, created per task ---
    train_idx, val_idx = train_test_split(
        indices,
        test_size=VAL_RATIO,
        random_state=RANDOM_SEED,
        stratify=y_train_all,
    )

    # scaled features
    X_tr_scaled = X_train[train_idx]
    X_val_scaled = X_train[val_idx]

    # raw features
    X_tr_raw = X_train_raw[train_idx]
    X_val_raw = X_train_raw[val_idx]

    y_tr = y_train_all[train_idx]
    y_val = y_train_all[val_idx]

    print(f"[INFO] Train_sub: {X_tr_scaled.shape[0]} / Val: {X_val_scaled.shape[0]}")

    for model_name, (base_model, use_raw) in models_template.items():
        print(f"\n[MODEL] {model_name} (use_raw_features={use_raw})")

        # ---------- 1) Fit on train_sub, evaluate on train_sub and val ----------
        Xtr_sub = X_tr_raw if use_raw else X_tr_scaled
        Xval = X_val_raw if use_raw else X_val_scaled

        model_sub = _build_estimator(model_name, base_model, num_classes)

        print("[TRAIN] Fitting on train_sub for train/val evaluation...")
        model_sub.fit(Xtr_sub, y_tr)

        # Train_sub performance
        y_tr_pred = _predict_labels(model_name, model_sub, Xtr_sub)
        all_results.append(
            _metrics_row(task_name, model_name, "train_sub", compute_metrics(y_tr, y_tr_pred))
        )

        # Val performance
        y_val_pred = _predict_labels(model_name, model_sub, Xval)
        val_scores = compute_metrics(y_val, y_val_pred)
        acc_v, prec_v, rec_v, f1_v = val_scores

        print(f"[VAL]  Acc={acc_v:.4f}  Prec={prec_v:.4f}  Rec={rec_v:.4f}  F1={f1_v:.4f}")

        all_results.append(_metrics_row(task_name, model_name, "val", val_scores))

        # Save the validation confusion matrix
        cm_val = confusion_matrix(y_val, y_val_pred, labels=cm_labels)
        cm_val_path = os.path.join(
            RESULTS_FIGURES_DIR,
            f"cm_val_{task_name}_{model_name}.png",
        )
        plot_confusion_matrix(
            cm_val,
            class_names,
            f"CM (Val) - {task_name} - {model_name}",
            cm_val_path,
        )

        # Save the learning curve
        plot_learning_curve_sizes(
            model_name,
            base_model,
            use_raw,
            X_tr_scaled,
            X_tr_raw,
            y_tr,
            X_val_scaled,
            X_val_raw,
            y_val,
            task_name,
            RESULTS_FIGURES_DIR,
            random_state=RANDOM_SEED,
        )

        # ---------- 2) Final model: fit on FULL train (or load), then evaluate on TEST ----------
        final_model_path = os.path.join(
            RESULTS_MODELS_DIR,
            f"{task_name}_{model_name}.joblib",
        )

        # features for full train / test
        Xtrain_full = X_train_raw if use_raw else X_train
        Xtest_full = X_test_raw if use_raw else X_test

        if USE_SAVED_MODELS and os.path.exists(final_model_path) and not FORCE_RETRAIN:
            # Reuse the previously trained final model.
            # NOTE: joblib.load unpickles the file; only load model files that
            # this notebook itself wrote. A saved model also keeps whatever
            # hyperparameters it was trained with.
            final_model = joblib.load(final_model_path)
            print(f"[LOAD] Loaded final model from {final_model_path}")
        else:
            # Train a new final model on train_sub + val, i.e. the whole train set.
            final_model = _build_estimator(model_name, base_model, num_classes)

            print("[TRAIN] Fitting FINAL model on FULL train set...")
            final_model.fit(Xtrain_full, y_train_all)
            joblib.dump(final_model, final_model_path)
            print(f"[SAVE] Saved final model → {final_model_path}")

        # Test set evaluation
        y_test_pred = _predict_labels(model_name, final_model, Xtest_full)
        test_scores = compute_metrics(y_test, y_test_pred)
        acc_te, prec_te, rec_te, f1_te = test_scores

        print(f"[TEST] Acc={acc_te:.4f}  Prec={prec_te:.4f}  Rec={rec_te:.4f}  F1={f1_te:.4f}")

        all_results.append(_metrics_row(task_name, model_name, "test", test_scores))

        # Test Confusion Matrix
        cm_test = confusion_matrix(y_test, y_test_pred, labels=cm_labels)
        cm_test_path = os.path.join(
            RESULTS_FIGURES_DIR,
            f"cm_test_{task_name}_{model_name}.png",
        )
        plot_confusion_matrix(
            cm_test,
            class_names,
            f"CM (Test) - {task_name} - {model_name}",
            cm_test_path,
        )

        # Keep the final model (used by the feature-importance cell)
        trained_models[task_name][model_name] = final_model

print("\n[OK] Train_sub/Val/Test evaluation completed for all models.")


[INFO] Train_sub: 53674 / Val: 13419

[MODEL] KNN (use_raw_features=False)
[TRAIN] Fitting on train_sub for train/val evaluation...
[VAL]  Acc=0.8116  Prec=0.8372  Rec=0.8084  F1=0.8140
[OK] Saved CM → /Users/jaehun_jung/colored-mnist-classification/results/figures/cm_val_digit_KNN.png
[OK] Saved learning curve → /Users/jaehun_jung/colored-mnist-classification/results/figures/lc_digit_KNN.png
[LOAD] Loaded final model from /Users/jaehun_jung/colored-mnist-classification/results/models/digit_KNN.joblib
[TEST] Acc=0.8219  Prec=0.8442  Rec=0.8191  F1=0.8241
[OK] Saved CM → /Users/jaehun_jung/colored-mnist-classification/results/figures/cm_test_digit_KNN.png

[MODEL] DecisionTree (use_raw_features=True)
[TRAIN] Fitting on train_sub for train/val evaluation...
[VAL]  Acc=0.4820  Prec=0.6170  Rec=0.4766  F1=0.5060
[OK] Saved CM → /Users/jaehun_jung/colored-mnist-classification/results/figures/cm_val_digit_DecisionTree.png
[OK] Saved learning curve → /Users/jaehun_jung/colored-mnist-classifi

KeyboardInterrupt: 

In [None]:
# ------------------------------------------------------------
# [Cell 9] Save metrics summary
#   - Includes train_sub / val / test rows
#   - 03_analysis_report.ipynb reads this file for analysis
# ------------------------------------------------------------

# With no collected rows the DataFrame has no columns and the sort below
# would fail with an opaque KeyError, so stop early with a clear message.
if not all_results:
    raise RuntimeError(
        "[ERROR] No metrics collected. Run the train/val/test workflow cell first."
    )

results_df = pd.DataFrame(all_results)

# Sort order: task -> set -> f1_macro (descending)
results_df = results_df.sort_values(
    by=["task", "set", "f1_macro"],
    ascending=[True, True, False],
)

summary_path = os.path.join(RESULTS_METRICS_DIR, "classical_ml_summary.csv")
results_df.to_csv(summary_path, index=False)

print("[OK] Saved full metrics summary →", summary_path)
display(results_df.head())

In [None]:
# ------------------------------------------------------------
# [Cell 10] Feature importance (RF & XGB, using final models)
#   - Visualizes importances of the final models that were used on the test set
# ------------------------------------------------------------

# plot_importance_map is called below but is not defined in any earlier cell
# of this notebook; provide a fallback only when no definition exists yet.
if "plot_importance_map" not in globals():

    def plot_importance_map(importances, title, save_path, image_shape=(28, 28, 3)):
        """Save per-feature importances as a 2-D heatmap image.

        NOTE(review): assumes each sample was flattened from an array of
        shape `image_shape` with the channel axis last (28*28*3 = 2352
        features) — confirm against 01_preprocessing_colored_mnist.ipynb.

        Raises:
          ValueError: if the number of importances does not match image_shape.
        """
        values = np.asarray(importances, dtype=float)
        expected = int(np.prod(image_shape))
        if values.size != expected:
            raise ValueError(
                f"Expected {expected} importances for image_shape={image_shape}, "
                f"got {values.size}"
            )
        # Sum over the last axis to get one importance value per pixel.
        heat = values.reshape(image_shape).sum(axis=-1)

        plt.figure(figsize=(4, 4))
        plt.imshow(heat, cmap="hot")
        plt.colorbar()
        plt.title(title)
        plt.axis("off")
        plt.tight_layout()
        plt.savefig(save_path, dpi=200)
        plt.close()
        print(f"[OK] Saved importance map → {save_path}")


for task_name in tasks:
    print(f"\n==================== Feature Importance: {task_name} ====================")

    # (key in trained_models, short tag used in the title and file name)
    for model_key, tag in (("RandomForest", "RF"), ("XGBoost", "XGB")):
        model = trained_models[task_name].get(model_key, None)
        if model is None or not hasattr(model, "feature_importances_"):
            print(f"[INFO] {model_key} not available or no feature_importances_.")
            continue

        plot_importance_map(
            model.feature_importances_,
            f"{tag} importance - {task_name}",
            os.path.join(RESULTS_FIGURES_DIR, f"fi_{task_name}_{tag.lower()}.png"),
        )

print("\n✅ Finished 02_train_classical_ml.ipynb")
print("   - Models: saved in results/models/")
print("   - Metrics: results/metrics/classical_ml_summary.csv")
print("   - Figures: results/figures/ (CM, LC, FI)")
print("   - Next: use 03_analysis_report.ipynb for narrative analysis.")