In [1]:
import inspect
from contextlib import contextmanager

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from joblib import parallel as joblib_parallel
from scipy.stats import permutation_test, ttest_rel, wilcoxon

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import (
    train_test_split, StratifiedShuffleSplit, GridSearchCV,
    RandomizedSearchCV, ParameterGrid, cross_validate
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, make_scorer
)

In [3]:
# Exploratory pass over the raw file; "?" entries are parsed as missing values.
df = pd.read_csv("forestCover.csv", na_values=["?"])

# --- 1. Shape ---
print("Shape:", df.shape)

# --- 2. Missing values (as a percentage of rows, only for columns that have any) ---
print("\nMissing values per column:")
na_counts = df.isna().sum()
print(100 *(na_counts[na_counts > 0] / df.shape[0]))

# --- 3. Cardinality (number of distinct values per column) ---
print("\nCardinality per column:")
card = df.nunique()
print(card.sort_values())

# --- 4. Correlation check (numeric only) ---
# Keep only the strict upper triangle so every column pair is listed once.
print("\nTop correlations:")
corr = df.corr(numeric_only=True).abs()
upper_triangle = np.triu(np.ones(corr.shape), k=1).astype(bool)
high_corr = corr.where(upper_triangle)
print(high_corr.stack().sort_values(ascending=False).head(10))

# --- 5. Columns that were parsed as object dtype (i.e. not purely numeric) ---
object_cols = [name for name in df.columns if df[name].dtype == "object"]
for col in object_cols:
    print(f"\nColumn '{col}' has non-numeric entries, e.g.:")
    print(df[col].value_counts().head())

# --- 6. Range differences ---
print("\nNumeric range differences:")
numeric_df = df.select_dtypes(include=[np.number])
for col, values in numeric_df.items():
    print(f"{col}: min={values.min()}, max={values.max()}, mean={values.mean()}")

# --- 7. Outlier detection (1.5 * IQR fences) ---
print("\nPotential outliers by column:")
for col, values in numeric_df.items():
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    below_fence = values < (q1 - 1.5 * iqr)
    above_fence = values > (q3 + 1.5 * iqr)
    outliers = (below_fence | above_fence).sum()
    if outliers > 0:
        print(f"{col}: {outliers} potential outliers")

# --- 8. Target distribution (the last column is treated as the target) ---
target = df.columns[-1]
print(f"\nTarget '{target}' distribution:")
print(df[target].value_counts(dropna=False, normalize=True))

Shape: (581012, 59)

Missing values per column:
Slope    0.05129
dtype: float64

Cardinality per column:
Observation_ID                             1
Soil_Type14                                2
Soil_Type9                                 2
Soil_Type10                                2
Soil_Type11                                2
Soil_Type12                                2
Soil_Type13                                2
Soil_Type15                                2
Soil_Type16                                2
Soil_Type17                                2
Soil_Type18                                2
Soil_Type19                                2
Soil_Type20                                2
Soil_Type21                                2
Soil_Type22                                2
Soil_Type23                                2
Soil_Type24                                2
Soil_Type25                                2
Soil_Type26                                2
Soil_Type27                             

In [5]:


def make_ohe_dense():
    """Build a dense-output ``OneHotEncoder`` that ignores unknown categories.

    The keyword that disables sparse output is chosen by inspecting the
    installed ``OneHotEncoder`` signature: ``sparse_output`` when it exists
    (sklearn >= 1.2), otherwise the older ``sparse`` keyword.
    """
    accepted = inspect.signature(OneHotEncoder).parameters
    dense_keyword = "sparse_output" if "sparse_output" in accepted else "sparse"
    return OneHotEncoder(handle_unknown="ignore", **{dense_keyword: False})

# ---------- Load data ----------
# Reads the raw CSV again (the exploration cell loads the same file); "?" is parsed as NaN.
df = pd.read_csv("forestCover.csv", na_values=["?"])

# The last column of the file is used as the classification target.
target_col = df.columns[-1]
y = df[target_col]

# ---------- Helper to resolve column names robustly ----------
def find_col(df, candidates):
    """Return the actual column name in ``df`` matching the first usable candidate.

    Names are compared case-insensitively and with spaces and underscores
    removed. Candidates are tried in order; ``None`` is returned when none of
    them matches a column. If several columns normalise to the same key, the
    last such column in ``df.columns`` is the one returned.
    """
    def _normalise(label):
        return label.lower().replace(" ", "").replace("_", "")

    lookup = {}
    for column in df.columns:
        lookup[_normalise(column)] = column

    matches = (lookup[_normalise(cand)] for cand in candidates if _normalise(cand) in lookup)
    return next(matches, None)

# Resolve the real column names; find_col returns None when no candidate matches,
# and such entries are filtered out below by the `if c in df.columns` checks.
COL_ID          = find_col(df, ["Observation ID", "ObservationID", "ObsID", "ID"])
COL_WATER       = find_col(df, ["Water Level", "WaterLevel"])
COL_FACET       = find_col(df, ["Facet"])
COL_ASPECT      = find_col(df, ["Aspect"])  # resolved for reference; not used further in this cell
COL_INCLINATION = find_col(df, ["Inclination"])

# ---------- kNN: apply ONLY the necessary transforms ----------
# Drop: ID (unique), Water Level (constant), Facet (redundant with Aspect), Inclination (noise-only)
to_drop_knn = [c for c in [COL_ID, COL_WATER, COL_FACET, COL_INCLINATION] if c in df.columns]
X_knn_base = df.drop(columns=[target_col] + to_drop_knn)

# Identify column types after drops: everything that is not numeric is treated as categorical.
numeric_cols_knn = X_knn_base.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols_knn = [c for c in X_knn_base.columns if c not in numeric_cols_knn]

# Numeric columns: median imputation then robust scaling.
# Categorical columns: most-frequent imputation then dense one-hot encoding.
knn_preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", RobustScaler())
        ]), numeric_cols_knn),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", make_ohe_dense())       # dense output so the result fits in a DataFrame
        ]), categorical_cols_knn)
    ],
    remainder="drop"
)

# NOTE(review): the preprocessor is fitted on ALL rows here, before the train/test split
# performed in the modelling cell. Imputation and scaling statistics therefore include
# the rows that later form the test set. Fitting on the training rows only (e.g. by
# putting the preprocessor in a Pipeline with the estimator) would avoid that leakage.
X_knn_ready = knn_preprocessor.fit_transform(X_knn_base)

# ---------- Classification Trees: ONLY necessary transforms ----------
# Drop: ID (unique), Water Level (constant). Keep Facet+Aspect (redundancy benign). Keep Inclination (noise tolerated).
to_drop_tree = [c for c in [COL_ID, COL_WATER] if c in df.columns]
X_tree_base = df.drop(columns=[target_col] + to_drop_tree)

numeric_cols_tree = X_tree_base.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols_tree = [c for c in X_tree_base.columns if c not in numeric_cols_tree]

# Same structure as the kNN preprocessor, but without scaling of the numeric columns.
tree_preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median"))
        ]), numeric_cols_tree),
        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", make_ohe_dense()) 
        ]), categorical_cols_tree)
    ],
    remainder="drop"
)

# NOTE(review): as for kNN, this fit uses all rows, including the future test rows.
X_tree_ready = tree_preprocessor.fit_transform(X_tree_base)

In [6]:
# Wrap the transformed arrays in DataFrames that carry the generated feature
# names and the row index of the corresponding input frame.
knn_feature_names = knn_preprocessor.get_feature_names_out()
X_knn_df = pd.DataFrame(
    data=X_knn_ready,
    index=X_knn_base.index,
    columns=knn_feature_names,
)
print("kNN processed DataFrame:", X_knn_df.head(), sep="\n")

tree_feature_names = tree_preprocessor.get_feature_names_out()
X_tree_df = pd.DataFrame(
    data=X_tree_ready,
    index=X_tree_base.index,
    columns=tree_feature_names,
)
print("\nTree processed DataFrame:", X_tree_df.head(), sep="\n")


kNN processed DataFrame:
   num__Elevation  num__Aspect  num__Slope  \
0       -0.259887     0.193069   -0.777778   
1       -1.333333     0.049505   -0.444444   
2        0.189266     1.138614   -0.444444   
3        0.096045     0.103960   -0.777778   
4       -0.635593    -0.014851   -0.222222   

   num__Horizontal_Distance_To_Hydrology  num__Vertical_Distance_To_Hydrology  \
0                               0.688406                             0.516129   
1                              -0.681159                            -0.451613   
2                               0.521739                            -0.241935   
3                               0.105072                             0.322581   
4                              -0.173913                            -0.177419   

   num__Horizontal_Distance_To_Roadways  num__Hillshade_9am  \
0                              0.513051            0.212121   
1                             -0.430693            0.515152   
2                     

In [54]:


@contextmanager
def tqdm_joblib(tqdm_object):
    """Temporarily route joblib batch completions into ``tqdm_object``.

    While the context is active, ``joblib_parallel.BatchCompletionCallBack`` is
    replaced by a subclass that advances the bar by the batch size before
    delegating to the original callback. On exit the original class is
    restored and the bar is closed, whether or not an exception occurred.

    Yields:
        The same ``tqdm_object`` that was passed in.
    """
    original_callback = joblib_parallel.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            # Advance first, then let joblib do its normal bookkeeping.
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib_parallel.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        joblib_parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()

# =========================
# 1) Configuration knobs
# =========================
RANDOM_SEED     = 42       # base seed; later steps use RANDOM_SEED + k for separate streams
TEST_SIZE       = 0.30     # fraction of rows held out as the final, untouched test split
TUNE_SIZE       = 30_000   # tuning subset size (drawn from TRAIN ONLY)
CV_FOLDS        = 10       # CV folds for both hyper-parameter searches and the later cross_validate calls
N_RUNS          = 10       # repeated independent runs for mean ± std (reassigned to 15 in the repeated-runs cell)
RUN_SUBSET_SIZE = 75_000   # per-run subset size (reassigned, same value, in the repeated-runs cell)

# Not referenced again in this cell; the repeated-runs cell creates its own generator under the same name.
rng_global = np.random.default_rng(RANDOM_SEED)

# ===========================================================
# 2) Common train/test split (works for BOTH feature spaces)
# ===========================================================
# Split positional row indices rather than a feature matrix, so the same
# stratified split can be applied to both the kNN and the tree feature frames.
X_idx = np.arange(len(y))
idx_train, idx_test, y_train, y_test = train_test_split(
    X_idx, y, test_size=TEST_SIZE, random_state=RANDOM_SEED, stratify=y
)

# idx_train / idx_test are positions, hence .iloc.
X_knn_train = X_knn_df.iloc[idx_train]
X_knn_test  = X_knn_df.iloc[idx_test]
X_tree_train= X_tree_df.iloc[idx_train]
X_tree_test = X_tree_df.iloc[idx_test]

# ===========================================================
# 3) Build a stratified tuning subset from TRAIN ONLY
# ===========================================================
if len(y_train) <= TUNE_SIZE:
    # Training set is already small enough: tune on all of it.
    tune_mask = np.arange(len(y_train))
else:
    # The "test" share of this split is the part that is discarded, so that the
    # "train" share (kept as tune_mask) has roughly TUNE_SIZE rows.
    sss_tune = StratifiedShuffleSplit(
        n_splits=1,
        test_size=(len(y_train) - TUNE_SIZE) / len(y_train),
        random_state=RANDOM_SEED + 1
    )
    (tune_mask, _), = sss_tune.split(X_knn_train, y_train)

# tune_mask holds positions within the training split (despite the name it is
# an index array, not a boolean mask), so it applies to both feature frames.
X_knn_tune  = X_knn_train.iloc[tune_mask]
X_tree_tune = X_tree_train.iloc[tune_mask]
y_tune      = y_train.iloc[tune_mask]

# 4) Scorer shared by both searches: weighted F1, with undefined cases scored as 0.
scorer = make_scorer(f1_score, average="weighted", zero_division=0)

# ============================================
# 5) kNN: conditional grid + progress + search
# ============================================
knn = KNeighborsClassifier(algorithm="brute")

# Settings common to every branch of the conditional grid.
_knn_shared_grid = {
    "n_neighbors": list(range(3, 14, 2)),
    "weights": ["uniform", "distance"],
}
knn_param_grid = [
    # minkowski branch (the only one that uses p)
    {**_knn_shared_grid, "metric": ["minkowski"], "p": [3]},
    # euclidean / manhattan branch (p is not searched)
    {**_knn_shared_grid, "metric": ["euclidean", "manhattan"]},
]

knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_param_grid,
    scoring=scorer,
    cv=CV_FOLDS,
    n_jobs=-1,
    verbose=0,
)

# Progress-bar total: one fit per candidate per fold.
n_candidates_knn = len(ParameterGrid(knn_param_grid))
total_fits_knn = CV_FOLDS * n_candidates_knn

with tqdm(total=total_fits_knn, desc="kNN GridSearch", unit="fit") as pbar, tqdm_joblib(pbar):
    knn_grid.fit(X_knn_tune, y_tune)

print("\nBest kNN params:", knn_grid.best_params_)
print("Best kNN CV (F1-weighted):", round(knn_grid.best_score_, 4))

# ===================================
# 6) Decision Tree: randomized search
# ===================================
tree = DecisionTreeClassifier(random_state=RANDOM_SEED)

# Discrete search space; the order of values within each list is kept as-is.
tree_param_dist = dict(
    criterion=["gini", "entropy"],
    max_depth=[None, 5, 8, 10, 12, 16, 20, 30, 40, 60, 80, 100],
    min_samples_split=[2, 5, 10, 20, 50],
    min_samples_leaf=[1, 2, 5, 10, 20, 100, 1000, 10000],
    max_features=[None, "sqrt", "log2"],
    ccp_alpha=np.concatenate(([0.0], np.logspace(-5, -1, 9))),
    class_weight=[None, "balanced"],
)

tree_rand = RandomizedSearchCV(
    estimator=tree,
    param_distributions=tree_param_dist,
    n_iter=120,
    scoring=scorer,
    cv=CV_FOLDS,
    random_state=RANDOM_SEED + 2,
    n_jobs=-1,
    verbose=0,
)

# Progress-bar total: one fit per sampled candidate per fold.
total_fits_tree = CV_FOLDS * tree_rand.n_iter

with tqdm(total=total_fits_tree, desc="Tree RandomSearch", unit="fit") as pbar, tqdm_joblib(pbar):
    tree_rand.fit(X_tree_tune, y_tune)

print("\nBest Tree params:", tree_rand.best_params_)
print("Best Tree CV (F1-weighted):", round(tree_rand.best_score_, 4))

# ==========================================================
# 7) Refit best models on ALL TRAIN data (no test leakage)
# ==========================================================
# Rebuild both estimators from the winning hyper-parameters and fit them on the
# full training split. For kNN, "algorithm" is always forced to "brute",
# overriding any such key in the search result.
_best_knn_settings = {**knn_grid.best_params_, "algorithm": "brute"}
best_knn = KNeighborsClassifier(**_best_knn_settings)
best_knn.fit(X_knn_train, y_train)

_best_tree_settings = dict(tree_rand.best_params_)
best_tree = DecisionTreeClassifier(random_state=RANDOM_SEED, **_best_tree_settings)
best_tree.fit(X_tree_train, y_train)

# ======================================
# 8) Final evaluation on the TEST split
# ======================================
def evaluate(name, model, Xte, yte):
    """Predict on ``Xte`` and collect headline metrics against ``yte``.

    Args:
        name: Label stored under ``"name"`` in the result.
        model: Fitted estimator exposing ``predict``.
        Xte: Feature matrix to predict on.
        yte: True labels for ``Xte``.

    Returns:
        dict with keys ``name``, ``accuracy``, ``precision``, ``recall``,
        ``f1`` (the last three weighted-averaged, with ``zero_division=0``)
        and ``yhat`` (the raw predictions).
    """
    predictions = model.predict(Xte)
    weighted = {"average": "weighted", "zero_division": 0}

    outcome = {"name": name}
    outcome["accuracy"] = accuracy_score(yte, predictions)
    outcome["precision"] = precision_score(yte, predictions, **weighted)
    outcome["recall"] = recall_score(yte, predictions, **weighted)
    outcome["f1"] = f1_score(yte, predictions, **weighted)
    outcome["yhat"] = predictions
    return outcome

final_knn = evaluate("kNN (final)", best_knn, X_knn_test, y_test)
final_tree = evaluate("Decision Tree (final)", best_tree, X_tree_test, y_test)

print("\n== Final test (common split) ==")
for res in (final_knn, final_tree):
    print(f"{res['name']}: Acc={res['accuracy']:.4f}, Prec={res['precision']:.4f}, Rec={res['recall']:.4f}, F1={res['f1']:.4f}")

# Confusion matrices first, then the per-class reports, kNN before Tree each time.
_labelled_results = (("kNN", final_knn), ("Tree", final_tree))

for _label, _outcome in _labelled_results:
    print(f"\nConfusion matrix ({_label}):")
    print(confusion_matrix(y_test, _outcome["yhat"]))

for _label, _outcome in _labelled_results:
    print(f"\nPer-class report ({_label}):")
    print(classification_report(y_test, _outcome["yhat"], zero_division=0))

# ===========================================================
# 9) Cross-validation for mean ± std (no leakage)
# ===========================================================
from sklearn.model_selection import cross_validate

# Metric name -> scorer; precision/recall/F1 are weighted-averaged with zero_division=0.
_weighted_kwargs = {"average": "weighted", "zero_division": 0}
scorers = {"accuracy": make_scorer(accuracy_score)}
scorers["precision"] = make_scorer(precision_score, **_weighted_kwargs)
scorers["recall"] = make_scorer(recall_score, **_weighted_kwargs)
scorers["f1"] = make_scorer(f1_score, **_weighted_kwargs)

# Identical cross-validation settings for both models.
_cv_options = dict(cv=CV_FOLDS, scoring=scorers, n_jobs=-1, return_train_score=False)

# kNN on its own feature frame
cv_knn = cross_validate(best_knn, X_knn_train, y_train, **_cv_options)

# Decision tree on its own feature frame
cv_tree = cross_validate(best_tree, X_tree_train, y_train, **_cv_options)

def summarize_cv(cv_res, name):
    """Print the per-column mean and standard deviation of cross-validation results.

    Args:
        cv_res: Mapping of column name to per-fold values (one entry per
            fold), such as the dict returned by ``cross_validate``.
        name: Model label used in the printed header.

    The fold count in the header is taken from ``cv_res`` itself, so the
    summary stays correct for results produced with any number of splits.
    """
    fold_table = pd.DataFrame(cv_res)
    mean_std = fold_table.agg(["mean", "std"]).round(4).T
    print(f"\n=== {name} CV Results (mean ± std across {len(fold_table)} folds) ===")
    print(mean_std)

# Print the fold-level mean/std tables for the two tuned models.
summarize_cv(cv_knn, "kNN")
summarize_cv(cv_tree, "Decision Tree")

# ------------------------------------------
# 10) Paired significance tests on CV folds
# ------------------------------------------
def paired_tests(cv_a, cv_b, metric_key):
    """Compare two models fold-by-fold on one metric and print the test results.

    Args:
        cv_a: ``cross_validate``-style result mapping for the first model.
        cv_b: Result mapping for the second model; must contain the same
            number of folds as ``cv_a``.
        metric_key: Metric name without the ``test_`` prefix (e.g. ``"f1"``).

    Prints the mean and sample standard deviation of the fold-wise difference
    (first minus second), a paired t-test, and a Wilcoxon signed-rank test.
    The Wilcoxon test is reported as skipped when scipy rejects the input
    (e.g. every difference is zero).

    Raises:
        KeyError: if ``test_<metric_key>`` is missing from either mapping.
        ValueError: if the two score vectors differ in shape.
    """
    a = np.asarray(cv_a[f"test_{metric_key}"], dtype=float)
    b = np.asarray(cv_b[f"test_{metric_key}"], dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            f"paired tests need equally many folds, got {a.shape} and {b.shape}"
        )
    diff = a - b
    # Report the fold count of the data actually compared, not a global setting.
    n_folds = diff.size

    t = ttest_rel(a, b)
    try:
        w = wilcoxon(a, b, zero_method="wilcox")
        w_str = f"W={w.statistic:.3f}, p={w.pvalue:.3e}"
    except ValueError:
        w_str = "Wilcoxon skipped (all diffs zero)"

    print(f"\n=== Paired tests across {n_folds} CV folds ({metric_key}) ===")
    print(f"Observed mean diff (kNN - Tree): {diff.mean():.4f} ± {diff.std(ddof=1):.4f}")
    print(f"Paired t-test: t={t.statistic:.3f}, p={t.pvalue:.3e}")
    print(f"Wilcoxon: {w_str}")

# Fold-wise kNN vs. tree comparison on accuracy and weighted F1.
# Both result sets come from cross_validate with cv=CV_FOLDS on the same y_train;
# presumably this yields identical fold assignments for both models — verify.
paired_tests(cv_knn, cv_tree, "accuracy")
paired_tests(cv_knn, cv_tree, "f1")
# ==========================================
# 11) Paired tests across runs (kNN vs Tree)
# ==========================================
# NOTE(review): this section reads `df_results`, which is only created by the
# repeated-runs cell further down the notebook. On a fresh kernel executed top to
# bottom, `df_results` does not exist yet at this point, so this code should run
# after that cell.
from scipy.stats import ttest_rel, wilcoxon

def paired_vec(metric):
    """Return (kNN, Tree) arrays of `metric` from `df_results`, each ordered by run."""
    a = df_results.query("model=='kNN'").sort_values("run")[metric].to_numpy()
    b = df_results.query("model=='Tree'").sort_values("run")[metric].to_numpy()
    return a, b

# Same pair of tests as for the CV folds, applied to the per-run scores.
for metric in ["accuracy", "f1"]:
    a, b = paired_vec(metric)
    diff = a - b
    t = ttest_rel(a, b)
    try:
        w = wilcoxon(a, b, zero_method="wilcox")
        w_str = f"W={w.statistic:.3f}, p={w.pvalue:.3e}"
    except ValueError:
        # Reported as skipped instead of aborting the loop.
        w_str = "Wilcoxon skipped (all diffs zero)"
    print(f"\n=== Paired tests across runs ({metric}) ===")
    print(f"Observed mean diff (kNN - Tree): {diff.mean():.4f} ± {diff.std(ddof=1):.4f}")
    print(f"Paired t-test: t={t.statistic:.3f}, p={t.pvalue:.3e}")
    print(f"Wilcoxon: {w_str}")

# =====================================================
# 12) McNemar, permutation, and bootstrap on final test
# =====================================================
# McNemar on the COMMON final test split
# The prediction vectors are bound OUTSIDE the try block: the permutation and
# bootstrap steps below read them, so they must exist even when the McNemar
# step is skipped (e.g. statsmodels is not installed).
y_pred_knn_final  = final_knn["yhat"]
y_pred_tree_final = final_tree["yhat"]

try:
    from statsmodels.stats.contingency_tables import mcnemar

    # 1 where the model predicted the true label, 0 otherwise.
    correct_knn  = (y_pred_knn_final == y_test).astype(int)
    correct_tree = (y_pred_tree_final == y_test).astype(int)

    b01 = np.sum((correct_knn == 0) & (correct_tree == 1))  # knn wrong, tree right
    b10 = np.sum((correct_knn == 1) & (correct_tree == 0))  # knn right, tree wrong
    both_wrong = np.sum((correct_knn == 0) & (correct_tree == 0))
    both_right = np.sum((correct_knn == 1) & (correct_tree == 1))

    # Rows: kNN wrong/right; columns: tree wrong/right. The test itself is
    # driven by the discordant (off-diagonal) counts; the diagonal is filled in
    # so that `table` is the actual contingency table.
    table = [[both_wrong, b01],
             [b10, both_right]]
    mc = mcnemar(table, exact=True)
    print("\n=== McNemar’s test (final test split) ===")
    print(f"b01={b01}, b10={b10}, statistic={mc.statistic}, p-value={mc.pvalue:.3e}")
except Exception as e:
    # Deliberately best-effort: report the problem and let the notebook continue.
    print("\n[McNemar skipped] Install statsmodels or check inputs:", e)

# Permutation test (accuracy diff) on final test split
from scipy.stats import permutation_test
acc_knn  = final_knn["accuracy"]
acc_tree = final_tree["accuracy"]

def acc_diff(x, y):
    """Accuracy of predictions ``x`` minus accuracy of predictions ``y`` on ``y_test``.

    Both vectors must be in the same row order as ``y_test``.
    """
    return accuracy_score(y_test, x) - accuracy_score(y_test, y)

# The two prediction vectors are PAIRED: element i of each belongs to test row i.
# permutation_type="samples" exchanges predictions between the two models row by
# row while keeping every prediction at its own row. The default ("independent")
# pools and reshuffles all predictions, which breaks their alignment with y_test
# and makes the accuracies inside acc_diff meaningless.
# `batch` limits how many resamples are materialised at once (memory is O(batch * n)).
res = permutation_test(
    (np.asarray(final_knn["yhat"]), np.asarray(final_tree["yhat"])),
    acc_diff,
    permutation_type="samples",
    vectorized=False,
    n_resamples=2000,
    batch=50,
    alternative="two-sided",
    random_state=RANDOM_SEED + 3,
)
print("\n=== Permutation test (accuracy, final test split) ===")
print(f"Observed diff: {acc_knn - acc_tree:.4f}")
print(f"p-value: {res.pvalue:.3e}")

# Bootstrap CI for accuracy difference (final test split)
rng_boot = np.random.default_rng(RANDOM_SEED + 4)
B = 5000
n = len(y_test)
y_test_np = y_test.to_numpy() if hasattr(y_test, "to_numpy") else np.asarray(y_test)
# Read the predictions straight from the evaluation results so this step does
# not depend on names created inside the McNemar try block.
y_knn_np  = np.asarray(final_knn["yhat"])
y_tree_np = np.asarray(final_tree["yhat"])

# Per-row correctness is loop-invariant: compute it once, then each bootstrap
# replicate only needs the mean of the resampled indicators (which is exactly
# the accuracy on that resample).
knn_hit  = y_knn_np == y_test_np
tree_hit = y_tree_np == y_test_np

acc_diffs = []
for _ in range(B):
    # Resample test rows with replacement; the same rows are used for both models.
    idx = rng_boot.integers(0, n, size=n)
    acc_diffs.append(knn_hit[idx].mean() - tree_hit[idx].mean())

ci_low, ci_high = np.percentile(acc_diffs, [2.5, 97.5])
print("\n=== Bootstrap 95% CI for accuracy diff (kNN - Tree, final test split) ===")
print(f"[{ci_low:.4f}, {ci_high:.4f}]")

# # ======================================
# # 13) (Optional) Inspect top kNN configs
# # ======================================
# knn_cv = pd.DataFrame(knn_grid.cv_results_).sort_values("mean_test_score", ascending=False)
# cols_knn = ["mean_test_score","std_test_score","param_n_neighbors","param_weights","param_metric","param_p"]
# print("\nTop kNN configs:\n", knn_cv[[c for c in cols_knn if c in knn_cv.columns]].head(10))

# # And top tree configs (if desired)
# tree_cv = pd.DataFrame(tree_rand.cv_results_).sort_values("mean_test_score", ascending=False)
# cols_tree = ["mean_test_score","std_test_score","param_criterion","param_max_depth","param_min_samples_split",
#              "param_min_samples_leaf","param_max_features","param_ccp_alpha","param_class_weight"]
# print("\nTop Tree configs:\n", tree_cv[cols_tree].head(10))


kNN GridSearch: 100%|██████████| 360/360 [12:58<00:00,  2.16s/fit]  



Best kNN params: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
Best kNN CV (F1-weighted): 0.8306


Tree RandomSearch: 100%|██████████| 1200/1200 [00:33<00:00, 35.98fit/s]



Best Tree params: {'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': 20, 'criterion': 'entropy', 'class_weight': None, 'ccp_alpha': 1e-05}
Best Tree CV (F1-weighted): 0.7855

== Final test (common split) ==
kNN (final): Acc=0.9349, Prec=0.9349, Rec=0.9349, F1=0.9349
Decision Tree (final): Acc=0.9139, Prec=0.9138, Rec=0.9139, F1=0.9138

Confusion matrix (kNN):
[[59371  3806     4     0    57    15   299]
 [ 3536 80614   232     2   305   259    43]
 [    1   231  9829    80    18   567     0]
 [    0     1   118   657     0    48     0]
 [   53   467    30     0  2274    22     2]
 [   16   232   524    34    13  4391     0]
 [  284    43     0     0     0     0  5826]]

Confusion matrix (Tree):
[[57861  5346     3     0    38     5   299]
 [ 5675 78565   219     0   327   165    40]
 [    3   312  9885    91    20   415     0]
 [    0     0    92   694     0    38     0]
 [   54   545    33     0  2193    20     3]
 [   20   226   463    43    17  4441 

In [None]:
# Evaluating performace of k=5, k=7
# Evaluating the performance of k=5 and k=7; all other settings are shared.
_extra_knn_settings = dict(algorithm="brute", weights="distance", metric="minkowski", p=1)
knn_k5 = KNeighborsClassifier(n_neighbors=5, **_extra_knn_settings)
knn_k7 = KNeighborsClassifier(n_neighbors=7, **_extra_knn_settings)

for _extra_model in (knn_k5, knn_k7):
    _extra_model.fit(X_knn_train, y_train)

# Score both on the common final test split.
final_knn_k5 = evaluate("kNN (k=5)", knn_k5, X_knn_test, y_test)
final_knn_k7 = evaluate("kNN (k=7)", knn_k7, X_knn_test, y_test)

print("\n== Final test (common split, extra k values) ==")
for res in (final_knn, final_knn_k5, final_knn_k7, final_tree):
    print(f"{res['name']}: Acc={res['accuracy']:.4f}, "
          f"Prec={res['precision']:.4f}, "
          f"Rec={res['recall']:.4f}, "
          f"F1={res['f1']:.4f}")

# Cross-validate both variants on the training split with the shared scorers.
_extra_cv_settings = dict(cv=CV_FOLDS, scoring=scorers, n_jobs=-1)
cv_knn_k5 = cross_validate(knn_k5, X_knn_train, y_train, **_extra_cv_settings)
cv_knn_k7 = cross_validate(knn_k7, X_knn_train, y_train, **_extra_cv_settings)


In [52]:
def summarize_cv(cv_res, name):
    """Print the per-column mean and standard deviation of cross-validation results.

    Args:
        cv_res: Mapping of column name to per-fold values (one entry per
            fold), such as the dict returned by ``cross_validate``.
        name: Model label used in the printed header.

    The fold count in the header is taken from ``cv_res`` itself, so the
    summary stays correct for results produced with any number of splits.
    """
    fold_table = pd.DataFrame(cv_res)
    mean_std = fold_table.agg(["mean", "std"]).round(4).T
    print(f"\n=== {name} CV Results (mean ± std across {len(fold_table)} folds) ===")
    print(mean_std)

# Fold-level mean/std tables for the two extra kNN settings (k=5 and k=7).
summarize_cv(cv_knn_k5, "kNN (k=5)")
summarize_cv(cv_knn_k7, "kNN (k=7)")


=== kNN (k=5) CV Results (mean ± std across 10 folds) ===
                    mean     std
fit_time          0.5009  0.0155
score_time      791.6071  0.5828
test_accuracy     0.9324  0.0012
test_precision    0.9323  0.0012
test_recall       0.9324  0.0012
test_f1           0.9323  0.0012

=== kNN (k=7) CV Results (mean ± std across 10 folds) ===
                    mean     std
fit_time          0.5312  0.0214
score_time      790.5449  0.5458
test_accuracy     0.9312  0.0014
test_precision    0.9311  0.0015
test_recall       0.9312  0.0014
test_f1           0.9310  0.0014


In [62]:
# These reassign constants of the same names from the configuration block of
# the modelling cell (N_RUNS was 10 there; the other two keep their values).
N_RUNS = 15
RUN_SUBSET_SIZE = 75_000  # number of samples per run
TEST_SIZE = 0.30

# One generator shared by all runs: each run's subset depends on the draws made before it.
rng_global = np.random.default_rng(42)

results = []            # one record per (run, model) with the global metrics
per_class_results = []  # one record per (run, model, class)

for run in range(1, N_RUNS + 1):
    # Draw the run's rows without replacement from ALL rows.
    # NOTE(review): the pool is the full dataset, so a run can contain rows from
    # the held-out test split and from the tuning subset used to choose the
    # hyper-parameters — confirm this is intended.
    subset_idx = rng_global.choice(len(y), size=RUN_SUBSET_SIZE, replace=False)
    X_knn_sub, X_tree_sub, y_sub = (
        X_knn_df.iloc[subset_idx],
        X_tree_df.iloc[subset_idx],
        y.iloc[subset_idx]
    )

    # Stratified train/test split inside the subset, seeded by the run number.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SIZE, random_state=run)
    (tr, te), = sss.split(X_knn_sub, y_sub)

    # tr / te are positions within the subset, so they apply to both feature frames.
    X_knn_tr, X_knn_te = X_knn_sub.iloc[tr], X_knn_sub.iloc[te]
    X_tree_tr, X_tree_te = X_tree_sub.iloc[tr], X_tree_sub.iloc[te]
    y_tr, y_te = y_sub.iloc[tr], y_sub.iloc[te]

    # Fresh estimators per run, using the hyper-parameters found by the earlier searches.
    knn_run  = KNeighborsClassifier(algorithm="brute", **{k: v for k, v in knn_grid.best_params_.items() if k != "algorithm"})
    tree_run = DecisionTreeClassifier(random_state=run, **tree_rand.best_params_)

    knn_run.fit(X_knn_tr, y_tr);  y_knn = knn_run.predict(X_knn_te)
    tree_run.fit(X_tree_tr, y_tr); y_tree = tree_run.predict(X_tree_te)

    for model, y_pred in [("kNN", y_knn), ("Tree", y_tree)]:
        results.append({
            "run": run,
            "model": model,
            "accuracy":  accuracy_score(y_te, y_pred),
            "precision": precision_score(y_te, y_pred, average="weighted", zero_division=0),
            "recall":    recall_score(y_te, y_pred, average="weighted", zero_division=0),
            "f1":        f1_score(y_te, y_pred, average="weighted", zero_division=0),
        })

        # Per-class rows: every report entry except the three aggregate entries.
        report = classification_report(y_te, y_pred, zero_division=0, output_dict=True)
        for label, vals in report.items():
            if label not in ["accuracy", "macro avg", "weighted avg"]:
                per_class_results.append({
                    "run": run,
                    "model": model,
                    "class": label,
                    "precision": vals["precision"],
                    "recall": vals["recall"],
                    "f1": vals["f1-score"],
                    "support": vals["support"]
                })

# df_results is also read by the "paired tests across runs" section of the modelling cell.
df_results = pd.DataFrame(results)
df_per_class = pd.DataFrame(per_class_results)

# --- Aggregate Mean ± Std
_summary_stats = ["mean", "std"]

# Global metrics: one row per model, (metric, statistic) column pairs.
_runs_by_model = df_results.groupby("model")
summary_global = (
    _runs_by_model[["accuracy","precision","recall","f1"]]
    .agg(_summary_stats)
    .round(4)
)
print("\n=== Global Mean ± Std across runs ===", summary_global, sep="\n")

# Per-class metrics: one row per (model, class).
_runs_by_model_class = df_per_class.groupby(["model","class"])
summary_per_class = (
    _runs_by_model_class[["precision","recall","f1"]]
    .agg(_summary_stats)
    .round(4)
)
print("\n=== Per-class Mean ± Std across runs ===", summary_per_class, sep="\n")


=== Global Mean ± Std across runs ===
      accuracy         precision          recall              f1        
          mean     std      mean     std    mean     std    mean     std
model                                                                   
Tree    0.8229  0.0043    0.8226  0.0044  0.8229  0.0043  0.8225  0.0044
kNN     0.8636  0.0023    0.8630  0.0023  0.8636  0.0023  0.8631  0.0023

=== Per-class Mean ± Std across runs ===
            precision          recall              f1        
                 mean     std    mean     std    mean     std
model class                                                  
Tree  1        0.8044  0.0072  0.8306  0.0102  0.8173  0.0060
      2        0.8511  0.0068  0.8421  0.0068  0.8465  0.0040
      3        0.8142  0.0131  0.8233  0.0155  0.8186  0.0106
      4        0.7122  0.0466  0.6732  0.0619  0.6908  0.0467
      5        0.6187  0.0287  0.5267  0.0324  0.5685  0.0263
      6        0.6872  0.0183  0.6320  0.0266  0.6581  0.0