<a href="https://colab.research.google.com/github/JoshuaGottlieb/SHAP-Feature-Selection/blob/martin-branch/Copy_of_Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations and Loading Phase

In [9]:
# Phase 1, Step 1: Install and Import Libraries

# Install external libs (only once in Colab)
!pip -q install xgboost shap

# Core libraries
import numpy as np
import pandas as pd
import warnings

# Visualization + explainability
import shap

# Machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier

# Preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
import time, pickle
from pathlib import Path

# Gradient boosting
import xgboost as xgb

# Misc
from itertools import product
from scipy import sparse

# Turn off warnings for cleaner output
warnings.filterwarnings("ignore")

print("Libraries installed and imported")


Libraries installed and imported


In [2]:
# Phase 1, Step 2a: Upload files manually from your computer ===
from google.colab import files
uploaded = files.upload()

# Verify uploaded files
print("Uploaded files:", list(uploaded.keys()))


Saving uci_android_permissions.csv.gz to uci_android_permissions.csv.gz
Saving uci_indian_liver.csv.gz to uci_indian_liver.csv.gz
Saving uci_mushroom.csv.gz to uci_mushroom.csv.gz
Saving uci_phishing_url.csv.gz to uci_phishing_url.csv.gz
Saving uci_secondary_mushroom.csv.gz to uci_secondary_mushroom.csv.gz
Uploaded files: ['uci_android_permissions.csv.gz', 'uci_indian_liver.csv.gz', 'uci_mushroom.csv.gz', 'uci_phishing_url.csv.gz', 'uci_secondary_mushroom.csv.gz']


In [3]:
# Phase 1, Step 2b: Load the CSV datasets

# File names (make sure these are in your Colab working directory or Google Drive mount)
FILES = [
    "uci_android_permissions.csv.gz",
    "uci_indian_liver.csv.gz",
    "uci_mushroom.csv.gz",
    "uci_phishing_url.csv.gz",
    "uci_secondary_mushroom.csv.gz",
]

# Dictionary to hold raw dataframes
raw_datasets = {}

# Load each dataset
for file in FILES:
    try:
        df = pd.read_csv(file, low_memory=False)
        raw_datasets[file] = df
        print(f"Loaded {file}: shape = {df.shape}")
    except Exception as e:
        print(f"Error loading {file}: {e}")

print("\n All datasets attempted. Check above for shapes/errors.")


Loaded uci_android_permissions.csv.gz: shape = (29332, 87)
Loaded uci_indian_liver.csv.gz: shape = (583, 11)
Loaded uci_mushroom.csv.gz: shape = (8124, 24)
Loaded uci_phishing_url.csv.gz: shape = (235795, 56)
Loaded uci_secondary_mushroom.csv.gz: shape = (61069, 21)

 All datasets attempted. Check above for shapes/errors.


## Checking for missing values

In [4]:
# Phase 1, Step 3: Inspect missing values in each dataset

def check_missing(df):
    """Return columns with missing values and counts."""
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    return missing

for name, df in raw_datasets.items():
    print(f"\n===== {name} =====")
    print(f"Shape: {df.shape}")
    missing = check_missing(df)
    if missing.empty:
        print("No missing values")
    else:
        print("Missing values per column:")
        print(missing)



===== uci_android_permissions.csv.gz =====
Shape: (29332, 87)
No missing values

===== uci_indian_liver.csv.gz =====
Shape: (583, 11)
Missing values per column:
almumin_globulin_ratio    4
dtype: int64

===== uci_mushroom.csv.gz =====
Shape: (8124, 24)
No missing values

===== uci_phishing_url.csv.gz =====
Shape: (235795, 56)
No missing values

===== uci_secondary_mushroom.csv.gz =====
Shape: (61069, 21)
Missing values per column:
veil_type            57892
spore_print_color    54715
veil_color           53656
stem_root            51538
stem_surface         38124
gill_spacing         25063
cap_surface          14120
gill_attachment       9884
ring_type             2471
dtype: int64


## Cleaning phase

In [5]:
# Phase 1, Step 4: Clean datasets (drop hi-missing cols, then impute)

# policy knobs
HIGH_MISS_THRESHOLD = 0.60  # drop columns whose NaN rate > 60%
DROP_COLS = {
    "uci_phishing_url.csv.gz": ["url", "title"],   # high-cardinality free text
    # add others if needed
}

cleaned_datasets = {}

def clean_one(name, df):
    dfc = df.copy()
    # 0) optional per-dataset drops
    for c in DROP_COLS.get(name, []):
        if c in dfc.columns:
            dfc.drop(columns=c, inplace=True)

    # 1) drop columns with too many missing
    miss_frac = dfc.isna().mean()
    to_drop = miss_frac[miss_frac > HIGH_MISS_THRESHOLD].index.tolist()
    if to_drop:
        dfc.drop(columns=to_drop, inplace=True)

    # 2) impute remaining NaNs: numericâ†’median, categoricalâ†’mode
    na_cols = [c for c in dfc.columns if dfc[c].isna().any()]
    for c in na_cols:
        if dfc[c].dtype == "object":
            mode_val = dfc[c].mode(dropna=True)
            fill_val = mode_val.iloc[0] if not mode_val.empty else ""
        else:
            fill_val = dfc[c].median()
        dfc[c] = dfc[c].fillna(fill_val)

    # 3) report + return
    print(f"\n===== {name} =====")
    print(f"Dropped high-missing cols (> {int(HIGH_MISS_THRESHOLD*100)}% NA): {to_drop if to_drop else 'none'}")
    print(f"Shape after clean: {dfc.shape}")
    rem = dfc.isna().sum()
    rem = rem[rem > 0]
    print("Remaining missing values:", int(rem.sum()))
    return dfc

for name, df in raw_datasets.items():
    cleaned_datasets[name] = clean_one(name, df)

# quick final check
print("\n Cleaning complete. Summary:")
for name, df in cleaned_datasets.items():
    print(f"{name}: shape={df.shape}, remaining_NA={int(df.isna().sum().sum())}")



===== uci_android_permissions.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (29332, 87)
Remaining missing values: 0

===== uci_indian_liver.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (583, 11)
Remaining missing values: 0

===== uci_mushroom.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (8124, 24)
Remaining missing values: 0

===== uci_phishing_url.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (235795, 54)
Remaining missing values: 0

===== uci_secondary_mushroom.csv.gz =====
Dropped high-missing cols (> 60% NA): ['stem_root', 'stem_surface', 'veil_type', 'veil_color', 'spore_print_color']
Shape after clean: (61069, 16)
Remaining missing values: 0

 Cleaning complete. Summary:
uci_android_permissions.csv.gz: shape=(29332, 87), remaining_NA=0
uci_indian_liver.csv.gz: shape=(583, 11), remaining_NA=0
uci_mushroom.csv.gz: shape=(8124, 24), remaining_NA=0
uci_phishing_url.c

In [6]:
# === Phase 1.5: dataset config (needed for Phase 2 and later) ===
TARGETS = {
    "uci_android_permissions.csv.gz": "result",
    "uci_indian_liver.csv.gz":        "has_liver_disease",
    "uci_mushroom.csv.gz":            "poisonous",
    "uci_phishing_url.csv.gz":        "label",
    "uci_secondary_mushroom.csv.gz":  "class",
}

DROP_COLS = {
    "uci_phishing_url.csv.gz": ["url", "title"],  # high-cardinality text cols, dropped in cleaning
}


In [7]:
# Phase 2 â€” Step 2 single split + smart preprocessing per dataset
try:
    from sklearn.preprocessing import TargetEncoder as SK_TargetEncoder  # sklearn â‰¥ 1.5
    _TargetEncoderClass = SK_TargetEncoder
    _te_is_sklearn = True
except Exception:
    _te_is_sklearn = False
    try:
        import category_encoders as ce  # pip install category_encoders (once)
        _TargetEncoderClass = ce.TargetEncoder
    except Exception as e:
        raise RuntimeError(
            "TargetEncoder not found. Install scikit-learn>=1.5 or `pip install category_encoders`."
        )

def _split_once(df, target, drops):
    df = df.drop(columns=[c for c in (drops or []) if c in df.columns], errors="ignore").copy()
    y = df[target]
    X = df.drop(columns=[target])
    if y.dtype == "object":
        uniq = sorted(y.dropna().unique().tolist())
        if len(uniq) == 2:
            y = y.map({uniq[0]: 0, uniq[1]: 1})
    return X, y.astype(int)

def _column_buckets(X):
    num = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    cat = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
    return num, cat

def _categorical_groups(X_train, cat_cols):
    bins, lows, highs = [], [], []
    for c in cat_cols:
        n = X_train[c].dropna().nunique()
        if n <= 2:
            bins.append(c)          # binary â†’ label encode (0/1)
        elif n <= 10:
            lows.append(c)          # low-card â†’ one-hot
        else:
            highs.append(c)         # high-card â†’ target encode
    return bins, lows, highs

def _fit_binary_maps(X_train, bin_cols):
    maps = {}
    for c in bin_cols:
        uniq = list(pd.Series(X_train[c]).dropna().unique())
        if len(uniq) <= 1:
            maps[c] = {uniq[0]: 0} if len(uniq) == 1 else {}
        else:
            uniq_sorted = sorted(uniq, key=lambda x: str(x))
            maps[c] = {uniq_sorted[0]: 0, uniq_sorted[1]: 1}
    return maps

def _apply_binary_maps(X, maps):
    X2 = X.copy()
    for c, m in maps.items():
        X2[c] = X2[c].map(m).fillna(-1)  # unseen â†’ -1
    return X2

def _build_preprocessor_for_train(Xtr, ytr, num_cols, bin_cols, low_cols, high_cols):
    transformers = []
    if num_cols:
        transformers.append(("num", StandardScaler(with_mean=False), num_cols))
    if bin_cols:
        transformers.append(("bin", "passthrough", bin_cols))
    if low_cols:
        # FIX: use sparse_output (new sklearn); returns dense arrays for downstream
        transformers.append(("low", OneHotEncoder(handle_unknown="ignore", sparse_output=False), low_cols))
    if high_cols:
        if _te_is_sklearn:
            te = _TargetEncoderClass()
        else:
            te = _TargetEncoderClass(smoothing=5.0)
        transformers.append(("high", te, high_cols))
    pre = ColumnTransformer(transformers=transformers, remainder="drop", sparse_threshold=0.0)
    pre.fit(Xtr, ytr)  # y passed for TargetEncoder
    return pre

prep2 = {}

for name, df in cleaned_datasets.items():
    if name not in TARGETS:
        continue

    # one split per dataset (reused across all models)
    X, y = _split_once(df, TARGETS[name], DROP_COLS.get(name, []))
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.2,
        stratify=y if y.nunique() == 2 else None,
        random_state=42
    )

    # decide encodings from TRAIN only
    num_cols, cat_cols = _column_buckets(Xtr)
    bin_cols, low_cols, high_cols = _categorical_groups(Xtr, cat_cols)

    # binary maps learned on train; apply to train & test
    bin_maps = _fit_binary_maps(Xtr, bin_cols)
    Xtr_b = _apply_binary_maps(Xtr, bin_maps)
    Xte_b = _apply_binary_maps(Xte, bin_maps)

    # column transformer (with TargetEncoder for high-card cats)
    pre = _build_preprocessor_for_train(Xtr_b, ytr, num_cols, bin_cols, low_cols, high_cols)

    # transform once â†’ features for trees/XGB
    Xtr_tree = pre.transform(Xtr_b)
    Xte_tree = pre.transform(Xte_b)

    # final global scaler for linear models (LR/SVM)
    fin_scaler = StandardScaler(with_mean=False)
    Xtr_lin = fin_scaler.fit_transform(Xtr_tree)
    Xte_lin = fin_scaler.transform(Xte_tree)

    prep2[name] = {
        "ytr": ytr, "yte": yte,
        "Xtr_tree": Xtr_tree, "Xte_tree": Xte_tree,    # for DT/RF/XGB
        "Xtr_lin":  Xtr_lin,  "Xte_lin":  Xte_lin,     # for LR/SVM
        "pre": pre,
        "bin_maps": bin_maps,
        "cols": {"num": num_cols, "bin": bin_cols, "low": low_cols, "high": high_cols},
    }

print("Step 2 updated and completed (OneHotEncoder uses sparse_output=False).")


Step 2 updated and completed (OneHotEncoder uses sparse_output=False).


## Baseline Training Phase

In [10]:
DATASET_ORDER = [
    "uci_adroid_permission",
    "indian_liver",
    "uci_mushroom",
    "uci_phising",
    "uci_secondary_mushroom"
]

RUN_DIR = Path("model_Trained_data") / time.strftime("%Y%m%d-%H%M%S")
RUN_DIR.mkdir(parents=True, exist_ok=True)

SVC_TRAIN_CAP = 50_000  # optional safety cap
N_JOBS = -1


# UTILITY FUNCTIONS

def _is_binary(y):
    try:
        return (y.nunique() == 2)
    except Exception:
        return (pd.Series(y).nunique() == 2)

def _evaluate(model, Xte, yte, binary):
    """Compute accuracy, f1, and AUC metrics."""
    p = model.predict(Xte)
    pro = None
    if binary and hasattr(model, "predict_proba"):
        pro = model.predict_proba(Xte)[:, 1]
    return {
        "accuracy": accuracy_score(yte, p),
        "f1_weighted": f1_score(yte, p, average="weighted", zero_division=0),
        "f1_macro": f1_score(yte, p, average="macro", zero_division=0),
        "roc_auc": roc_auc_score(yte, pro) if (binary and pro is not None) else None,
        "pr_auc": average_precision_score(yte, pro) if (binary and pro is not None) else None,
    }


# BASELINE TRAINING LOOP

ordered_keys = [k for k in DATASET_ORDER if k in prep2] + [k for k in prep2.keys() if k not in DATASET_ORDER]

models_trained = {}
baseline_rows = []

for ds in ordered_keys:
    pack = prep2[ds]
    Xtr_tree, Xte_tree = pack["Xtr_tree"], pack["Xte_tree"]
    Xtr_lin,  Xte_lin  = pack["Xtr_lin"],  pack["Xte_lin"]
    ytr, yte = pack["ytr"], pack["yte"]
    binary = _is_binary(ytr)

    print(f"\n\n============================")
    print(f"ðŸ”¹ DATASET: {ds}")
    print(f"============================")

    models_trained.setdefault(ds, {})

    #  Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(Xtr_tree, ytr)
    m = _evaluate(dt, Xte_tree, yte, binary)
    models_trained[ds]["DT"] = dt
    print("\nDecision Tree:")
    for k, v in m.items():
        print(f"   {k:>12}: {v:.4f}" if v is not None else f"   {k:>12}: â€”")
    baseline_rows.append({"dataset": ds, "model": "DT", **m})

    #  Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=N_JOBS)
    rf.fit(Xtr_tree, ytr)
    m = _evaluate(rf, Xte_tree, yte, binary)
    models_trained[ds]["RF"] = rf
    print("\nRandom Forest:")
    for k, v in m.items():
        print(f"   {k:>12}: {v:.4f}" if v is not None else f"   {k:>12}: â€”")
    baseline_rows.append({"dataset": ds, "model": "RF", **m})

    #  XGBoost
    xgbc = XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42, n_jobs=N_JOBS)
    xgbc.fit(Xtr_tree, ytr)
    m = _evaluate(xgbc, Xte_tree, yte, binary)
    models_trained[ds]["XGB"] = xgbc
    print("\nXGBoost:")
    for k, v in m.items():
        print(f"   {k:>12}: {v:.4f}" if v is not None else f"   {k:>12}: â€”")
    baseline_rows.append({"dataset": ds, "model": "XGB", **m})

    #  Logistic Regression
    lr = LogisticRegression(max_iter=3000, n_jobs=N_JOBS)
    lr.fit(Xtr_lin, ytr)
    m = _evaluate(lr, Xte_lin, yte, binary)
    models_trained[ds]["LR"] = lr
    print("\nLogistic Regression:")
    for k, v in m.items():
        print(f"   {k:>12}: {v:.4f}" if v is not None else f"   {k:>12}: â€”")
    baseline_rows.append({"dataset": ds, "model": "LR", **m})

    #  Support Vector Classifier
    if SVC_TRAIN_CAP and len(ytr) > SVC_TRAIN_CAP:
        idx = np.random.RandomState(42).choice(len(ytr), size=SVC_TRAIN_CAP, replace=False)
        Xtr_svc, ytr_svc = Xtr_lin[idx], ytr.iloc[idx]
    else:
        Xtr_svc, ytr_svc = Xtr_lin, ytr

    svc = SVC(kernel="rbf", probability=True, random_state=42)
    svc.fit(Xtr_svc, ytr_svc)
    m = _evaluate(svc, Xte_lin, yte, binary)
    models_trained[ds]["SVC"] = svc
    print("\nSupport Vector Classifier:")
    for k, v in m.items():
        print(f"   {k:>12}: {v:.4f}" if v is not None else f"   {k:>12}: â€”")
    baseline_rows.append({"dataset": ds, "model": "SVC", **m})

# DISPLAY BASELINE TABLE

baseline_df = pd.DataFrame(baseline_rows)
baseline_df = baseline_df.sort_values(["dataset", "f1_weighted", "accuracy"], ascending=[True, False, False])
display(baseline_df)

# SAVE TRAINED MODELS

for ds, model_dict in models_trained.items():
    ds_dir = RUN_DIR / ds
    ds_dir.mkdir(parents=True, exist_ok=True)
    for model_name, est in model_dict.items():
        with open(ds_dir / f"{model_name}.pkl", "wb") as f:
            pickle.dump(est, f)

print(f"\n Baseline models saved under: {RUN_DIR}")



ðŸ”¹ DATASET: uci_android_permissions.csv.gz

Decision Tree:
       accuracy: 0.9625
    f1_weighted: 0.9625
       f1_macro: 0.9625
        roc_auc: 0.9748
         pr_auc: 0.9666

Random Forest:
       accuracy: 0.9697
    f1_weighted: 0.9697
       f1_macro: 0.9697
        roc_auc: 0.9927
         pr_auc: 0.9937

XGBoost:
       accuracy: 0.9673
    f1_weighted: 0.9673
       f1_macro: 0.9673
        roc_auc: 0.9935
         pr_auc: 0.9940

Logistic Regression:
       accuracy: 0.9576
    f1_weighted: 0.9576
       f1_macro: 0.9576
        roc_auc: 0.9880
         pr_auc: 0.9889

Support Vector Classifier:
       accuracy: 0.9639
    f1_weighted: 0.9639
       f1_macro: 0.9639
        roc_auc: 0.9882
         pr_auc: 0.9891


ðŸ”¹ DATASET: uci_indian_liver.csv.gz

Decision Tree:
       accuracy: 0.5470
    f1_weighted: 0.5406
       f1_macro: 0.4355
        roc_auc: 0.4376
         pr_auc: 0.6850

Random Forest:
       accuracy: 0.7521
    f1_weighted: 0.7273
       f1_macro: 0.64

Unnamed: 0,dataset,model,accuracy,f1_weighted,f1_macro,roc_auc,pr_auc
1,uci_android_permissions.csv.gz,RF,0.969661,0.969661,0.969661,0.992672,0.993714
2,uci_android_permissions.csv.gz,XGB,0.967275,0.967274,0.967274,0.993537,0.99401
4,uci_android_permissions.csv.gz,SVC,0.963866,0.963866,0.963866,0.988229,0.98914
0,uci_android_permissions.csv.gz,DT,0.962502,0.962501,0.962501,0.974796,0.966635
3,uci_android_permissions.csv.gz,LR,0.957559,0.957559,0.957559,0.988026,0.988941
6,uci_indian_liver.csv.gz,RF,0.752137,0.727332,0.646305,0.756201,0.898245
8,uci_indian_liver.csv.gz,LR,0.735043,0.67536,0.558383,0.830971,0.934957
7,uci_indian_liver.csv.gz,XGB,0.700855,0.670918,0.573126,0.727853,0.886988
9,uci_indian_liver.csv.gz,SVC,0.709402,0.588803,0.415,0.692417,0.873874
5,uci_indian_liver.csv.gz,DT,0.547009,0.540576,0.435503,0.437633,0.684956



 Baseline models saved under: model_Trained_data/20251022-204146


## This phase is where we fine tuned by the use of hyper parameters


In [15]:
# === Hyperparameter Tuning for First 3 Small Datasets ===
# (Skip Phishing and Mushroom for now)

DATASETS_TO_TUNE = [
    "uci_android_permissions.csv.gz",
    "uci_indian_liver.csv.gz",
    "uci_mushroom.csv.gz"
]

RUN_DIR = Path("model_Tuned_data") / time.strftime("%Y%m%d-%H%M%S")
RUN_DIR.mkdir(parents=True, exist_ok=True)

SVC_TRAIN_CAP = 50_000
N_JOBS = -1
CV_FOLDS = 3

def _is_binary(y):
    try:
        return (y.nunique() == 2)
    except Exception:
        return (pd.Series(y).nunique() == 2)

def _evaluate(model, Xte, yte, binary):
    p = model.predict(Xte)
    pro = None
    if binary and hasattr(model, "predict_proba"):
        pro = model.predict_proba(Xte)[:, 1]
    return {
        "accuracy": accuracy_score(yte, p),
        "f1_weighted": f1_score(yte, p, average="weighted", zero_division=0),
        "f1_macro": f1_score(yte, p, average="macro", zero_division=0),
        "roc_auc": roc_auc_score(yte, pro) if (binary and pro is not None) else None,
        "pr_auc": average_precision_score(yte, pro) if (binary and pro is not None) else None,
    }

GRID_PARAMS = {
    "DT": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10],
            "criterion": ["gini", "entropy"]
        }
    },
    "RF": {
        "model": RandomForestClassifier(random_state=42, n_jobs=N_JOBS),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5]
        }
    },
    "XGB": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42, n_jobs=N_JOBS),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1, 0.2],
            "max_depth": [3, 5, 7]
        }
    },
    "LR": {
        "model": LogisticRegression(max_iter=3000, n_jobs=N_JOBS),
        "params": {
            "C": [0.1, 1, 10],
            "solver": ["lbfgs", "liblinear"]
        }
    },
    "SVC": {
        "model": SVC(probability=True, random_state=42),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto"]
        }
    },
}

tuned_models = {}
tuning_rows = []

for ds in DATASETS_TO_TUNE:
    pack = prep2[ds]
    Xtr_tree, Xte_tree = pack["Xtr_tree"], pack["Xte_tree"]
    Xtr_lin, Xte_lin = pack["Xtr_lin"], pack["Xte_lin"]
    ytr, yte = pack["ytr"], pack["yte"]
    binary = _is_binary(ytr)

    print(f"\nDATASET: {ds}")
    tuned_models.setdefault(ds, {})

    for model_name, spec in GRID_PARAMS.items():
        model, grid = spec["model"], spec["params"]
        Xtr_use, Xte_use = (Xtr_lin, Xte_lin) if model_name in ["SVC", "LR"] else (Xtr_tree, Xte_tree)

        if model_name == "SVC" and SVC_TRAIN_CAP and len(ytr) > SVC_TRAIN_CAP:
            idx = np.random.RandomState(42).choice(len(ytr), size=SVC_TRAIN_CAP, replace=False)
            Xtr_use, ytr_use = Xtr_use[idx], ytr.iloc[idx]
        else:
            ytr_use = ytr

        print(f"\nTuning {model_name}...")
        gridcv = GridSearchCV(model, grid, cv=CV_FOLDS, scoring="f1_weighted", n_jobs=N_JOBS)
        gridcv.fit(Xtr_use, ytr_use)

        best_model = gridcv.best_estimator_
        best_params = gridcv.best_params_
        scores = _evaluate(best_model, Xte_use, yte, binary)

        print(f"Model: {model_name}")
        print(f"Best Params: {best_params}")
        for k, v in scores.items():
            print(f"   {k}: {v:.4f}" if v is not None else f"   {k}: â€”")

        tuned_models[ds][model_name] = best_model
        tuning_rows.append({
            "dataset": ds,
            "model": model_name,
            **best_params,
            **scores
        })

    ds_dir = RUN_DIR / ds
    ds_dir.mkdir(parents=True, exist_ok=True)
    for model_name, est in tuned_models[ds].items():
        with open(ds_dir / f"{model_name}_tuned.pkl", "wb") as f:
            pickle.dump(est, f)

print(f"\nAll tuned models saved under: {RUN_DIR}")

tuning_df = pd.DataFrame(tuning_rows)
tuning_df = tuning_df.sort_values(["dataset", "f1_weighted", "accuracy"], ascending=[True, False, False])
display(tuning_df)

summary_path = RUN_DIR / "tuning_summary.csv"
tuning_df.to_csv(summary_path, index=False)
print(f"Summary saved to: {summary_path}")


DATASET: uci_android_permissions.csv.gz

Tuning DT...
Model: DT
Best Params: {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2}
   accuracy: 0.9632
   f1_weighted: 0.9632
   f1_macro: 0.9632
   roc_auc: 0.9759
   pr_auc: 0.9664

Tuning RF...
Model: RF
Best Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
   accuracy: 0.9700
   f1_weighted: 0.9700
   f1_macro: 0.9700
   roc_auc: 0.9930
   pr_auc: 0.9939

Tuning XGB...
Model: XGB
Best Params: {'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200}
   accuracy: 0.9700
   f1_weighted: 0.9700
   f1_macro: 0.9700
   roc_auc: 0.9941
   pr_auc: 0.9944

Tuning LR...
Model: LR
Best Params: {'C': 1, 'solver': 'lbfgs'}
   accuracy: 0.9576
   f1_weighted: 0.9576
   f1_macro: 0.9576
   roc_auc: 0.9880
   pr_auc: 0.9889

Tuning SVC...
Model: SVC
Best Params: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
   accuracy: 0.9656
   f1_weighted: 0.9656
   f1_macro: 0.9656
   roc_auc: 0.9892
   pr_auc: 0.9903

DATASET:

Unnamed: 0,dataset,model,criterion,max_depth,min_samples_split,accuracy,f1_weighted,f1_macro,roc_auc,pr_auc,n_estimators,learning_rate,C,solver,gamma,kernel
1,uci_android_permissions.csv.gz,RF,,,5.0,0.970002,0.970001,0.970001,0.992959,0.993937,200.0,,,,,
2,uci_android_permissions.csv.gz,XGB,,7.0,,0.970002,0.97,0.970001,0.994093,0.994385,200.0,0.2,,,,
4,uci_android_permissions.csv.gz,SVC,,,,0.96557,0.96557,0.96557,0.989204,0.990329,,,10.0,,scale,rbf
0,uci_android_permissions.csv.gz,DT,gini,20.0,2.0,0.963184,0.963183,0.963183,0.97585,0.966446,,,,,,
3,uci_android_permissions.csv.gz,LR,,,,0.957559,0.957559,0.957559,0.988026,0.988941,,,1.0,lbfgs,,
6,uci_indian_liver.csv.gz,RF,,,2.0,0.752137,0.727332,0.646305,0.756201,0.898245,200.0,,,,,
8,uci_indian_liver.csv.gz,LR,,,,0.74359,0.697277,0.594126,0.825656,0.932376,,,10.0,lbfgs,,
7,uci_indian_liver.csv.gz,XGB,,5.0,,0.717949,0.677749,0.574076,0.740964,0.894411,200.0,0.05,,,,
9,uci_indian_liver.csv.gz,SVC,,,,0.717949,0.663031,0.546032,0.703756,0.87697,,,10.0,,auto,rbf
5,uci_indian_liver.csv.gz,DT,gini,,10.0,0.581197,0.589036,0.51221,0.533841,0.73846,,,,,,


Summary saved to: model_Tuned_data/20251022-221726/tuning_summary.csv


In [20]:
# Download ONLY .pkl files from the LATEST runs in model_Trained_data/ and model_Tuned_data/
import os, shutil, time
from pathlib import Path
from google.colab import files

TRAIN_BASE = Path("model_Trained_data")
TUNED_BASE = Path("model_Tuned_data")

def _latest_run(base: Path):
    if not base.exists(): return None
    runs = [p for p in base.iterdir() if p.is_dir()]
    if not runs: return None
    runs.sort(key=lambda p: p.stat().st_mtime, reverse=True)
    return runs[0]

stamp = time.strftime("%Y%m%d-%H%M%S")
export_dir = Path(f"pickles_latest_{stamp}")
export_dir.mkdir(parents=True, exist_ok=True)

for base, label in [(TRAIN_BASE, "trained"), (TUNED_BASE, "tuned")]:
    run = _latest_run(base)
    if run is None:
        print(f"No {label} run found at {base}")
        continue
    dest = export_dir / label / run.name
    dest.mkdir(parents=True, exist_ok=True)
    for root, _, files_ in os.walk(run):
        for fname in files_:
            if fname.lower().endswith(".pkl"):
                src = Path(root) / fname
                rel = Path(root).relative_to(run)
                (dest / rel).mkdir(parents=True, exist_ok=True)
                shutil.copy2(src, dest / rel / fname)

zip_path = shutil.make_archive(str(export_dir), "zip", root_dir=export_dir)
files.download(zip_path)
print(f"Downloaded: {zip_path}")


RuntimeError: File size too large, try using force_zip64