<a href="https://colab.research.google.com/github/JoshuaGottlieb/SHAP-Feature-Selection/blob/martin-branch/Capstone_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations and Loading Phase

In [1]:
# Phase 1, Step 1: Install and Import Libraries

# Install external libs (only once in Colab)
!pip -q install xgboost shap

# Core libraries
import numpy as np
import pandas as pd
import warnings

# Visualization + explainability
import shap

# Machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier

# Preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
from sklearn.preprocessing import StandardScaler


# Gradient boosting
import xgboost as xgb

# Misc
from itertools import product
from scipy import sparse

# Turn off warnings for cleaner output
# NOTE(review): the "ignore" filter is global — it hides every warning category for the
# rest of the session, including any convergence or deprecation warnings that later
# cells might raise. Consider narrowing it if a result looks suspicious.
warnings.filterwarnings("ignore")

# Marker so the reader can tell the setup cell finished.
print("Libraries installed and imported")


Libraries installed and imported


In [2]:
# Phase 1, Step 2a: pick the dataset archives from the local machine (Colab-only helper)
from google.colab import files

# The returned mapping is keyed by file name; the keys are echoed just below.
uploaded = files.upload()

# Echo what was received so a missing archive is noticed before the loading step.
print("Uploaded files:", [fname for fname in uploaded.keys()])


Saving uci_android_permissions.csv.gz to uci_android_permissions.csv.gz
Saving uci_indian_liver.csv.gz to uci_indian_liver.csv.gz
Saving uci_mushroom.csv.gz to uci_mushroom.csv.gz
Saving uci_phishing_url.csv.gz to uci_phishing_url.csv.gz
Saving uci_secondary_mushroom.csv.gz to uci_secondary_mushroom.csv.gz
Uploaded files: ['uci_android_permissions.csv.gz', 'uci_indian_liver.csv.gz', 'uci_mushroom.csv.gz', 'uci_phishing_url.csv.gz', 'uci_secondary_mushroom.csv.gz']


In [3]:
# Phase 1, Step 2b: read every dataset archive into a DataFrame

# Archive names, resolved relative to the working directory (or a mounted Google Drive).
FILES = [
    "uci_android_permissions.csv.gz",
    "uci_indian_liver.csv.gz",
    "uci_mushroom.csv.gz",
    "uci_phishing_url.csv.gz",
    "uci_secondary_mushroom.csv.gz",
]

# file name -> raw DataFrame; only archives that were read successfully get an entry
raw_datasets = {}

for file in FILES:
    try:
        df = pd.read_csv(file, low_memory=False)
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining archives.
        print(f"Error loading {file}: {e}")
    else:
        raw_datasets[file] = df
        print(f"Loaded {file}: shape = {df.shape}")

print("\n All datasets attempted. Check above for shapes/errors.")


Loaded uci_android_permissions.csv.gz: shape = (29332, 87)
Loaded uci_indian_liver.csv.gz: shape = (583, 11)
Loaded uci_mushroom.csv.gz: shape = (8124, 24)
Loaded uci_phishing_url.csv.gz: shape = (235795, 56)
Loaded uci_secondary_mushroom.csv.gz: shape = (61069, 21)

 All datasets attempted. Check above for shapes/errors.


## Checking for missing values

In [4]:
# Phase 1, Step 3: Inspect missing values in each dataset

def check_missing(df):
    """Count missing cells per column, keeping only the columns that have at least one.

    Returns a Series indexed by column name, ordered from most to fewest missing values.
    """
    na_counts = df.isna().sum()
    has_gaps = na_counts > 0
    return na_counts[has_gaps].sort_values(ascending=False)

# Print a per-dataset report: shape first, then either the missing-value counts
# or a note that the dataset is complete.
for name, df in raw_datasets.items():
    print(f"\n===== {name} =====")
    print(f"Shape: {df.shape}")
    missing = check_missing(df)
    if len(missing) == 0:
        print("No missing values")
        continue
    print("Missing values per column:")
    print(missing)



===== uci_android_permissions.csv.gz =====
Shape: (29332, 87)
No missing values

===== uci_indian_liver.csv.gz =====
Shape: (583, 11)
Missing values per column:
almumin_globulin_ratio    4
dtype: int64

===== uci_mushroom.csv.gz =====
Shape: (8124, 24)
No missing values

===== uci_phishing_url.csv.gz =====
Shape: (235795, 56)
No missing values

===== uci_secondary_mushroom.csv.gz =====
Shape: (61069, 21)
Missing values per column:
veil_type            57892
spore_print_color    54715
veil_color           53656
stem_root            51538
stem_surface         38124
gill_spacing         25063
cap_surface          14120
gill_attachment       9884
ring_type             2471
dtype: int64


## Cleaning phase

In [5]:
# Phase 1, Step 4: Clean datasets (drop hi-missing cols, then impute)

# policy knobs
# Columns whose fraction of NaN cells is strictly greater than this are dropped by clean_one.
HIGH_MISS_THRESHOLD = 0.60  # drop columns whose NaN rate > 60%
# Per-dataset columns removed unconditionally, keyed by the archive file name.
# NOTE(review): the same mapping is assigned again in the "Phase 1.5" config cell.
DROP_COLS = {
    "uci_phishing_url.csv.gz": ["url", "title"],   # high-cardinality free text
    # add others if needed
}

# file name -> cleaned DataFrame; filled by the loop that calls clean_one for each raw dataset
cleaned_datasets = {}

def clean_one(name, df, high_miss_threshold=None, drop_cols=None):
    """Return a cleaned copy of ``df``: drop unwanted columns, then impute remaining gaps.

    Steps: (0) drop the per-dataset columns, (1) drop columns whose NaN fraction exceeds
    the threshold, (2) fill remaining NaNs — mode for object/string/category columns,
    median for everything else — and (3) print a short report.

    Parameters
    ----------
    name : str
        Dataset key; used in the printed report and to look up DROP_COLS.
    df : pandas.DataFrame
        Raw frame. It is copied, never modified.
    high_miss_threshold : float, optional
        Columns with a NaN fraction strictly greater than this are dropped.
        Defaults to the module-level HIGH_MISS_THRESHOLD.
    drop_cols : list of str, optional
        Columns to drop unconditionally (names absent from ``df`` are ignored).
        Defaults to ``DROP_COLS.get(name, [])``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    NOTE(review): the fill values are computed on the whole frame that is passed in. The
    notebook calls this before its train/test split, so the statistics include rows that
    later land in the test set.
    """
    threshold = HIGH_MISS_THRESHOLD if high_miss_threshold is None else high_miss_threshold
    forced_drops = DROP_COLS.get(name, []) if drop_cols is None else drop_cols

    dfc = df.copy()

    # 0) optional per-dataset drops
    present = [c for c in forced_drops if c in dfc.columns]
    if present:
        dfc = dfc.drop(columns=present)

    # 1) drop columns with too many missing
    miss_frac = dfc.isna().mean()
    to_drop = miss_frac[miss_frac > threshold].index.tolist()
    if to_drop:
        dfc = dfc.drop(columns=to_drop)

    # 2) impute remaining NaNs: categorical→mode, everything else→median
    na_cols = [c for c in dfc.columns if dfc[c].isna().any()]
    for c in na_cols:
        dtype = dfc[c].dtype
        # Checking only `dtype == "object"` would send category/string columns to
        # .median(), which fails on non-numeric data; treat all of them as categorical.
        is_categorical = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if is_categorical:
            mode_val = dfc[c].mode(dropna=True)
            fill_val = mode_val.iloc[0] if not mode_val.empty else ""
        else:
            fill_val = dfc[c].median()
        dfc[c] = dfc[c].fillna(fill_val)

    # 3) report + return
    print(f"\n===== {name} =====")
    print(f"Dropped high-missing cols (> {int(threshold*100)}% NA): {to_drop if to_drop else 'none'}")
    print(f"Shape after clean: {dfc.shape}")
    rem = dfc.isna().sum()
    rem = rem[rem > 0]
    print("Remaining missing values:", int(rem.sum()))
    return dfc

# Clean every raw dataset and register the result under the same key.
for name, df in raw_datasets.items():
    cleaned_datasets[name] = clean_one(name, df)

# Final sanity pass: one line per dataset with its shape and total count of NaN cells.
print("\n Cleaning complete. Summary:")
for name, df in cleaned_datasets.items():
    remaining_na = int(df.isna().sum().sum())
    print(f"{name}: shape={df.shape}, remaining_NA={remaining_na}")



===== uci_android_permissions.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (29332, 87)
Remaining missing values: 0

===== uci_indian_liver.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (583, 11)
Remaining missing values: 0

===== uci_mushroom.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (8124, 24)
Remaining missing values: 0

===== uci_phishing_url.csv.gz =====
Dropped high-missing cols (> 60% NA): none
Shape after clean: (235795, 54)
Remaining missing values: 0

===== uci_secondary_mushroom.csv.gz =====
Dropped high-missing cols (> 60% NA): ['stem_root', 'stem_surface', 'veil_type', 'veil_color', 'spore_print_color']
Shape after clean: (61069, 16)
Remaining missing values: 0

 Cleaning complete. Summary:
uci_android_permissions.csv.gz: shape=(29332, 87), remaining_NA=0
uci_indian_liver.csv.gz: shape=(583, 11), remaining_NA=0
uci_mushroom.csv.gz: shape=(8124, 24), remaining_NA=0
uci_phishing_url.c

In [6]:
# === Phase 1.5: dataset config (needed for Phase 2 and later) ===
# Label column of each dataset, keyed by the archive file name.
TARGETS = dict([
    ("uci_android_permissions.csv.gz", "result"),
    ("uci_indian_liver.csv.gz", "has_liver_disease"),
    ("uci_mushroom.csv.gz", "poisonous"),
    ("uci_phishing_url.csv.gz", "label"),
    ("uci_secondary_mushroom.csv.gz", "class"),
])

# Feature columns to discard before modelling, keyed the same way.
DROP_COLS = dict([
    # high-cardinality text cols, dropped in cleaning
    ("uci_phishing_url.csv.gz", ["url", "title"]),
])


In [7]:
# Phase 2 — Step 2 single split + smart preprocessing per dataset

# Resolve a target-encoder implementation once:
#   1) scikit-learn's own TargetEncoder (available from scikit-learn 1.3), else
#   2) category_encoders.TargetEncoder as a fallback.
# `_te_is_sklearn` records which one was picked so later code can construct it correctly.
try:
    from sklearn.preprocessing import TargetEncoder as SK_TargetEncoder  # sklearn ≥ 1.3
    _TargetEncoderClass = SK_TargetEncoder
    _te_is_sklearn = True
except Exception:
    _te_is_sklearn = False
    try:
        import category_encoders as ce  # pip install category_encoders (once)
        _TargetEncoderClass = ce.TargetEncoder
    except Exception as e:
        # Chain the original failure so the traceback shows why the fallback import failed.
        raise RuntimeError(
            "TargetEncoder not found. Install scikit-learn>=1.3 or `pip install category_encoders`."
        ) from e

def _split_once(df, target, drops):
    df = df.drop(columns=[c for c in (drops or []) if c in df.columns], errors="ignore").copy()
    y = df[target]
    X = df.drop(columns=[target])
    if y.dtype == "object":
        uniq = sorted(y.dropna().unique().tolist())
        if len(uniq) == 2:
            y = y.map({uniq[0]: 0, uniq[1]: 1})
    return X, y.astype(int)

def _column_buckets(X):
    num = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
    cat = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
    return num, cat

def _categorical_groups(X_train, cat_cols):
    bins, lows, highs = [], [], []
    for c in cat_cols:
        n = X_train[c].dropna().nunique()
        if n <= 2:
            bins.append(c)          # binary → label encode (0/1)
        elif n <= 10:
            lows.append(c)          # low-card → one-hot
        else:
            highs.append(c)         # high-card → target encode
    return bins, lows, highs

def _fit_binary_maps(X_train, bin_cols):
    maps = {}
    for c in bin_cols:
        uniq = list(pd.Series(X_train[c]).dropna().unique())
        if len(uniq) <= 1:
            maps[c] = {uniq[0]: 0} if len(uniq) == 1 else {}
        else:
            uniq_sorted = sorted(uniq, key=lambda x: str(x))
            maps[c] = {uniq_sorted[0]: 0, uniq_sorted[1]: 1}
    return maps

def _apply_binary_maps(X, maps):
    X2 = X.copy()
    for c, m in maps.items():
        X2[c] = X2[c].map(m).fillna(-1)  # unseen → -1
    return X2

def _build_preprocessor_for_train(Xtr, ytr, num_cols, bin_cols, low_cols, high_cols):
    """Assemble and fit the per-dataset ColumnTransformer on the training split.

    One transformer is added per non-empty column group:
      * ``num``  -> StandardScaler(with_mean=False)
      * ``bin``  -> passed through unchanged
      * ``low``  -> OneHotEncoder (unknown categories ignored, dense output)
      * ``high`` -> target encoder (``smoothing=5.0`` only for the non-sklearn class)
    Columns outside these groups are dropped. ``ytr`` is passed to ``fit`` because the
    target encoder needs the labels.

    Returns the fitted ColumnTransformer.
    """
    steps = []
    if num_cols:
        steps.append(("num", StandardScaler(with_mean=False), num_cols))
    if bin_cols:
        steps.append(("bin", "passthrough", bin_cols))
    if low_cols:
        # sparse_output=False makes the encoder return dense arrays for downstream steps
        one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        steps.append(("low", one_hot, low_cols))
    if high_cols:
        encoder = _TargetEncoderClass() if _te_is_sklearn else _TargetEncoderClass(smoothing=5.0)
        steps.append(("high", encoder, high_cols))

    preprocessor = ColumnTransformer(transformers=steps, remainder="drop", sparse_threshold=0.0)
    preprocessor.fit(Xtr, ytr)  # y passed for TargetEncoder
    return preprocessor

# dataset name -> dict with the train/test targets, the encoded feature matrices for
# tree and linear models, the fitted preprocessor, and the column groupings
prep2 = {}

for name, df in cleaned_datasets.items():
    # Datasets without a configured target column are skipped silently.
    if name not in TARGETS:
        continue

    # one split per dataset (reused across all models)
    X, y = _split_once(df, TARGETS[name], DROP_COLS.get(name, []))
    # 80/20 split; stratified only when the target has exactly two classes.
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.2,
        stratify=y if y.nunique() == 2 else None,
        random_state=42
    )

    # decide encodings from TRAIN only
    num_cols, cat_cols = _column_buckets(Xtr)
    bin_cols, low_cols, high_cols = _categorical_groups(Xtr, cat_cols)

    # binary maps learned on train; apply to train & test
    bin_maps = _fit_binary_maps(Xtr, bin_cols)
    Xtr_b = _apply_binary_maps(Xtr, bin_maps)
    Xte_b = _apply_binary_maps(Xte, bin_maps)

    # column transformer (with TargetEncoder for high-card cats)
    pre = _build_preprocessor_for_train(Xtr_b, ytr, num_cols, bin_cols, low_cols, high_cols)

    # transform once → features for trees/XGB
    Xtr_tree = pre.transform(Xtr_b)
    Xte_tree = pre.transform(Xte_b)

    # final global scaler for linear models (LR/SVM)
    # It is fitted on the already-transformed training matrix, so numeric columns are
    # scaled a second time here.
    # NOTE(review): fin_scaler is not stored in prep2, so it cannot be re-applied to new
    # data after this loop moves on to the next dataset.
    fin_scaler = StandardScaler(with_mean=False)
    Xtr_lin = fin_scaler.fit_transform(Xtr_tree)
    Xte_lin = fin_scaler.transform(Xte_tree)

    prep2[name] = {
        "ytr": ytr, "yte": yte,
        "Xtr_tree": Xtr_tree, "Xte_tree": Xte_tree,    # for DT/RF/XGB
        "Xtr_lin":  Xtr_lin,  "Xte_lin":  Xte_lin,     # for LR/SVM
        "pre": pre,
        "bin_maps": bin_maps,
        "cols": {"num": num_cols, "bin": bin_cols, "low": low_cols, "high": high_cols},
    }

print("Step 2 updated and completed (OneHotEncoder uses sparse_output=False).")


Step 2 updated and completed (OneHotEncoder uses sparse_output=False).


## Baseline Training Phase

In [8]:
# Phase 2 — Step 2.2: Baseline training for DT, RF, XGB, SVM, LR
# (exact code, only adds automatic pickle saving)

import pickle, time
from pathlib import Path


# Order (optional). Any not listed but present in prep2 will be appended.
# Entries must equal the prep2 keys (the archive file names) to take effect; names that
# match no key are simply ignored when the processing order is built.
DATASET_ORDER = [
    "uci_android_permissions.csv.gz",
    "uci_indian_liver.csv.gz",
    "uci_mushroom.csv.gz",
    "uci_phishing_url.csv.gz",
    "uci_secondary_mushroom.csv.gz",
]

# Output folder for this run's pickled baseline models; the timestamp suffix gives each
# execution of the cell its own directory (created immediately).
RUN_DIR = Path("model_Trained_data") / time.strftime("%Y%m%d-%H%M%S")
RUN_DIR.mkdir(parents=True, exist_ok=True)

# When the training split has more rows than this, the baseline SVC is fitted on a
# random subsample of this size; a falsy value disables the cap.
SVC_TRAIN_CAP = 50_000  # optional safety cap

def _is_binary(y):
    try:
        return (y.nunique() == 2)
    except Exception:
        return (pd.Series(y).nunique() == 2)

def _evaluate(model, Xte, yte, binary):
    p = model.predict(Xte)
    pro = None
    if binary and hasattr(model, "predict_proba"):
        pro = model.predict_proba(Xte)[:, 1]
    return {
        "accuracy": accuracy_score(yte, p),
        "f1_weighted": f1_score(yte, p, average="weighted", zero_division=0),
        "f1_macro": f1_score(yte, p, average="macro", zero_division=0),
        "roc_auc": roc_auc_score(yte, pro) if (binary and pro is not None) else None,
        "pr_auc": average_precision_score(yte, pro) if (binary and pro is not None) else None,
    }

# Ensure we cover all datasets present in prep2
# Keys named in DATASET_ORDER come first (in that order); every remaining prep2 key follows.
ordered_keys = [k for k in DATASET_ORDER if k in prep2] + [k for k in prep2.keys() if k not in DATASET_ORDER]

models_trained = {}   # holds fitted estimators for saving AFTER display
baseline_rows = []    # rows for the output table

# For each dataset: fit five default-configured classifiers on the training split and
# record their metrics on the held-out split.
for ds in ordered_keys:
    pack = prep2[ds]
    Xtr_tree, Xte_tree = pack["Xtr_tree"], pack["Xte_tree"]   # DT/RF/XGB
    Xtr_lin,  Xte_lin  = pack["Xtr_lin"],  pack["Xte_lin"]    # LR/SVC
    ytr, yte = pack["ytr"], pack["yte"]
    binary = _is_binary(ytr)

    print(f"\n===== {ds} — Step 2.2 baseline =====")
    models_trained.setdefault(ds, {})

    # Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(Xtr_tree, ytr)
    m = _evaluate(dt, Xte_tree, yte, binary)
    models_trained[ds]["DT"] = dt
    baseline_rows.append({"dataset": ds, "model": "DT", **m})

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
    rf.fit(Xtr_tree, ytr)
    m = _evaluate(rf, Xte_tree, yte, binary)
    models_trained[ds]["RF"] = rf
    baseline_rows.append({"dataset": ds, "model": "RF", **m})

    # XGBoost
    xgbc = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss",
                             random_state=42, n_jobs=-1)
    xgbc.fit(Xtr_tree, ytr)
    m = _evaluate(xgbc, Xte_tree, yte, binary)
    models_trained[ds]["XGB"] = xgbc
    baseline_rows.append({"dataset": ds, "model": "XGB", **m})

    # Logistic Regression
    lr = LogisticRegression(max_iter=3000, n_jobs=-1)
    lr.fit(Xtr_lin, ytr)
    m = _evaluate(lr, Xte_lin, yte, binary)
    models_trained[ds]["LR"] = lr
    baseline_rows.append({"dataset": ds, "model": "LR", **m})

    # SVC (RBF)
    # Large training sets are subsampled (without replacement, fixed seed) down to
    # SVC_TRAIN_CAP rows; the draw is not stratified by class.
    if SVC_TRAIN_CAP and len(ytr) > SVC_TRAIN_CAP:
        idx = np.random.RandomState(42).choice(len(ytr), size=SVC_TRAIN_CAP, replace=False)
        Xtr_svc, ytr_svc = Xtr_lin[idx], ytr.iloc[idx]
    else:
        Xtr_svc, ytr_svc = Xtr_lin, ytr

    # probability=True so that _evaluate can call predict_proba for ROC/PR metrics.
    svc = SVC(kernel="rbf", probability=True, random_state=42)
    svc.fit(Xtr_svc, ytr_svc)
    m = _evaluate(svc, Xte_lin, yte, binary)
    models_trained[ds]["SVC"] = svc
    baseline_rows.append({"dataset": ds, "model": "SVC", **m})

# SHOW OUTPUT FIRST (Colab table)
# Within each dataset, rows are ordered best-first by weighted F1, then accuracy.
baseline_df = pd.DataFrame(baseline_rows)
display(baseline_df.sort_values(["dataset", "f1_weighted", "accuracy"], ascending=[True, False, False]))

# THEN SAVE EVERYTHING TO PICKLE
# Layout: RUN_DIR/<dataset>/<model>.pkl
for ds, model_dict in models_trained.items():
    ds_dir = RUN_DIR / ds
    ds_dir.mkdir(parents=True, exist_ok=True)
    for model_name, est in model_dict.items():
        with open(ds_dir / f"{model_name}.pkl", "wb") as f:
            pickle.dump(est, f)

print(f"\n Trained baseline models saved under: {RUN_DIR}")



===== uci_android_permissions.csv.gz — Step 2.2 baseline =====

===== uci_indian_liver.csv.gz — Step 2.2 baseline =====

===== uci_mushroom.csv.gz — Step 2.2 baseline =====

===== uci_phishing_url.csv.gz — Step 2.2 baseline =====

===== uci_secondary_mushroom.csv.gz — Step 2.2 baseline =====


Unnamed: 0,dataset,model,accuracy,f1_weighted,f1_macro,roc_auc,pr_auc
1,uci_android_permissions.csv.gz,RF,0.969661,0.969661,0.969661,0.992672,0.993714
2,uci_android_permissions.csv.gz,XGB,0.967275,0.967274,0.967274,0.993537,0.99401
4,uci_android_permissions.csv.gz,SVC,0.963866,0.963866,0.963866,0.988229,0.98914
0,uci_android_permissions.csv.gz,DT,0.962502,0.962501,0.962501,0.974796,0.966635
3,uci_android_permissions.csv.gz,LR,0.957559,0.957559,0.957559,0.988026,0.988941
6,uci_indian_liver.csv.gz,RF,0.752137,0.727332,0.646305,0.756201,0.898245
8,uci_indian_liver.csv.gz,LR,0.735043,0.67536,0.558383,0.830971,0.934957
7,uci_indian_liver.csv.gz,XGB,0.700855,0.670918,0.573126,0.727853,0.886988
9,uci_indian_liver.csv.gz,SVC,0.709402,0.588803,0.415,0.692417,0.873874
5,uci_indian_liver.csv.gz,DT,0.547009,0.540576,0.435503,0.437633,0.684956



 Trained baseline models saved under: model_Trained_data/20251014-134444


In [16]:
# Rebuild baseline table (if needed), pick winners, and save best_baseline.pkl
# Uses in-memory `baseline_rows` + `models_trained`. Writes winners to a clean export folder.


# 1) Make sure a baseline results table exists; rebuild it from the raw rows if it is
#    missing (or None), and stop when there is nothing to rebuild it from.
if globals().get("baseline_df") is None:
    if not globals().get("baseline_rows"):
        raise RuntimeError("No baseline_df and baseline_rows is empty—nothing to select from.")
    baseline_df = pd.DataFrame(baseline_rows)
    print("[info] Rebuilt baseline_df from baseline_rows.")

# 2) Resolve column names (robust to minor diffs)
def _col(df, options):
    low = {c.lower(): c for c in df.columns}
    for o in options:
        if o in low: return low[o]
    raise KeyError(f"Missing expected columns among: {options}")

# Actual column names in baseline_df for dataset, model, primary metric and tie-breaker.
c_ds  = _col(baseline_df, ["dataset"])
c_m   = _col(baseline_df, ["model"])
c_f1  = _col(baseline_df, ["f1_weighted","f1","f1_w"])
c_acc = _col(baseline_df, ["accuracy","acc"])

# 3) Pick winner per dataset (primary = f1_weighted, tie-break = accuracy)
# NOTE(review): the metrics in baseline_df were computed on the held-out split, so the
# winner is chosen using test-set scores.
winners = []
for ds, g in baseline_df.groupby(c_ds):
    g2 = g.copy()
    # Defensive fill-ins; _col already raised KeyError above if either column were missing.
    if c_f1 not in g2: g2[c_f1] = np.nan
    if c_acc not in g2: g2[c_acc] = np.nan
    # Highest weighted F1 first, accuracy breaks ties; the top row is the winner.
    r = g2.sort_values([c_f1, c_acc], ascending=False).iloc[0]
    winners.append({
        "dataset": ds,
        "best_model_2_2": r[c_m],
        "best_f1_2_2": r[c_f1],
        "best_acc_2_2": r[c_acc],
    })

winners_df = pd.DataFrame(winners).sort_values("dataset")
display(winners_df)

# 4) Save winners only to a clean export folder (no dependency on prior RUN_DIR)
# Layout: model_BEST_exports/<timestamp>/baseline/<dataset>/best_baseline.pkl
EXPORT_DIR = Path("model_BEST_exports") / time.strftime("%Y%m%d-%H%M%S") / "baseline"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Use in-memory models if available; else skip that dataset
# `missing` collects the (dataset, model) pairs whose estimator object could not be found.
missing = []
for _, row in winners_df.iterrows():
    ds, mdl = row["dataset"], row["best_model_2_2"]
    if 'models_trained' in globals() and ds in models_trained and mdl in models_trained[ds]:
        out_dir = EXPORT_DIR / ds
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(out_dir / "best_baseline.pkl", "wb") as f:
            pickle.dump(models_trained[ds][mdl], f)
        print(f"[{ds}] saved -> {out_dir/'best_baseline.pkl'} ({mdl})")
    else:
        missing.append((ds, mdl))

if missing:
    print("\n Skipped (model object not in memory):")
    for ds, mdl in missing:
        print(f" - {ds} / {mdl} (re-run Step 2.2 cell to repopulate models_trained, or point me to the baseline run folder)")

print(f"\n Baseline winners exported to: {EXPORT_DIR}")

# 5) Quick tally: which model won most often
print("\nWinner count by model:")
display(winners_df["best_model_2_2"].value_counts().rename_axis("model").reset_index(name="wins"))



[info] Rebuilt baseline_df from baseline_rows.


Unnamed: 0,dataset,best_model_2_2,best_f1_2_2,best_acc_2_2
0,uci_android_permissions.csv.gz,RF,0.969661,0.969661
1,uci_indian_liver.csv.gz,RF,0.727332,0.752137
2,uci_mushroom.csv.gz,DT,1.0,1.0
3,uci_phishing_url.csv.gz,SVC,0.999215,0.999215
4,uci_secondary_mushroom.csv.gz,RF,0.999918,0.999918


[uci_android_permissions.csv.gz] saved -> model_BEST_exports/20251014-182559/baseline/uci_android_permissions.csv.gz/best_baseline.pkl (RF)
[uci_indian_liver.csv.gz] saved -> model_BEST_exports/20251014-182559/baseline/uci_indian_liver.csv.gz/best_baseline.pkl (RF)
[uci_mushroom.csv.gz] saved -> model_BEST_exports/20251014-182559/baseline/uci_mushroom.csv.gz/best_baseline.pkl (DT)
[uci_phishing_url.csv.gz] saved -> model_BEST_exports/20251014-182559/baseline/uci_phishing_url.csv.gz/best_baseline.pkl (SVC)
[uci_secondary_mushroom.csv.gz] saved -> model_BEST_exports/20251014-182559/baseline/uci_secondary_mushroom.csv.gz/best_baseline.pkl (RF)

 Baseline winners exported to: model_BEST_exports/20251014-182559/baseline

Winner count by model:


Unnamed: 0,model,wins
0,RF,3
1,DT,1
2,SVC,1


## Hyperparameter Tuning Phase


In [10]:
# === Step 2.3 — Balanced hyperparameter tuning for DT, RF, XGB, LR, SVC ===
# Assumes already available in the notebook:
# time, Path, pickle, np, pd, display
# accuracy_score, f1_score, roc_auc_score, average_precision_score
# StratifiedKFold, RandomizedSearchCV
# DecisionTreeClassifier, RandomForestClassifier, LogisticRegression, SVC, xgb
# and your prep2 dict with: Xtr_tree, Xte_tree, Xtr_lin, Xte_lin, ytr, yte

# Datasets order (any extras in prep2 are appended)
# Entries must equal the prep2 keys (the archive file names) to take effect; names that
# match no key are simply ignored when the processing order is built.
DATASET_ORDER = [
    "uci_android_permissions.csv.gz",
    "uci_indian_liver.csv.gz",
    "uci_mushroom.csv.gz",
    "uci_phishing_url.csv.gz",
    "uci_secondary_mushroom.csv.gz",
]

# Output folder for this run's tuned models; the timestamp suffix gives each execution
# of the cell its own directory (created immediately).
# NOTE(review): this rebinds RUN_DIR, which the baseline cell also assigns.
RUN_DIR = Path("Model_Tuned_data") / time.strftime("%Y%m%d-%H%M%S")
RUN_DIR.mkdir(parents=True, exist_ok=True)

# Metric optimised by every randomized search below.
PRIMARY_SCORING = "f1_weighted"
# Number of stratified cross-validation folds used by each search.
CV_FOLDS = 3
# Parallelism of the search itself (individual estimators set their own n_jobs).
N_JOBS_CV = 1           # safer for Colab RAM
SVC_TRAIN_CAP = 20_000  # optional cap only for SVC search (refit uses full data)

def _is_binary(y):
    try:
        return (y.nunique() == 2)
    except Exception:
        return (pd.Series(y).nunique() == 2)

def _evaluate(model, Xte, yte, binary):
    p = model.predict(Xte)
    met = {
        "accuracy": accuracy_score(yte, p),
        "f1_weighted": f1_score(yte, p, average="weighted", zero_division=0),
        "f1_macro": f1_score(yte, p, average="macro", zero_division=0),
        "roc_auc": None,
        "pr_auc": None
    }
    if binary:
        pro = None
        if hasattr(model, "predict_proba"):
            try: pro = model.predict_proba(Xte)[:, 1]
            except: pro = None
        elif hasattr(model, "decision_function"):
            try: pro = model.decision_function(Xte)
            except: pro = None
        if pro is not None:
            try:
                met["roc_auc"] = roc_auc_score(yte, pro)
                met["pr_auc"] = average_precision_score(yte, pro)
            except:
                pass
    return met

# Compact, balanced search spaces (kept small for Colab)
# Each dict maps an estimator constructor argument to the candidate values to sample from.
DT_GRID = {
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "criterion": ["gini", "entropy", "log_loss"],
}
RF_GRID = {
    "n_estimators": [200, 400],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}
XGB_GRID = {
    "max_depth": [3, 5, 7],
    "learning_rate": [0.03, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "min_child_weight": [1, 3],
    "reg_alpha": [0, 0.01],
    "reg_lambda": [1.0, 2.0],
}
LR_GRID = {
    "C": [0.1, 1.0, 10.0],
    "penalty": ["l1", "l2", "elasticnet"],
    "l1_ratio": [0.0, 0.5, 1.0],
}
SVC_GRID = {
    "C": [0.1, 1, 10],
    "gamma": ["scale", 0.1, 0.01],
}

# Cover all prep2 keys
# Keys named in DATASET_ORDER come first (in that order); every remaining prep2 key follows.
ordered_keys = [k for k in DATASET_ORDER if k in prep2] + [k for k in prep2.keys() if k not in DATASET_ORDER]

tuned_rows = []     # one metrics row per (dataset, model), used to build tuned_df
saved_paths = {}    # dataset -> {model name -> path of its pickle, as a string}

# For each dataset: run a randomized search per model family, evaluate the best
# estimator on the held-out split, and pickle it under RUN_DIR/<dataset>/.
for ds in ordered_keys:
    pack = prep2[ds]
    Xtr_tree, Xte_tree = pack["Xtr_tree"], pack["Xte_tree"]   # DT/RF/XGB
    Xtr_lin,  Xte_lin  = pack["Xtr_lin"],  pack["Xte_lin"]    # LR/SVC
    ytr, yte = pack["ytr"], pack["yte"]
    binary = _is_binary(ytr)
    n_classes = (2 if binary else pd.Series(ytr).nunique())

    print(f"\n===== {ds} — Step 2.3 (balanced tuning for all models) =====")

    # Optional: shrink floats to save RAM (safe no-op for non-float arrays)
    # NOTE(review): the converted arrays are written back into prep2's entry, so later
    # cells (or a re-run of earlier ones) see float32 data. The bare `except` also hides
    # any conversion failure.
    for arr_name in ["Xtr_tree","Xte_tree","Xtr_lin","Xte_lin"]:
        try: pack[arr_name] = pack[arr_name].astype(np.float32, copy=False)
        except: pass
    Xtr_tree, Xte_tree, Xtr_lin, Xte_lin = pack["Xtr_tree"], pack["Xte_tree"], pack["Xtr_lin"], pack["Xte_lin"]

    # One shuffled, seeded splitter shared by all searches for this dataset.
    cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=42)
    ds_dir = RUN_DIR / ds
    ds_dir.mkdir(parents=True, exist_ok=True)
    saved_paths[ds] = {}

    # --- Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt_search = RandomizedSearchCV(dt, DT_GRID, n_iter=12, scoring=PRIMARY_SCORING, cv=cv, n_jobs=N_JOBS_CV, refit=True, random_state=42)
    dt_search.fit(Xtr_tree, ytr)
    # Refit best on full train (already refit=True, but do once more for clarity)
    # NOTE(review): the same extra fit is repeated for RF, XGB and LR below; with
    # refit=True the search has already fitted best_estimator_ on this data.
    dt_best = dt_search.best_estimator_
    dt_best.fit(Xtr_tree, ytr)
    met = _evaluate(dt_best, Xte_tree, yte, binary)
    with open(ds_dir / "DT.pkl", "wb") as f: pickle.dump(dt_best, f)
    tuned_rows.append({"dataset": ds, "model": "DT", **met})
    saved_paths[ds]["DT"] = str(ds_dir / "DT.pkl")

    # --- Random Forest
    rf = RandomForestClassifier(random_state=42, n_jobs=-1, class_weight="balanced_subsample")
    rf_search = RandomizedSearchCV(rf, RF_GRID, n_iter=14, scoring=PRIMARY_SCORING, cv=cv, n_jobs=N_JOBS_CV, refit=True, random_state=42)
    rf_search.fit(Xtr_tree, ytr)
    rf_best = rf_search.best_estimator_
    rf_best.fit(Xtr_tree, ytr)
    met = _evaluate(rf_best, Xte_tree, yte, binary)
    with open(ds_dir / "RF.pkl", "wb") as f: pickle.dump(rf_best, f)
    tuned_rows.append({"dataset": ds, "model": "RF", **met})
    saved_paths[ds]["RF"] = str(ds_dir / "RF.pkl")

    # --- XGBoost
    # Objective and num_class switch between the binary and multi-class setups.
    xgbc = xgb.XGBClassifier(
        tree_method="hist",
        n_estimators=400,
        eval_metric="logloss",
        objective=("binary:logistic" if binary else "multi:softprob"),
        num_class=(None if binary else n_classes),
        random_state=42,
        n_jobs=-1
    )
    xgb_search = RandomizedSearchCV(xgbc, XGB_GRID, n_iter=14, scoring=PRIMARY_SCORING, cv=cv, n_jobs=N_JOBS_CV, refit=True, random_state=42)
    xgb_search.fit(Xtr_tree, ytr)
    xgb_best = xgb_search.best_estimator_
    xgb_best.fit(Xtr_tree, ytr)
    met = _evaluate(xgb_best, Xte_tree, yte, binary)
    with open(ds_dir / "XGB.pkl", "wb") as f: pickle.dump(xgb_best, f)
    tuned_rows.append({"dataset": ds, "model": "XGB", **met})
    saved_paths[ds]["XGB"] = str(ds_dir / "XGB.pkl")

    # --- Logistic Regression (saga)
    # NOTE(review): LR_GRID pairs every penalty with every l1_ratio — presumably
    # l1_ratio only matters for "elasticnet"; confirm against the sklearn docs.
    lr = LogisticRegression(max_iter=3000, solver="saga", class_weight="balanced", n_jobs=-1)
    lr_search = RandomizedSearchCV(lr, LR_GRID, n_iter=12, scoring=PRIMARY_SCORING, cv=cv, n_jobs=N_JOBS_CV, refit=True, random_state=42)
    lr_search.fit(Xtr_lin, ytr)
    lr_best = lr_search.best_estimator_
    lr_best.fit(Xtr_lin, ytr)
    met = _evaluate(lr_best, Xte_lin, yte, binary)
    with open(ds_dir / "LR.pkl", "wb") as f: pickle.dump(lr_best, f)
    tuned_rows.append({"dataset": ds, "model": "LR", **met})
    saved_paths[ds]["LR"] = str(ds_dir / "LR.pkl")

    # --- SVC (balanced handling)
    # Tune on (optionally) capped set for speed, but refit best on FULL train
    # The cap draws SVC_TRAIN_CAP rows without replacement using a fixed seed.
    if SVC_TRAIN_CAP and len(ytr) > SVC_TRAIN_CAP:
        rng = np.random.RandomState(42)
        idx = rng.choice(len(ytr), size=SVC_TRAIN_CAP, replace=False)
        Xtr_svc_tune, ytr_svc_tune = Xtr_lin[idx], ytr.iloc[idx]
    else:
        Xtr_svc_tune, ytr_svc_tune = Xtr_lin, ytr

    # The search estimator uses probability=False; only the chosen parameters are kept.
    svc_tune = SVC(kernel="rbf", probability=False, class_weight="balanced", random_state=42)
    svc_search = RandomizedSearchCV(svc_tune, SVC_GRID, n_iter=8, scoring=PRIMARY_SCORING, cv=cv, n_jobs=N_JOBS_CV, refit=True, random_state=42)
    svc_search.fit(Xtr_svc_tune, ytr_svc_tune)
    best_params = svc_search.best_params_

    # Refit once on FULL train with probability=True (for ROC/PR + saving)
    svc_best = SVC(kernel="rbf", probability=True, class_weight="balanced", random_state=42, **best_params)
    svc_best.fit(Xtr_lin, ytr)
    met = _evaluate(svc_best, Xte_lin, yte, binary)
    with open(ds_dir / "SVC.pkl", "wb") as f: pickle.dump(svc_best, f)
    tuned_rows.append({"dataset": ds, "model": "SVC", **met})
    saved_paths[ds]["SVC"] = str(ds_dir / "SVC.pkl")


# Results table; within each dataset rows are ordered best-first by weighted F1, then accuracy.
tuned_df = pd.DataFrame(tuned_rows)
display(tuned_df.sort_values(["dataset", "f1_weighted", "accuracy"], ascending=[True, False, False]))

# —— THEN SAVE REGISTRY + BEST-MODEL ALIASES
reg_path = RUN_DIR / "tuned_registry.csv"
tuned_df.to_csv(reg_path, index=False)

# For every dataset, copy the pickle of its top-ranked model to <dataset>/best_model.pkl.
# NOTE(review): the ranking uses the held-out metrics stored in tuned_df.
for ds in ordered_keys:
    df_ds = tuned_df[tuned_df["dataset"] == ds]
    if len(df_ds) == 0:
        continue
    best_row = df_ds.sort_values(["f1_weighted", "accuracy"], ascending=False).iloc[0]
    src = Path(saved_paths[ds][best_row["model"]])
    dst = RUN_DIR / ds / "best_model.pkl"
    # Byte-for-byte copy of the winning pickle.
    with open(src, "rb") as s, open(dst, "wb") as d:
        d.write(s.read())
    print(f"[{ds}] Best tuned model: {best_row['model']} (F1_w={best_row['f1_weighted']:.4f}) -> {dst}")

print(f"\n Tuned models & registry saved under: {RUN_DIR}")



===== uci_android_permissions.csv.gz — Step 2.3 (balanced tuning for all models) =====

===== uci_indian_liver.csv.gz — Step 2.3 (balanced tuning for all models) =====

===== uci_mushroom.csv.gz — Step 2.3 (balanced tuning for all models) =====

===== uci_phishing_url.csv.gz — Step 2.3 (balanced tuning for all models) =====

===== uci_secondary_mushroom.csv.gz — Step 2.3 (balanced tuning for all models) =====


Unnamed: 0,dataset,model,accuracy,f1_weighted,f1_macro,roc_auc,pr_auc
1,uci_android_permissions.csv.gz,RF,0.96949,0.96949,0.96949,0.99281,0.993838
2,uci_android_permissions.csv.gz,XGB,0.967445,0.967444,0.967445,0.993755,0.994164
4,uci_android_permissions.csv.gz,SVC,0.965229,0.965229,0.965229,0.989224,0.990454
0,uci_android_permissions.csv.gz,DT,0.964036,0.964033,0.964034,0.987065,0.985395
3,uci_android_permissions.csv.gz,LR,0.957559,0.957559,0.957559,0.988055,0.989121
8,uci_indian_liver.csv.gz,LR,0.726496,0.739057,0.713411,0.832034,0.936473
6,uci_indian_liver.csv.gz,RF,0.752137,0.735922,0.663693,0.809709,0.922406
9,uci_indian_liver.csv.gz,SVC,0.692308,0.706289,0.679939,0.843373,0.936599
7,uci_indian_liver.csv.gz,XGB,0.717949,0.684041,0.586307,0.716867,0.878844
5,uci_indian_liver.csv.gz,DT,0.57265,0.568718,0.472403,0.477321,0.700585


[uci_android_permissions.csv.gz] Best tuned model: RF (F1_w=0.9695) -> Model_Tuned_data/20251014-150304/uci_android_permissions.csv.gz/best_model.pkl
[uci_indian_liver.csv.gz] Best tuned model: LR (F1_w=0.7391) -> Model_Tuned_data/20251014-150304/uci_indian_liver.csv.gz/best_model.pkl
[uci_mushroom.csv.gz] Best tuned model: RF (F1_w=1.0000) -> Model_Tuned_data/20251014-150304/uci_mushroom.csv.gz/best_model.pkl
[uci_phishing_url.csv.gz] Best tuned model: SVC (F1_w=0.9994) -> Model_Tuned_data/20251014-150304/uci_phishing_url.csv.gz/best_model.pkl
[uci_secondary_mushroom.csv.gz] Best tuned model: RF (F1_w=0.9999) -> Model_Tuned_data/20251014-150304/uci_secondary_mushroom.csv.gz/best_model.pkl

 Tuned models & registry saved under: Model_Tuned_data/20251014-150304


In [17]:
# Pick the best tuned model per dataset and export
# Uses in-memory tuned_df (or tuned_rows)

from pathlib import Path
import time, pickle
import pandas as pd
import numpy as np

# 1) Get / rebuild tuned_df
# A non-empty DataFrame already living in the kernel is used as-is; failing that, the
# table is rebuilt from the raw result rows; with neither available the cell stops.
if 'tuned_df' not in globals() or not isinstance(tuned_df, pd.DataFrame) or not len(tuned_df):
    if 'tuned_rows' not in globals() or not tuned_rows:
        raise RuntimeError("No tuned_df or tuned_rows found. Please run your Step 2.3 cell first.")
    tuned_df = pd.DataFrame(tuned_rows)
    print("[info] Rebuilt 'tuned_df' from tuned_rows.")
else:
    print("[info] Using in-memory 'tuned_df'.")

# 2) Resolve columns (robust to slight naming differences)
def _col(df, options):
    low = {c.lower(): c for c in df.columns}
    for o in options:
        if o in low: return low[o]
    raise KeyError(f"Missing expected columns among: {options}")

# Required columns: a missing one raises KeyError from _col and stops the cell.
c_ds = _col(tuned_df, ["dataset"])
c_m = _col(tuned_df, ["model"])
c_f1 = _col(tuned_df, ["f1_weighted", "f1", "f1_w"])
c_acc = _col(tuned_df, ["accuracy", "acc"])

# Optional column: when no path column is present, c_path is None.
try:
    c_path = _col(tuned_df, ["path", "pickle_path"])
except KeyError:
    c_path = None

# 3) Pick winner per dataset (primary = f1_weighted, tie-break = accuracy)
# c_f1, c_acc and c_path (when not None) are labels taken from tuned_df's own columns,
# so every group already contains them; no per-group column back-filling is needed and
# the groups are only read, never modified.
WINNER_COLUMNS = ["dataset", "best_model_2_3", "best_f1_2_3", "best_acc_2_3", "path_hint"]

winners = []
for ds, g in tuned_df.groupby(c_ds):
    # Descending sort on (F1, accuracy): the first row is the best model of the group.
    # pandas places NaN scores last by default, so they never outrank a real score.
    r = g.sort_values([c_f1, c_acc], ascending=False).iloc[0]
    winners.append({
        "dataset": ds,
        "best_model_2_3": r[c_m],
        "best_f1_2_3": r[c_f1],
        "best_acc_2_3": r[c_acc],
        "path_hint": r[c_path] if c_path is not None else None,
    })

# Passing the columns explicitly keeps the frame well-formed (and sortable) even when
# no group produced a winner.
winners_tuned_df = pd.DataFrame(winners, columns=WINNER_COLUMNS).sort_values("dataset")
display(winners_tuned_df)

# 4) Export winners only to a clean export folder
# Layout: model_BEST_exports/<YYYYmmdd-HHMMSS of this run>/tuned
EXPORT_DIR = Path("model_BEST_exports", time.strftime("%Y%m%d-%H%M%S"), "tuned")
EXPORT_DIR.mkdir(exist_ok=True, parents=True)

# Helper to resolve a source pickle path for (ds, model)
def _resolve_src(ds, mdl):
    # 4.1) saved_paths from your tuning cell
    if 'saved_paths' in globals() and isinstance(saved_paths, dict):
        p = saved_paths.get(ds, {}).get(mdl, None)
        if isinstance(p, str) and Path(p).exists():
            return Path(p)
    # 4.2) RUN_DIR/<ds>/<Model>.pkl
    if 'RUN_DIR' in globals():
        cand = Path(RUN_DIR) / ds / f"{mdl}.pkl"
        if cand.exists():
            return cand
        # 4.3) fallback to best_model.pkl in the same folder
        alt = Path(RUN_DIR) / ds / "best_model.pkl"
        if alt.exists():
            return alt
    # 4.4) final fallback: any path hint stored in the registry row
    row = winners_tuned_df[winners_tuned_df["dataset"] == ds].iloc[0]
    ph = row.get("path_hint", None)
    if isinstance(ph, str) and Path(ph).exists():
        return Path(ph)
    return None

# Copy winners
import shutil  # stdlib file copy, needed only for this export step

missing = []
for ds, mdl in zip(winners_tuned_df["dataset"], winners_tuned_df["best_model_2_3"]):
    src = _resolve_src(ds, mdl)
    if src is None:
        # Nothing found on disk for this winner; reported in the summary below.
        missing.append((ds, mdl))
        continue
    out_dir = EXPORT_DIR / ds
    out_dir.mkdir(parents=True, exist_ok=True)
    dst = out_dir / "best_tuned.pkl"
    # copyfile copies the bytes without holding the whole pickle in memory, and raises
    # SameFileError instead of truncating the file when src and dst coincide.
    shutil.copyfile(src, dst)
    print(f"[{ds}] saved -> {dst} ({mdl})")

if missing:
    print("\n Skipped (couldn't locate pickle on disk):")
    for ds, mdl in missing:
        print(f" - {ds} / {mdl}  (set RUN_DIR properly or populate saved_paths)")

print(f"\n Tuned winners exported to: {EXPORT_DIR}")

# 5) Quick tally: which tuned model wins most often
# One row per winning model ("model") with the number of datasets it won ("wins").
print("\nWinner count by model (tuned 2.3):")
display(
    winners_tuned_df["best_model_2_3"]
    .value_counts()
    .rename("wins")
    .rename_axis("model")
    .reset_index()
)


[info] Using in-memory 'tuned_df'.


Unnamed: 0,dataset,best_model_2_3,best_f1_2_3,best_acc_2_3,path_hint
0,uci_android_permissions.csv.gz,RF,0.96949,0.96949,
1,uci_indian_liver.csv.gz,LR,0.739057,0.726496,
2,uci_mushroom.csv.gz,RF,1.0,1.0,
3,uci_phishing_url.csv.gz,SVC,0.999449,0.999449,
4,uci_secondary_mushroom.csv.gz,RF,0.999918,0.999918,


[uci_android_permissions.csv.gz] saved -> model_BEST_exports/20251014-182853/tuned/uci_android_permissions.csv.gz/best_tuned.pkl (RF)
[uci_indian_liver.csv.gz] saved -> model_BEST_exports/20251014-182853/tuned/uci_indian_liver.csv.gz/best_tuned.pkl (LR)
[uci_mushroom.csv.gz] saved -> model_BEST_exports/20251014-182853/tuned/uci_mushroom.csv.gz/best_tuned.pkl (RF)
[uci_phishing_url.csv.gz] saved -> model_BEST_exports/20251014-182853/tuned/uci_phishing_url.csv.gz/best_tuned.pkl (SVC)
[uci_secondary_mushroom.csv.gz] saved -> model_BEST_exports/20251014-182853/tuned/uci_secondary_mushroom.csv.gz/best_tuned.pkl (RF)

 Tuned winners exported to: model_BEST_exports/20251014-182853/tuned

Winner count by model (tuned 2.3):


Unnamed: 0,model,wins
0,RF,3
1,LR,1
2,SVC,1
