<a href="https://colab.research.google.com/github/JoshuaGottlieb/SHAP-Feature-Selection/blob/main/5_datasets_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
up = files.upload()   # select folder.zip
!unzip folder.zip -d /content/dataset


Saving uci_android_permissions.csv.gz to uci_android_permissions.csv.gz
Saving uci_indian_liver.csv.gz to uci_indian_liver.csv.gz
Saving uci_mushroom.csv.gz to uci_mushroom.csv.gz
Saving uci_phishing_url.csv.gz to uci_phishing_url.csv.gz
Saving uci_secondary_mushroom.csv.gz to uci_secondary_mushroom.csv.gz
unzip:  cannot find or open folder.zip, folder.zip.zip or folder.zip.ZIP.


In [2]:
import pandas as pd

# List of dataset filenames
files = [
    "uci_android_permissions.csv.gz",
    "uci_indian_liver.csv.gz",
    "uci_mushroom.csv.gz",
    "uci_phishing_url.csv.gz",
    "uci_secondary_mushroom.csv.gz"
]

# Dictionary to store DataFrames
datasets = {}

# Load and show first 5 rows of each
for f in files:
    print(f"\n===== {f} =====")
    df = pd.read_csv(f)
    datasets[f] = df   # save in dictionary
    print(df.head())   # display first 5 rows
    print("-" * 60)



===== uci_android_permissions.csv.gz =====
   android.permission.get_accounts  \
0                                0   
1                                0   
2                                0   
3                                0   
4                                0   

   com.sonyericsson.home.permission.broadcast_badge  \
0                                                 0   
1                                                 0   
2                                                 0   
3                                                 0   
4                                                 0   

   android.permission.read_profile  android.permission.manage_accounts  \
0                                0                                   0   
1                                0                                   0   
2                                0                                   0   
3                                0                                   0   
4                         

In [3]:
# Check for missing values in each dataset
for name, df in datasets.items():
    print(f"\n===== Missing values in {name} =====")
    print(df.isnull().sum())
    print("-" * 60)



===== Missing values in uci_android_permissions.csv.gz =====
android.permission.get_accounts                                           0
com.sonyericsson.home.permission.broadcast_badge                          0
android.permission.read_profile                                           0
android.permission.manage_accounts                                        0
android.permission.write_sync_settings                                    0
                                                                         ..
com.google.android.finsky.permission.bind_get_install_referrer_service    0
com.huawei.android.launcher.permission.read_settings                      0
android.permission.read_sms                                               0
android.permission.process_incoming_calls                                 0
result                                                                    0
Length: 87, dtype: int64
------------------------------------------------------------

===== Missing v

In [4]:
# Function to drop unnecessary columns
def drop_noisy_columns(df):
    # Drop "Unnamed" or index-like columns
    noisy = [col for col in df.columns if "Unnamed" in col or "file_name" in col]
    return df.drop(columns=noisy, errors="ignore")

# Apply cleaning + check missingness
for name, df in datasets.items():
    print(f"\n===== {name} =====")
    df = drop_noisy_columns(df)
    datasets[name] = df   # save cleaned version

    print("Shape:", df.shape)
    print("Missing values per column:")
    print(df.isnull().sum()[df.isnull().sum() > 0])
    print("-" * 60)


===== uci_android_permissions.csv.gz =====
Shape: (29332, 87)
Missing values per column:
Series([], dtype: int64)
------------------------------------------------------------

===== uci_indian_liver.csv.gz =====
Shape: (583, 11)
Missing values per column:
almumin_globulin_ratio    4
dtype: int64
------------------------------------------------------------

===== uci_mushroom.csv.gz =====
Shape: (8124, 23)
Missing values per column:
Series([], dtype: int64)
------------------------------------------------------------

===== uci_phishing_url.csv.gz =====
Shape: (235795, 55)
Missing values per column:
Series([], dtype: int64)
------------------------------------------------------------

===== uci_secondary_mushroom.csv.gz =====
Shape: (61069, 21)
Missing values per column:
cap_surface          14120
gill_attachment       9884
gill_spacing         25063
stem_root            51538
stem_surface         38124
veil_type            57892
veil_color           53656
ring_type             2471
sp

In [6]:
def clean_missing_values_safe(df, threshold=0.8, name=""):
    """
    - Drops columns with > threshold missingness
    - Imputes numeric with median; categorical with mode
    - Avoids chained assignment warnings
    - Returns cleaned copy and list of dropped columns
    """
    df = df.copy()

    miss_frac = df.isnull().mean()
    dropped = miss_frac[miss_frac > threshold].index.tolist()
    if dropped:
        print(f"[{name}] Dropping (> {int(threshold*100)}% missing): {dropped}")
        df = df.drop(columns=dropped, errors="ignore")

    # Impute remaining columns
    for col in df.columns:
        if df[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df[col]):
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna(df[col].mode().iloc[0])

    return df, dropped

# Re-run safe cleaner and store results
cleaned = {}
dropped_report = {}

for name, df in datasets.items():
    out, dropped_cols = clean_missing_values_safe(df, threshold=0.8, name=name)
    cleaned[name] = out
    dropped_report[name] = dropped_cols
    print(f"\n===== {name} (safe) =====")
    print("Shape:", out.shape, "| Remaining NA:", int(out.isnull().sum().sum()))
    print("-"*60)

# Optional: inspect what was dropped where
dropped_report


===== uci_android_permissions.csv.gz (safe) =====
Shape: (29332, 87) | Remaining NA: 0
------------------------------------------------------------

===== uci_indian_liver.csv.gz (safe) =====
Shape: (583, 11) | Remaining NA: 0
------------------------------------------------------------

===== uci_mushroom.csv.gz (safe) =====
Shape: (8124, 23) | Remaining NA: 0
------------------------------------------------------------

===== uci_phishing_url.csv.gz (safe) =====
Shape: (235795, 55) | Remaining NA: 0
------------------------------------------------------------

===== uci_secondary_mushroom.csv.gz (safe) =====
Shape: (61069, 17) | Remaining NA: 0
------------------------------------------------------------


{'uci_android_permissions.csv.gz': [],
 'uci_indian_liver.csv.gz': [],
 'uci_mushroom.csv.gz': [],
 'uci_phishing_url.csv.gz': [],
 'uci_secondary_mushroom.csv.gz': []}

In [9]:

X_liver, y_liver = encode_dataset(
    cleaned["uci_indian_liver.csv.gz"],
    TARGETS["uci_indian_liver.csv.gz"],
    DROP_COLS.get("uci_indian_liver.csv.gz", [])
)


  if (y.dtype == "object" or pd.api.types.is_categorical_dtype(y)) and y.nunique() == 2:


In [17]:
from sklearn import __version__ as skl_version

def build_preprocessor_streaming(X_sample):
    """Preprocessor for BIG datasets: impute + one-hot + scale (sparse)."""
    cat = [c for c in X_sample.columns if X_sample[c].dtype == "object"]
    num = [c for c in X_sample.columns if X_sample[c].dtype != "object"]

    # Version-aware OneHotEncoder
    if tuple(map(int, skl_version.split(".")[:2])) >= (1, 6):
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    else:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

    pre = ColumnTransformer(
        [
            ("num",  SimpleImputer(strategy="median"), num),
            ("cat",  ohe, cat),
        ],
        sparse_threshold=1.0
    )
    pre.fit(X_sample)
    return pre


In [7]:
# Baseline modeling for 5 datasets
# - Small (<= ROW_THRESHOLD): in-memory RF, XGB, Logistic (with imputation + OHE)
# - Big    (> ROW_THRESHOLD): chunked SGD & Passive-Aggressive (+ small XGB cap)
!pip -q install xgboost

import pandas as pd, numpy as np, warnings
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import xgboost as xgb
from scipy import sparse
warnings.filterwarnings("ignore")

# ---------------- Config ----------------
RS = 42
ROW_THRESHOLD = 100_000   # only chunk when rows > this
SAMPLE = 5_000            # rows to fit encoders / holdout for BIG
CHUNK  = 50_000           # chunk size for streaming
XGB_CAP = 60_000          # rows for small XGB on BIG datasets

TARGETS = {
    "uci_android_permissions.csv.gz": "result",
    "uci_indian_liver.csv.gz":        "has_liver_disease",
    "uci_mushroom.csv.gz":            "poisonous",
    "uci_phishing_url.csv.gz":        "label",
    "uci_secondary_mushroom.csv.gz":  "class",
}
DROP_COLS = {
    "uci_android_permissions.csv.gz": [],
    "uci_indian_liver.csv.gz":        [],
    "uci_mushroom.csv.gz":            [],
    "uci_phishing_url.csv.gz":        ["url", "title"],  # drop free-text
    "uci_secondary_mushroom.csv.gz":  [],
}

# ---------------- Helpers ----------------
def count_rows(csv_path, target_col):
    total = 0
    for ch in pd.read_csv(csv_path, usecols=[target_col], chunksize=200_000, low_memory=False):
        total += len(ch)
    return total

def discover_classes(csv_path, target):
    seen = set()
    for ch in pd.read_csv(csv_path, usecols=[target], chunksize=200_000, low_memory=False):
        seen.update(ch[target].dropna().unique().tolist())
        if len(seen) > 1:
            break
    return sorted(seen)

def make_label_map(classes):
    # deterministic 0..k-1 mapping (handles string labels like 'e'/'p')
    return {lab: i for i, lab in enumerate(classes)}

def split_Xy(df, target, drop_cols=None, label_map=None):
    df = df.drop(columns=[c for c in (drop_cols or []) if c in df.columns], errors="ignore")
    y = df[target]
    X = df.drop(columns=[target])
    if label_map is not None:
        y = y.map(label_map)
    return X, y.astype(int)

def build_preprocessor_small(X_df):
    """
    SMALL (in-memory): median-impute + OneHot (version-agnostic).
    Output may be sparse or dense depending on sklearn internals; we handle both.
    """
    cat = [c for c in X_df.columns if X_df[c].dtype == "object"]
    num = [c for c in X_df.columns if X_df[c].dtype != "object"]
    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
        ],
        sparse_threshold=1.0
    )
    pre.fit(X_df)
    return pre

def build_preprocessor_streaming(X_sample):
    """
    BIG (chunked): median-impute + OneHot (version-agnostic).
    Keep sparse path friendly for incremental learners and XGBoost DMatrix.
    """
    cat = [c for c in X_sample.columns if X_sample[c].dtype == "object"]
    num = [c for c in X_sample.columns if X_sample[c].dtype != "object"]
    pre = ColumnTransformer(
        transformers=[
            ("num", SimpleImputer(strategy="median"), num),
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
        ],
        sparse_threshold=1.0
    )
    pre.fit(X_sample)
    return pre

def to_dense_if_sparse(X):
    return X.toarray() if sparse.issparse(X) else X

def safe_auc(y_true, proba):
    if proba is None: return None
    y_true = pd.Series(y_true)
    return roc_auc_score(y_true, proba) if y_true.nunique() == 2 else None

# ---------------- Main ----------------
results = []

for name, target in TARGETS.items():
    print(f"\n===== {name} =====")
    n_rows  = count_rows(name, target)
    classes = discover_classes(name, target)
    if len(classes) < 2:
        print(f"Skip: single-class target ({classes})")
        results.append({"dataset": name, "note": "single-class; skipped"})
        continue

    label_map = make_label_map(classes)
    is_big = n_rows > ROW_THRESHOLD
    print(f"Rows={n_rows:,} → {'BIG (chunked)' if is_big else 'small (in-memory)'}; classes={classes}")

    if not is_big:
        # ---------- SMALL (in-memory) ----------
        df = pd.read_csv(name, low_memory=False)
        X_raw, y = split_Xy(df, target, DROP_COLS.get(name, []), label_map)

        # Impute + OneHot
        pre = build_preprocessor_small(X_raw)
        X = pre.transform(X_raw)

        # RF needs dense; LR/XGB accept sparse but dense is fine for small sets
        X_dense = to_dense_if_sparse(X)

        Xtr, Xte, ytr, yte = train_test_split(
            X_dense, y, test_size=0.2,
            stratify=y if y.nunique()==2 else None,
            random_state=RS
        )

        # Random Forest
        rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RS)
        rf.fit(Xtr, ytr)
        p = rf.predict(Xte); pro = rf.predict_proba(Xte)[:,1] if y.nunique()==2 else None
        rf_acc = accuracy_score(yte,p); rf_f1 = f1_score(yte,p, average="binary" if y.nunique()==2 else "macro")
        rf_auc = safe_auc(yte, pro)

        # XGBoost (sklearn API; dense OK; you could also pass sparse)
        xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_jobs=-1, random_state=RS)
        xgb_model.fit(Xtr, ytr)
        p = xgb_model.predict(Xte); pro = xgb_model.predict_proba(Xte)[:,1] if y.nunique()==2 else None
        xgb_acc = accuracy_score(yte,p); xgb_f1 = f1_score(yte,p, average="binary" if y.nunique()==2 else "macro")
        xgb_auc = safe_auc(yte, pro)

        # Logistic Regression
        lr = LogisticRegression(max_iter=1000, n_jobs=-1)
        lr.fit(Xtr, ytr)
        p = lr.predict(Xte); pro = lr.predict_proba(Xte)[:,1] if y.nunique()==2 else None
        lr_acc = accuracy_score(yte,p); lr_f1 = f1_score(yte,p, average="binary" if y.nunique()==2 else "macro")
        lr_auc = safe_auc(yte, pro)

        results.append({
            "dataset": name,
            "RF_acc": rf_acc, "RF_f1": rf_f1, "RF_auc": rf_auc,
            "XGB_acc": xgb_acc, "XGB_f1": xgb_f1, "XGB_auc": xgb_auc,
            "LR_acc": lr_acc, "LR_f1": lr_f1, "LR_auc": lr_auc,
        })
        print(f"RF  acc={rf_acc:.3f} f1={rf_f1:.3f}" + (f" auc={rf_auc:.3f}" if rf_auc is not None else ""))
        print(f"XGB acc={xgb_acc:.3f} f1={xgb_f1:.3f}" + (f" auc={xgb_auc:.3f}" if xgb_auc is not None else ""))
        print(f"LR  acc={lr_acc:.3f} f1={lr_f1:.3f}" + (f" auc={lr_auc:.3f}" if lr_auc is not None else ""))

    else:
        # ---------- BIG (chunked) ----------
        # Fit preprocessor on small sample
        samp = pd.read_csv(name, nrows=SAMPLE, low_memory=False)
        Xs_raw, ys = split_Xy(samp, target, DROP_COLS.get(name, []), label_map)
        pre = build_preprocessor_streaming(Xs_raw)

        # Holdout = next SAMPLE rows (fallback to split if file too small)
        try:
            hold = pd.read_csv(name, skiprows=range(1, SAMPLE+1), nrows=SAMPLE, low_memory=False)
            Xh_raw, yh = split_Xy(hold, target, DROP_COLS.get(name, []), label_map)
            Xh = pre.transform(Xh_raw)   # sparse or dense; OK
        except Exception:
            base = pd.read_csv(name, nrows=2*SAMPLE, low_memory=False)
            Xb_raw, yb = split_Xy(base, target, DROP_COLS.get(name, []), label_map)
            Xtr_raw, Xh_raw, ytr_tmp, yh = train_test_split(
                Xb_raw, yb, test_size=0.3, stratify=yb if yb.nunique()==2 else None, random_state=RS
            )
            pre = build_preprocessor_streaming(Xtr_raw)
            Xh = pre.transform(Xh_raw)

        # Incremental models
        sgd = SGDClassifier(loss="log_loss", random_state=RS)
        pac = PassiveAggressiveClassifier(random_state=RS)

        # Warm-start to register classes
        sgd.partial_fit(Xh, yh, classes=np.array(list(label_map.values())))
        pac.partial_fit(Xh, yh, classes=np.array(list(label_map.values())))

        # Stream remaining rows safely (don’t skip past EOF)
        def safe_reader(csv_path, target_col, start_after, chunksize):
            total = count_rows(csv_path, target_col)
            start = min(start_after, max(total-1, 0))
            if start <= 0:
                return pd.read_csv(csv_path, chunksize=chunksize, low_memory=False)
            return pd.read_csv(csv_path, chunksize=chunksize, low_memory=False,
                               skiprows=range(1, start+1))

        reader = safe_reader(name, target, start_after=2*SAMPLE, chunksize=CHUNK)
        for ch in reader:
            if ch is None or len(ch) == 0:
                continue
            Xc_raw, yc = split_Xy(ch, target, DROP_COLS.get(name, []), label_map)
            if len(Xc_raw) == 0:
                continue
            Xc = pre.transform(Xc_raw)
            sgd.partial_fit(Xc, yc)
            pac.partial_fit(Xc, yc)

        # Evaluate on holdout
        p = sgd.predict(Xh)
        try:
            pro = sgd.predict_proba(Xh)[:, 1]
        except Exception:
            pro = 1 / (1 + np.exp(-sgd.decision_function(Xh)))
        sgd_acc = accuracy_score(yh, p)
        sgd_f1  = f1_score(yh, p, average="binary" if np.unique(yh).size==2 else "macro")
        sgd_auc = safe_auc(yh, pro)

        p2 = pac.predict(Xh)
        z2 = pac.decision_function(Xh)
        pro2 = 1 / (1 + np.exp(-z2)) if np.ndim(z2) == 1 else None
        pac_acc = accuracy_score(yh, p2)
        pac_f1  = f1_score(yh, p2, average="binary" if np.unique(yh).size==2 else "macro")
        pac_auc = safe_auc(yh, pro2) if pro2 is not None else None

        row = {
            "dataset": name,
            "SGD_acc": sgd_acc, "SGD_f1": sgd_f1, "SGD_auc": sgd_auc,
            "PAC_acc": pac_acc, "PAC_f1": pac_f1, "PAC_auc": pac_auc,
        }
        print(f"SGD acc={sgd_acc:.3f} f1={sgd_f1:.3f}" + (f" auc={sgd_auc:.3f}" if sgd_auc is not None else ""))
        print(f"PAC acc={pac_acc:.3f} f1={pac_f1:.3f}" + (f" auc={pac_auc:.3f}" if pac_auc is not None else ""))

        # Optional small XGBoost on capped rows (reuse same preprocessor)
        try:
            small = pd.read_csv(name, nrows=XGB_CAP, low_memory=False)
            Xsm_raw, ysm = split_Xy(small, target, DROP_COLS.get(name, []), label_map)
            Xsm = pre.transform(Xsm_raw)
            dtr = xgb.DMatrix(Xsm, label=ysm.values)   # accepts CSR/DOK/COO
            params = {"objective": "binary:logistic", "eval_metric": "auc", "seed": RS}
            # params["tree_method"] = "gpu_hist"  # uncomment if GPU available
            xgbm = xgb.train(params, dtr, num_boost_round=200)
            dho = xgb.DMatrix(Xh)
            pro_xgb = xgbm.predict(dho)
            pred_xgb = (pro_xgb >= 0.5).astype(int)
            row.update({
                "XGB_acc": accuracy_score(yh, pred_xgb),
                "XGB_f1":  f1_score(yh, pred_xgb, average="binary" if np.unique(yh).size==2 else "macro"),
                "XGB_auc": safe_auc(yh, pro_xgb),
            })
            print(f"XGB acc={row['XGB_acc']:.3f} f1={row['XGB_f1']:.3f}" + (f" auc={row['XGB_auc']:.3f}" if row['XGB_auc'] is not None else ""))
        except Exception as e:
            row.update({"XGB_acc": None, "XGB_f1": None, "XGB_auc": None})
            print(f"[XGB skipped] {e}")

        results.append(row)

# ---------------- Summary ----------------
summary = pd.DataFrame(results)
display(summary)



===== uci_android_permissions.csv.gz =====
Rows=29,332 → small (in-memory); classes=[0, 1]
RF  acc=0.970 f1=0.970 auc=0.993
XGB acc=0.967 f1=0.967 auc=0.994
LR  acc=0.957 f1=0.957 auc=0.988

===== uci_indian_liver.csv.gz =====
Rows=583 → small (in-memory); classes=[0, 1]
RF  acc=0.735 f1=0.832 auc=0.755
XGB acc=0.701 f1=0.807 auc=0.728
LR  acc=0.735 f1=0.836 auc=0.822

===== uci_mushroom.csv.gz =====
Rows=8,124 → small (in-memory); classes=['e', 'p']
RF  acc=1.000 f1=1.000 auc=1.000
XGB acc=1.000 f1=1.000 auc=1.000
LR  acc=0.999 f1=0.999 auc=1.000

===== uci_phishing_url.csv.gz =====
Rows=235,795 → BIG (chunked); classes=[0, 1]
SGD acc=0.965 f1=0.970 auc=0.960
PAC acc=0.994 f1=0.994 auc=1.000
XGB acc=1.000 f1=1.000 auc=1.000

===== uci_secondary_mushroom.csv.gz =====
Rows=61,069 → small (in-memory); classes=['e', 'p']
RF  acc=1.000 f1=1.000 auc=1.000
XGB acc=1.000 f1=1.000 auc=1.000
LR  acc=0.865 f1=0.878 auc=0.937


Unnamed: 0,dataset,RF_acc,RF_f1,RF_auc,XGB_acc,XGB_f1,XGB_auc,LR_acc,LR_f1,LR_auc,SGD_acc,SGD_f1,SGD_auc,PAC_acc,PAC_f1,PAC_auc
0,uci_android_permissions.csv.gz,0.969661,0.969604,0.992672,0.967275,0.967146,0.993537,0.957218,0.957407,0.988019,,,,,,
1,uci_indian_liver.csv.gz,0.735043,0.832432,0.754784,0.700855,0.80663,0.727853,0.735043,0.835979,0.821758,,,,,,
2,uci_mushroom.csv.gz,1.0,1.0,1.0,1.0,1.0,1.0,0.998769,0.998721,1.0,,,,,,
3,uci_phishing_url.csv.gz,,,,1.0,1.0,1.0,,,,0.9648,0.970129,0.959722,0.9936,0.994425,0.999529
4,uci_secondary_mushroom.csv.gz,1.0,1.0,1.0,0.999836,0.999852,1.0,0.864582,0.878239,0.936879,,,,,,


In [8]:
# Baselines (small) + CHUNKED training (big) using your `cleaned` DataFrames
!pip -q install xgboost

import numpy as np, pandas as pd, warnings
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDClassifier, PassiveAggressiveClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from scipy import sparse

warnings.filterwarnings("ignore")

# -------- config --------
RS = 42
ROW_THRESHOLD = 100_000      # only chunk if rows > this
BATCH_ROWS    = 50_000       # mini-batch size from CLEANED DFs
SAMPLE_ROWS   = 5_000        # rows to fit encoder for BIG datasets
XGB_CAP       = 80_000       # rows for optional XGB baseline on BIG sets

TARGETS = {
    "uci_android_permissions.csv.gz": "result",
    "uci_indian_liver.csv.gz":        "has_liver_disease",
    "uci_mushroom.csv.gz":            "poisonous",
    "uci_phishing_url.csv.gz":        "label",
    "uci_secondary_mushroom.csv.gz":  "class",
}
DROP_COLS = {
    "uci_android_permissions.csv.gz": [],
    "uci_indian_liver.csv.gz":        [],
    "uci_mushroom.csv.gz":            ["Unnamed: 0"],  # if present
    "uci_phishing_url.csv.gz":        ["url", "title"],# already cleaned? keep safe
    "uci_secondary_mushroom.csv.gz":  [],
}

def safe_auc(y_true, proba):
    if proba is None: return None
    y_true = pd.Series(y_true)
    return roc_auc_score(y_true, proba) if y_true.nunique()==2 else None

def split_Xy_cleaned(df, target, drop_cols=None):
    df = df.drop(columns=[c for c in (drop_cols or []) if c in df.columns], errors="ignore")
    y  = df[target]
    X  = df.drop(columns=[target])
    # map binary strings to 0/1 if needed
    if y.dtype == "object":
        uniq = sorted(y.dropna().unique().tolist())
        if len(uniq) == 2:
            y = y.map({uniq[0]:0, uniq[1]:1})
    return X, y.astype(int)

def build_preprocessor_sparse(X_sample):
    """Version-agnostic: impute numeric + OHE categoricals. Output can be sparse."""
    cat = [c for c in X_sample.columns if X_sample[c].dtype == "object"]
    num = [c for c in X_sample.columns if X_sample[c].dtype != "object"]
    pre = ColumnTransformer(
        transformers=[
            ("num_impute", SimpleImputer(strategy="median"), num),
            ("cat_ohe",    OneHotEncoder(handle_unknown="ignore"), cat),
        ],
        sparse_threshold=1.0
    )
    pre.fit(X_sample)
    return pre

def to_batches(n_rows, batch_size):
    # yields (start, end) ranges
    for s in range(0, n_rows, batch_size):
        e = min(s + batch_size, n_rows)
        yield s, e

results = []

for name, df in cleaned.items():
    if name not in TARGETS:
        print(f"[skip] {name} (no target config)")
        continue

    target   = TARGETS[name]
    dropcols = DROP_COLS.get(name, [])
    X_raw, y = split_Xy_cleaned(df, target, dropcols)
    n_rows   = len(X_raw)
    print(f"\n===== {name} =====")
    print(f"Rows={n_rows:,} → {'BIG (chunked)' if n_rows > ROW_THRESHOLD else 'small (in-memory)'}")

    # ---- SMALL: in-memory baselines ----
    if n_rows <= ROW_THRESHOLD:
        # one-hot all at once (data is already cleaned → no NaNs)
        X = pd.get_dummies(X_raw, drop_first=True)
        Xtr, Xte, ytr, yte = train_test_split(
            X.values, y, test_size=0.2, stratify=y if y.nunique()==2 else None, random_state=RS
        )

        # RF
        rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RS)
        rf.fit(Xtr, ytr)
        p  = rf.predict(Xte)
        pr = rf.predict_proba(Xte)[:,1] if y.nunique()==2 else None
        rf_acc, rf_f1, rf_auc = accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pr)

        # XGB
        xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_jobs=-1, random_state=RS)
        xgb_clf.fit(Xtr, ytr)
        p  = xgb_clf.predict(Xte)
        pr = xgb_clf.predict_proba(Xte)[:,1] if y.nunique()==2 else None
        xgb_acc, xgb_f1, xgb_auc = accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pr)

        # LR
        lr = LogisticRegression(max_iter=1000, n_jobs=-1)
        lr.fit(Xtr, ytr)
        p  = lr.predict(Xte)
        pr = lr.predict_proba(Xte)[:,1] if y.nunique()==2 else None
        lr_acc, lr_f1, lr_auc = accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pr)

        results.append({
            "dataset": name,
            "RF_acc": rf_acc, "RF_f1": rf_f1, "RF_auc": rf_auc,
            "XGB_acc": xgb_acc, "XGB_f1": xgb_f1, "XGB_auc": xgb_auc,
            "LR_acc": lr_acc, "LR_f1": lr_f1, "LR_auc": lr_auc,
        })
        print(f"RF  acc={rf_acc:.3f} f1={rf_f1:.3f}" + (f" auc={rf_auc:.3f}" if rf_auc is not None else ""))
        print(f"XGB acc={xgb_acc:.3f} f1={xgb_f1:.3f}" + (f" auc={xgb_auc:.3f}" if xgb_auc is not None else ""))
        print(f"LR  acc={lr_acc:.3f} f1={lr_f1:.3f}" + (f" auc={lr_auc:.3f}" if lr_auc is not None else ""))
        continue

    # ---- BIG: chunk the CLEANED DF ----
    # 1) Fit sparse preprocessor on a small sample (from cleaned)
    samp = X_raw.iloc[:min(SAMPLE_ROWS, n_rows)].copy()
    pre  = build_preprocessor_sparse(samp)

    # 2) Create a small holdout from next block (or 20% split if not enough)
    hold_start = min(SAMPLE_ROWS, n_rows)
    hold_end   = min(hold_start + SAMPLE_ROWS, n_rows)
    if hold_end - hold_start < 200:  # fallback to split
        Xtr_raw, Xho_raw, ytr, yho = train_test_split(
            X_raw, y, test_size=0.2, stratify=y if y.nunique()==2 else None, random_state=RS
        )
        Xho = pre.transform(Xho_raw)
    else:
        Xho_raw = X_raw.iloc[hold_start:hold_end]
        yho     = y.iloc[hold_start:hold_end]
        Xtr_raw = pd.concat([X_raw.iloc[:hold_start], X_raw.iloc[hold_end:]], axis=0)
        ytr     = pd.concat([y.iloc[:hold_start], y.iloc[hold_end:]], axis=0)
        Xho     = pre.transform(Xho_raw)

    # 3) Incremental models (sparse-friendly)
    sgd = SGDClassifier(loss="log_loss", random_state=RS)
    pac = PassiveAggressiveClassifier(random_state=RS)

    # Warm-start to register classes
    sgd.partial_fit(Xho, yho, classes=np.array(sorted(y.unique())))
    pac.partial_fit(Xho, yho, classes=np.array(sorted(y.unique())))

    # 4) Stream training set in mini-batches from the CLEANED DF
    idx = np.arange(len(Xtr_raw))
    for s, e in to_batches(len(idx), BATCH_ROWS):
        rows = idx[s:e]
        Xb   = pre.transform(Xtr_raw.iloc[rows])
        yb   = ytr.iloc[rows]
        if len(yb) == 0:
            continue
        sgd.partial_fit(Xb, yb)
        pac.partial_fit(Xb, yb)

    # 5) Evaluate on holdout
    # SGD
    p  = sgd.predict(Xho)
    try: pro = sgd.predict_proba(Xho)[:,1]
    except: pro = 1/(1+np.exp(-sgd.decision_function(Xho)))
    sgd_acc = accuracy_score(yho,p); sgd_f1 = f1_score(yho,p, average="binary" if yho.nunique()==2 else "macro"); sgd_auc = safe_auc(yho, pro)

    # PAC
    p2 = pac.predict(Xho)
    z2 = pac.decision_function(Xho); pro2 = 1/(1+np.exp(-z2)) if np.ndim(z2)==1 else None
    pac_acc = accuracy_score(yho,p2); pac_f1 = f1_score(yho,p2, average="binary" if yho.nunique()==2 else "macro"); pac_auc = safe_auc(yho, pro2) if pro2 is not None else None

    row = {"dataset": name, "SGD_acc": sgd_acc, "SGD_f1": sgd_f1, "SGD_auc": sgd_auc,
           "PAC_acc": pac_acc, "PAC_f1": pac_f1, "PAC_auc": pac_auc}

    # 6) Optional: small XGBoost baseline on a cap from TRAIN (still from cleaned)
    cap_n = min(XGB_CAP, len(Xtr_raw))
    try:
        Xcap  = pre.transform(Xtr_raw.iloc[:cap_n])
        ycap  = ytr.iloc[:cap_n]
        dtr   = xgb.DMatrix(Xcap, label=ycap.values)  # accepts CSR/COO
        params = {"objective":"binary:logistic","eval_metric":"auc","seed":RS}
        # params["tree_method"] = "gpu_hist"  # if GPU is available
        xgbm = xgb.train(params, dtr, num_boost_round=200)

        dho   = xgb.DMatrix(Xho)
        pro_x = xgbm.predict(dho)
        predx = (pro_x >= 0.5).astype(int)

        row.update({
            "XGB_acc": accuracy_score(yho, predx),
            "XGB_f1":  f1_score(yho, predx, average="binary" if yho.nunique()==2 else "macro"),
            "XGB_auc": safe_auc(yho, pro_x),
        })
    except Exception as e:
        row.update({"XGB_acc": None, "XGB_f1": None, "XGB_auc": None})
        print(f"[XGB skipped] {e}")

    results.append(row)

    print(f"SGD acc={sgd_acc:.3f} f1={sgd_f1:.3f}" + (f" auc={sgd_auc:.3f}" if sgd_auc is not None else ""))
    print(f"PAC acc={pac_acc:.3f} f1={pac_f1:.3f}" + (f" auc={pac_auc:.3f}" if pac_auc is not None else ""))
    if row.get("XGB_acc") is not None:
        print(f"XGB acc={row['XGB_acc']:.3f} f1={row['XGB_f1']:.3f}" + (f" auc={row['XGB_auc']:.3f}" if row['XGB_auc'] is not None else ""))

# Summary
pd.DataFrame(results)



===== uci_android_permissions.csv.gz =====
Rows=29,332 → small (in-memory)
RF  acc=0.970 f1=0.970 auc=0.993
XGB acc=0.967 f1=0.967 auc=0.994
LR  acc=0.957 f1=0.957 auc=0.988

===== uci_indian_liver.csv.gz =====
Rows=583 → small (in-memory)
RF  acc=0.744 f1=0.835 auc=0.760
XGB acc=0.701 f1=0.807 auc=0.728
LR  acc=0.735 f1=0.836 auc=0.823

===== uci_mushroom.csv.gz =====
Rows=8,124 → small (in-memory)
RF  acc=1.000 f1=1.000 auc=1.000
XGB acc=1.000 f1=1.000 auc=1.000
LR  acc=0.999 f1=0.999 auc=1.000

===== uci_phishing_url.csv.gz =====
Rows=235,795 → BIG (chunked)
SGD acc=0.968 f1=0.973 auc=0.962
PAC acc=0.979 f1=0.982 auc=0.991
XGB acc=1.000 f1=1.000 auc=1.000

===== uci_secondary_mushroom.csv.gz =====
Rows=61,069 → small (in-memory)
RF  acc=1.000 f1=1.000 auc=1.000
XGB acc=1.000 f1=1.000 auc=1.000
LR  acc=0.815 f1=0.832 auc=0.883


Unnamed: 0,dataset,RF_acc,RF_f1,RF_auc,XGB_acc,XGB_f1,XGB_auc,LR_acc,LR_f1,LR_auc,SGD_acc,SGD_f1,SGD_auc,PAC_acc,PAC_f1,PAC_auc
0,uci_android_permissions.csv.gz,0.969661,0.969604,0.992672,0.967275,0.967146,0.993537,0.957218,0.957407,0.988019,,,,,,
1,uci_indian_liver.csv.gz,0.74359,0.835165,0.760454,0.700855,0.80663,0.727853,0.735043,0.835979,0.823175,,,,,,
2,uci_mushroom.csv.gz,1.0,1.0,1.0,1.0,1.0,1.0,0.998769,0.998721,1.0,,,,,,
3,uci_phishing_url.csv.gz,,,,1.0,1.0,1.0,,,,0.9678,0.972735,0.962391,0.9788,0.981824,0.990679
4,uci_secondary_mushroom.csv.gz,1.0,1.0,1.0,0.999754,0.999779,1.0,0.815294,0.831667,0.883074,,,,,,


In [10]:
# --- Part 1: setup / helpers ---

!pip -q install xgboost

import numpy as np, pandas as pd, warnings
from itertools import product
from scipy import sparse

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, make_scorer

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier, PassiveAggressiveClassifier

import xgboost as xgb
warnings.filterwarnings("ignore")

# -------- Global config --------
RS = 42
ROW_THRESHOLD = 100_000    # only chunk if rows > this
BATCH_ROWS    = 50_000     # mini-batch size for big datasets (from CLEANED)
SAMPLE_ROWS   = 5_000      # rows to fit encoder/holdout on big datasets
XGB_CAP       = 80_000     # cap rows for optional XGB on big datasets
EPOCHS_BIG    = 3          # number of passes over big-data batches (tuning)

TARGETS = {
    "uci_android_permissions.csv.gz": "result",
    "uci_indian_liver.csv.gz":        "has_liver_disease",
    "uci_mushroom.csv.gz":            "poisonous",
    "uci_phishing_url.csv.gz":        "label",
    "uci_secondary_mushroom.csv.gz":  "class",
}
DROP_COLS = {
    "uci_android_permissions.csv.gz": [],
    "uci_indian_liver.csv.gz":        [],
    "uci_mushroom.csv.gz":            ["Unnamed: 0"],  # if present
    "uci_phishing_url.csv.gz":        ["url", "title"],
    "uci_secondary_mushroom.csv.gz":  [],
}

# -------- Utilities --------
def safe_auc(y_true, proba):
    if proba is None: return None
    y_true = pd.Series(y_true)
    return roc_auc_score(y_true, proba) if y_true.nunique()==2 else None

def split_Xy_cleaned(df, target, drop_cols=None):
    """Use already-cleaned DF: drop cols, map binary strings to 0/1, return X, y(int)."""
    df = df.drop(columns=[c for c in (drop_cols or []) if c in df.columns], errors="ignore")
    y  = df[target]
    X  = df.drop(columns=[target])
    if y.dtype == "object":
        uniq = sorted(y.dropna().unique().tolist())
        if len(uniq) == 2:
            y = y.map({uniq[0]:0, uniq[1]:1})
    return X, y.astype(int)

def to_batches(n_rows, batch_size):
    for s in range(0, n_rows, batch_size):
        e = min(s + batch_size, n_rows)
        yield s, e

def build_preprocessor_sparse(X_sample):
    """Version-agnostic: impute numeric + OHE categoricals. Output can be sparse."""
    cat = [c for c in X_sample.columns if X_sample[c].dtype == "object"]
    num = [c for c in X_sample.columns if X_sample[c].dtype != "object"]
    pre = ColumnTransformer(
        transformers=[
            ("num_impute", SimpleImputer(strategy="median"), num),
            ("cat_ohe",    OneHotEncoder(handle_unknown="ignore"), cat),
        ],
        sparse_threshold=1.0
    )
    pre.fit(X_sample)
    return pre


In [11]:
# --- Part 2: Small datasets (<= ROW_THRESHOLD) ---

def baseline_small_models(X, y):
    """Return (splits, baselines dict) for RF, XGB, LR on in-memory small dataset."""
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.2, stratify=y if y.nunique()==2 else None, random_state=RS
    )
    # RF baseline
    rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=RS)
    rf.fit(Xtr, ytr)
    p=rf.predict(Xte); pro=rf.predict_proba(Xte)[:,1] if y.nunique()==2 else None
    rf_base = (accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pro))

    # XGB baseline
    xgbc = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_jobs=-1, random_state=RS)
    xgbc.fit(Xtr, ytr)
    p=xgbc.predict(Xte); pro=xgbc.predict_proba(Xte)[:,1] if y.nunique()==2 else None
    xgb_base = (accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pro))

    # LR baseline
    lr = LogisticRegression(max_iter=1000, n_jobs=-1)
    lr.fit(Xtr, ytr)
    p=lr.predict(Xte); pro=lr.predict_proba(Xte)[:,1] if y.nunique()==2 else None
    lr_base = (accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pro))

    return (Xtr, Xte, ytr, yte), {"rf": (rf, rf_base), "xgb": (xgbc, xgb_base), "lr": (lr, lr_base)}

def tune_small_models(X, y):
    """Randomized/Small grid search for RF, XGB, LR (fast-ish)."""
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=0.2, stratify=y if y.nunique()==2 else None, random_state=RS
    )
    scorer = make_scorer(f1_score, average="binary" if y.nunique()==2 else "macro")

    # RF RandomizedSearch
    rf = RandomForestClassifier(random_state=RS, n_jobs=-1)
    rf_param = {
        "n_estimators": [200, 400, 600],
        "max_depth": [None, 10, 20, 40],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "max_features": ["sqrt", "log2", None],
    }
    rf_rs = RandomizedSearchCV(rf, rf_param, n_iter=15, scoring=scorer, cv=5, random_state=RS, n_jobs=-1, verbose=0)
    rf_rs.fit(Xtr, ytr)
    rf_best = rf_rs.best_estimator_
    p=rf_best.predict(Xte); pro=rf_best.predict_proba(Xte)[:,1] if y.nunique()==2 else None
    rf_tuned = (accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pro))

    # XGB RandomizedSearch
    xgbc = xgb.XGBClassifier(eval_metric="logloss", n_jobs=-1, random_state=RS, use_label_encoder=False)
    xgb_param = {
        "n_estimators": [300, 500, 800],
        "max_depth": [3, 5, 7, 9],
        "learning_rate": [0.05, 0.1, 0.2],
        "subsample": [0.7, 0.85, 1.0],
        "colsample_bytree": [0.7, 0.85, 1.0],
        "reg_alpha": [0, 0.5, 1.0],
        "reg_lambda": [0.5, 1.0, 2.0],
    }
    xgb_rs = RandomizedSearchCV(xgbc, xgb_param, n_iter=20, scoring=scorer, cv=5, random_state=RS, n_jobs=-1, verbose=0)
    xgb_rs.fit(Xtr, ytr)
    xgb_best = xgb_rs.best_estimator_
    p=xgb_best.predict(Xte); pro=xgb_best.predict_proba(Xte)[:,1] if y.nunique()==2 else None
    xgb_tuned = (accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pro))

    # Logistic Regression (small grid)
    lr = LogisticRegression(max_iter=3000, n_jobs=-1, solver="saga")
    lr_param = {
        "penalty": ["l2", "l1"],
        "C": [0.1, 1.0, 3.0, 10.0],
    }
    lr_gs = GridSearchCV(lr, lr_param, scoring=scorer, cv=5, n_jobs=-1, verbose=0)
    lr_gs.fit(Xtr, ytr)
    lr_best = lr_gs.best_estimator_
    p=lr_best.predict(Xte); pro=lr_best.predict_proba(Xte)[:,1] if y.nunique()==2 else None
    lr_tuned = (accuracy_score(yte,p), f1_score(yte,p, average="binary" if y.nunique()==2 else "macro"), safe_auc(yte, pro))

    return {"rf": (rf_best, rf_tuned), "xgb": (xgb_best, xgb_tuned), "lr": (lr_best, lr_tuned)}


In [12]:
# --- Part 3: Big datasets (chunked with epochs) + Main runner ---

def chunked_train_with_tuning(X_raw, y,
                              sample_rows=SAMPLE_ROWS,
                              batch_rows=BATCH_ROWS,
                              epochs=EPOCHS_BIG):
    """Incremental training with epochs on big data + tiny param search for SGD/PAC."""
    n = len(X_raw)
    # 1) fit sparse preprocessor on a small sample
    samp = X_raw.iloc[:min(sample_rows, n)].copy()
    pre  = build_preprocessor_sparse(samp)

    # 2) build holdout and training pools
    hold_start = min(sample_rows, n)
    hold_end   = min(hold_start + sample_rows, n)
    if hold_end - hold_start < 200:
        Xtr_raw, Xho_raw, ytr, yho = train_test_split(
            X_raw, y, test_size=0.2, stratify=y if y.nunique()==2 else None, random_state=RS
        )
        Xho = pre.transform(Xho_raw)
    else:
        Xho_raw = X_raw.iloc[hold_start:hold_end]
        yho     = y.iloc[hold_start:hold_end]
        Xtr_raw = pd.concat([X_raw.iloc[:hold_start], X_raw.iloc[hold_end:]], axis=0)
        ytr     = pd.concat([y.iloc[:hold_start], y.iloc[hold_end:]], axis=0)
        Xho     = pre.transform(Xho_raw)

    idx = np.arange(len(Xtr_raw))

    # 3) Baseline (single pass)
    sgd_base = SGDClassifier(loss="log_loss", random_state=RS)
    pac_base = PassiveAggressiveClassifier(random_state=RS)
    # warm-start to register classes
    sgd_base.partial_fit(Xho, yho, classes=np.array(sorted(y.unique())))
    pac_base.partial_fit(Xho, yho, classes=np.array(sorted(y.unique())))
    for s, e in to_batches(len(idx), batch_rows):
        rows = idx[s:e]
        Xb   = pre.transform(Xtr_raw.iloc[rows]); yb = ytr.iloc[rows]
        sgd_base.partial_fit(Xb, yb)
        pac_base.partial_fit(Xb, yb)
    # evaluate baseline
    pred = sgd_base.predict(Xho)
    try: pro = sgd_base.predict_proba(Xho)[:,1]
    except: pro = 1/(1+np.exp(-sgd_base.decision_function(Xho)))
    sgd_base_metrics = (accuracy_score(yho,pred), f1_score(yho,pred, average="binary" if yho.nunique()==2 else "macro"), safe_auc(yho, pro))

    pred = pac_base.predict(Xho)
    z    = pac_base.decision_function(Xho); pro2 = 1/(1+np.exp(-z)) if np.ndim(z)==1 else None
    pac_base_metrics = (accuracy_score(yho,pred), f1_score(yho,pred, average="binary" if yho.nunique()==2 else "macro"), safe_auc(yho, pro2))

    # 4) Tiny param grids (with SGD eta0 fix)
    sgd_grid = {
        "alpha": [1e-5, 1e-4, 1e-3],
        "learning_rate": ["optimal", "adaptive"],  # 'adaptive' needs eta0>0
        "eta0": [0.01, 0.1],                       # used when learning_rate != 'optimal'
        "loss": ["log_loss"],
    }
    pac_grid = {
        "C": [0.5, 1.0, 2.0],
        "loss": ["hinge", "squared_hinge"],
        "average": [False, True],
    }

    def eval_sgd(params):
        lr_mode = params.get("learning_rate", "optimal")
        eta0 = params.get("eta0", 0.01 if lr_mode != "optimal" else 0.0)
        clf = SGDClassifier(random_state=RS,
                            loss=params["loss"],
                            alpha=params["alpha"],
                            learning_rate=lr_mode,
                            eta0=eta0)
        clf.partial_fit(Xho, yho, classes=np.array(sorted(y.unique())))
        for epoch in range(epochs):
            np.random.RandomState(RS+epoch).shuffle(idx)
            for s, e in to_batches(len(idx), batch_rows):
                rows = idx[s:e]
                Xb   = pre.transform(Xtr_raw.iloc[rows]); yb = ytr.iloc[rows]
                clf.partial_fit(Xb, yb)
        pred = clf.predict(Xho)
        try: pro = clf.predict_proba(Xho)[:, 1]
        except: pro = 1/(1+np.exp(-clf.decision_function(Xho)))
        return (accuracy_score(yho,pred),
                f1_score(yho,pred, average="binary" if yho.nunique()==2 else "macro"),
                safe_auc(yho, pro)), clf

    def eval_pac(params):
        clf = PassiveAggressiveClassifier(random_state=RS, **params)
        clf.partial_fit(Xho, yho, classes=np.array(sorted(y.unique())))
        for epoch in range(epochs):
            np.random.RandomState(RS+epoch).shuffle(idx)
            for s, e in to_batches(len(idx), batch_rows):
                rows = idx[s:e]
                Xb   = pre.transform(Xtr_raw.iloc[rows]); yb = ytr.iloc[rows]
                clf.partial_fit(Xb, yb)
        pred = clf.predict(Xho)
        z    = clf.decision_function(Xho); pro = 1/(1+np.exp(-z)) if np.ndim(z)==1 else None
        return (accuracy_score(yho,pred),
                f1_score(yho,pred, average="binary" if yho.nunique()==2 else "macro"),
                safe_auc(yho, pro)), clf

    # 5) Search grids (very small)
    best_sgd = None; best_sgd_m = (-1,-1,None)
    for params in (dict(zip(sgd_grid.keys(), v)) for v in product(*sgd_grid.values())):
        m, clf = eval_sgd(params)
        if m[1] > best_sgd_m[1]: best_sgd, best_sgd_m = clf, m

    best_pac = None; best_pac_m = (-1,-1,None)
    for params in (dict(zip(pac_grid.keys(), v)) for v in product(*pac_grid.values())):
        m, clf = eval_pac(params)
        if m[1] > best_pac_m[1]: best_pac, best_pac_m = clf, m

    return {
        "pre": pre,
        "baseline": {"sgd": (sgd_base, sgd_base_metrics), "pac": (pac_base, pac_base_metrics)},
        "tuned":    {"sgd": (best_sgd, best_sgd_m),       "pac": (best_pac, best_pac_m)},
        "holdout_y": yho
    }

# -------- Main runner (uses `cleaned` dict already in memory) --------
summary_rows = []

for name, df in cleaned.items():
    if name not in TARGETS:
        print(f"[skip] {name} (no target configured)")
        continue

    print(f"\n===== {name} =====")
    target = TARGETS[name]
    X_raw, y = split_Xy_cleaned(df, target, DROP_COLS.get(name, []))
    n_rows = len(X_raw)

    if n_rows <= ROW_THRESHOLD:
        # one-hot all at once (already cleaned → no impute needed)
        X = pd.get_dummies(X_raw, drop_first=True)

        # Baselines
        (Xtr, Xte, ytr, yte), base = baseline_small_models(X, y)
        rf_base = base["rf"][1]; xgb_base = base["xgb"][1]; lr_base = base["lr"][1]
        print(f"Baseline RF  → acc={rf_base[0]:.3f} f1={rf_base[1]:.3f}" + (f" auc={rf_base[2]:.3f}" if rf_base[2] is not None else ""))
        print(f"Baseline XGB → acc={xgb_base[0]:.3f} f1={xgb_base[1]:.3f}" + (f" auc={xgb_base[2]:.3f}" if xgb_base[2] is not None else ""))
        print(f"Baseline LR  → acc={lr_base[0]:.3f} f1={lr_base[1]:.3f}" + (f" auc={lr_base[2]:.3f}" if lr_base[2] is not None else ""))

        # Tuning
        tuned = tune_small_models(X, y)
        rf_t = tuned["rf"][1]; xgb_t = tuned["xgb"][1]; lr_t = tuned["lr"][1]
        print(f"Tuned   RF  → acc={rf_t[0]:.3f} f1={rf_t[1]:.3f}" + (f" auc={rf_t[2]:.3f}" if rf_t[2] is not None else ""))
        print(f"Tuned   XGB → acc={xgb_t[0]:.3f} f1={xgb_t[1]:.3f}" + (f" auc={xgb_t[2]:.3f}" if xgb_t[2] is not None else ""))
        print(f"Tuned   LR  → acc={lr_t[0]:.3f} f1={lr_t[1]:.3f}" + (f" auc={lr_t[2]:.3f}" if lr_t[2] is not None else ""))

        summary_rows.append({
            "dataset": name,
            "RF_base_acc": rf_base[0], "RF_tuned_acc": rf_t[0],
            "XGB_base_acc": xgb_base[0], "XGB_tuned_acc": xgb_t[0],
            "LR_base_acc": lr_base[0], "LR_tuned_acc": lr_t[0],
            "RF_base_f1": rf_base[1], "RF_tuned_f1": rf_t[1],
            "XGB_base_f1": xgb_base[1], "XGB_tuned_f1": xgb_t[1],
            "LR_base_f1": lr_base[1], "LR_tuned_f1": lr_t[1],
            "RF_base_auc": rf_base[2], "RF_tuned_auc": rf_t[2],
            "XGB_base_auc": xgb_base[2], "XGB_tuned_auc": xgb_t[2],
            "LR_base_auc": lr_base[2], "LR_tuned_auc": lr_t[2],
        })

    else:
        # BIG (chunked) with epochs + tiny param search (with eta0 fix)
        res = chunked_train_with_tuning(X_raw, y)
        yho = res["holdout_y"]
        sgd_b = res["baseline"]["sgd"][1]; pac_b = res["baseline"]["pac"][1]
        sgd_t = res["tuned"]["sgd"][1];   pac_t = res["tuned"]["pac"][1]

        print(f"Baseline SGD → acc={sgd_b[0]:.3f} f1={sgd_b[1]:.3f}" + (f" auc={sgd_b[2]:.3f}" if sgd_b[2] is not None else ""))
        print(f"Tuned    SGD → acc={sgd_t[0]:.3f} f1={sgd_t[1]:.3f}" + (f" auc={sgd_t[2]:.3f}" if sgd_t[2] is not None else ""))
        print(f"Baseline PAC → acc={pac_b[0]:.3f} f1={pac_b[1]:.3f}" + (f" auc={pac_b[2]:.3f}" if pac_b[2] is not None else ""))
        print(f"Tuned    PAC → acc={pac_t[0]:.3f} f1={pac_t[1]:.3f}" + (f" auc={pac_t[2]:.3f}" if pac_t[2] is not None else ""))

        summary_rows.append({
            "dataset": name,
            "SGD_base_acc": sgd_b[0], "SGD_tuned_acc": sgd_t[0],
            "PAC_base_acc": pac_b[0], "PAC_tuned_acc": pac_t[0],
            "SGD_base_f1": sgd_b[1], "SGD_tuned_f1": sgd_t[1],
            "PAC_base_f1": pac_b[1], "PAC_tuned_f1": pac_t[1],
            "SGD_base_auc": sgd_b[2], "SGD_tuned_auc": sgd_t[2],
            "PAC_base_auc": pac_b[2], "PAC_tuned_auc": pac_t[2],
        })

# Final summary (baseline vs tuned)
summary_df = pd.DataFrame(summary_rows)
print("\n=== Tuning Summary (baseline vs tuned) ===")
display(summary_df)



===== uci_android_permissions.csv.gz =====
Baseline RF  → acc=0.970 f1=0.970 auc=0.993
Baseline XGB → acc=0.967 f1=0.967 auc=0.994
Baseline LR  → acc=0.957 f1=0.957 auc=0.988
Tuned   RF  → acc=0.970 f1=0.970 auc=0.993
Tuned   XGB → acc=0.968 f1=0.968 auc=0.994
Tuned   LR  → acc=0.957 f1=0.957 auc=0.988

===== uci_indian_liver.csv.gz =====
Baseline RF  → acc=0.744 f1=0.835 auc=0.760
Baseline XGB → acc=0.701 f1=0.807 auc=0.728
Baseline LR  → acc=0.735 f1=0.836 auc=0.823
Tuned   RF  → acc=0.709 f1=0.819 auc=0.769
Tuned   XGB → acc=0.718 f1=0.818 auc=0.735
Tuned   LR  → acc=0.709 f1=0.830 auc=0.824

===== uci_mushroom.csv.gz =====
Baseline RF  → acc=1.000 f1=1.000 auc=1.000
Baseline XGB → acc=1.000 f1=1.000 auc=1.000
Baseline LR  → acc=0.999 f1=0.999 auc=1.000
Tuned   RF  → acc=1.000 f1=1.000 auc=1.000
Tuned   XGB → acc=1.000 f1=1.000 auc=1.000
Tuned   LR  → acc=1.000 f1=1.000 auc=1.000

===== uci_phishing_url.csv.gz =====
Baseline SGD → acc=0.968 f1=0.973 auc=0.962
Tuned    SGD → acc=0.9

Unnamed: 0,dataset,RF_base_acc,RF_tuned_acc,XGB_base_acc,XGB_tuned_acc,LR_base_acc,LR_tuned_acc,RF_base_f1,RF_tuned_f1,XGB_base_f1,...,PAC_base_acc,PAC_tuned_acc,SGD_base_f1,SGD_tuned_f1,PAC_base_f1,PAC_tuned_f1,SGD_base_auc,SGD_tuned_auc,PAC_base_auc,PAC_tuned_auc
0,uci_android_permissions.csv.gz,0.969661,0.969831,0.967275,0.968468,0.957218,0.957218,0.969604,0.96979,0.967146,...,,,,,,,,,,
1,uci_indian_liver.csv.gz,0.74359,0.709402,0.700855,0.717949,0.735043,0.709402,0.835165,0.819149,0.80663,...,,,,,,,,,,
2,uci_mushroom.csv.gz,1.0,1.0,1.0,1.0,0.998769,1.0,1.0,1.0,1.0,...,,,,,,,,,,
3,uci_phishing_url.csv.gz,,,,,,,,,,...,0.9788,0.9974,0.972735,0.987293,0.981824,0.99774,0.962391,0.985509,0.990679,0.999913
4,uci_secondary_mushroom.csv.gz,1.0,0.999918,0.999754,0.999918,0.815294,0.814966,1.0,0.999926,0.999779,...,,,,,,,,,,
