In [1]:
# Step 1: Load & inspect Parquet dataset (Colab)
# Run in Google Colab.

# 1) Ensure parquet support
!pip install -q pyarrow

import os
import pandas as pd
from google.colab import files, drive
from IPython.display import display

# ---------- USER ACTION: choose one of these ways to provide the file ----------
# Option A — upload from your local machine (uncomment to use)
# uploaded = files.upload()                      # a file picker will appear
# file_path = list(uploaded.keys())[0]

# Option B — mount Google Drive and use path on Drive (uncomment to use)
# drive.mount('/content/drive')
# file_path = '/content/drive/MyDrive/path/to/your_dataset.parquet'

# Option C — if you already uploaded the file earlier or it is in working dir:
# file_path = 'your_dataset.parquet'

# Set file_path variable now (edit as needed)
file_path = '/content/drive/MyDrive/Dataset/BaitBuster-Bangla_253070_18c_HL10k_AIL.parquet'   # <- CHANGE this to your filename / path

# ---------------------------------------------------------------------------
# Stop early, with instructions, when nothing exists at the configured path.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"File not found at: {file_path}. "
        "Either upload the file (files.upload()) or mount Drive and set file_path correctly."
    )

# Read the whole Parquet file into memory and report its size.
df = pd.read_parquet(file_path)
print(f"Loaded DataFrame: shape = {df.shape}\n")

# Quick peek: column names, then the first rows rendered as a table.
column_names = df.columns.tolist()
print("Columns:")
print(column_names)
print("\nFirst 5 rows:")
display(df.head())

# Text and label columns this notebook knows how to use; keep only those
# that actually exist in the loaded frame.
text_cols = ['title', 'title_debiased', 'description', 'description_debiased']
label_cols = ['auto_labeled', 'human_labeled', 'ai_labeled']

available_columns = set(df.columns)
present_text = [name for name in text_cols if name in available_columns]
present_labels = [name for name in label_cols if name in available_columns]

print(f"\nText columns present: {present_text}")
print(f"Label columns present: {present_labels}\n")

# Per text column: how many values are non-null, and the three smallest
# distinct string lengths among them.
print("Text columns: non-null count and sample lengths (first 3 non-null):")
for text_col in present_text:
    column = df[text_col]
    lengths = column.dropna().astype(str).map(len)
    smallest_lengths = lengths.sort_values().unique()[:3].tolist()
    print(f"  - {text_col}: {column.notna().sum()} non-null, sample length values (smallest) {smallest_lengths}")

# Inspect each label column: dtype, unique values, counts
def inspect_label(col):
    """Print a short report for the label column ``df[col]``.

    Reads the module-level ``df`` and prints the dtype, up to 20 distinct
    non-null values, the 10 most frequent values (missing values included),
    and a ``describe()`` summary when the column is numeric. Returns nothing.
    """
    print(f"\n--- Inspecting label column: {col} ---")
    column = df[col]
    print("dtype:", column.dtype)
    # Preview at most 20 distinct values, ignoring missing entries.
    distinct_preview = pd.Series(column.dropna().unique()).iloc[:20].tolist()
    print("sample unique values (up to 20):", distinct_preview)
    try:
        print("value counts (top 10):")
        counts = column.value_counts(dropna=False)
        print(counts.head(10))
    except Exception as e:
        print("Could not compute value_counts:", e)
    if not pd.api.types.is_numeric_dtype(column):
        return
    print("numeric summary:")
    print(column.describe())

# One report per label column that exists in the frame.
for label_col in present_labels:
    inspect_label(label_col)

# With two or more label columns, print every unordered pair as a cross-tab
# (with margins) to show how the labelings agree.
if len(present_labels) > 1:
    print("\nLabel cross-tabs (pairwise):")
    for pos, a in enumerate(present_labels):
        for b in present_labels[pos + 1:]:
            print(f"\nCross-tab: {a} vs {b}")
            try:
                table = pd.crosstab(df[a], df[b], margins=True)
                print(table)
            except Exception as e:
                print("Could not produce crosstab:", e)

# Write the first 200 rows to a CSV in the working directory for quick sharing.
sample_csv = "sample_first_200.csv"
first_rows = df.head(200)
first_rows.to_csv(sample_csv, index=False)
print(f"\nSaved first 200 rows to {sample_csv} (you can download from Colab sidebar).")

print("\n\n===== DONE: paste the printed outputs (or upload the sample CSV) and I'll provide the next step =====")


Loaded DataFrame: shape = (253070, 18)

Columns:
['channel_id', 'channel_name', 'channel_url', 'video_id', 'publishedAt', 'title', 'title_debiased', 'description', 'description_debiased', 'url', 'viewCount', 'commentCount', 'likeCount', 'dislikeCount', 'thumbnail', 'auto_labeled', 'human_labeled', 'ai_labeled']

First 5 rows:


Unnamed: 0,channel_id,channel_name,channel_url,video_id,publishedAt,title,title_debiased,description,description_debiased,url,viewCount,commentCount,likeCount,dislikeCount,thumbnail,auto_labeled,human_labeled,ai_labeled
0,UCw4gfo5oaGPkHwarenuewAg,Ruposhi Bangla Tv,https://www.youtube.com/c/RuposhiBanglaTvtopvi...,J9xErXLh3bo,2021-08-17T08:59:13Z,এইমাত্র! মসজিদে নামাজরত অবস্থায় তিন বৃদ্ধকে পি...,এইমাত্র! মসজিদে নামাজরত অবস্থায় তিন বৃদ্ধকে পি...,ভিডিওটি ভাল লাগলে লাইক দিন \r\nও সবাইকে দেখার ...,ভিডিওটি লাগলে লাইক দিন ও সবাইকে দেখার সুযোগ কর...,https://www.youtube.com/watch?v=J9xErXLh3bo,12743,45,536,35,https://i.ytimg.com/vi/J9xErXLh3bo/default.jpg,Clickbait,Clickbait,Clickbait
1,UCw4gfo5oaGPkHwarenuewAg,Ruposhi Bangla Tv,https://www.youtube.com/c/RuposhiBanglaTvtopvi...,HPa6mRwjUg8,2021-08-17T04:19:08Z,"১০ বছরের সন্তান ফেলে আ,লীগ নেতার সাথে পালিয়ে গ...","১০ বছরের সন্তান ফেলে আ,লীগ নেতার সাথে পালিয়ে গ...",ভিডিওটি ভাল লাগলে লাইক দিন \r\nও সবাইকে দেখার ...,ভিডিওটি লাগলে লাইক দিন ও সবাইকে দেখার সুযোগ কর...,https://www.youtube.com/watch?v=HPa6mRwjUg8,22440,10,362,20,https://i.ytimg.com/vi/HPa6mRwjUg8/default.jpg,Clickbait,Clickbait,Clickbait
2,UCw4gfo5oaGPkHwarenuewAg,Ruposhi Bangla Tv,https://www.youtube.com/c/RuposhiBanglaTvtopvi...,bwkR5p0VY7Y,2021-08-16T15:18:16Z,এই মাত্র পাওয়া খবর! ৫ বছরের জেল হচ্ছে পরীমনির!...,এই মাত্র পাওয়া খবর! ৫ বছরের জেল হচ্ছে পরীমনির!...,ভিডিওটি ভাল লাগলে লাইক দিন \r\nও সবাইকে দেখার ...,ভিডিওটি লাগলে লাইক দিন ও সবাইকে দেখার সুযোগ কর...,https://www.youtube.com/watch?v=bwkR5p0VY7Y,46416,53,677,57,https://i.ytimg.com/vi/bwkR5p0VY7Y/default.jpg,Clickbait,Clickbait,Clickbait
3,UCw4gfo5oaGPkHwarenuewAg,Ruposhi Bangla Tv,https://www.youtube.com/c/RuposhiBanglaTvtopvi...,rwxsUpXAozk,2021-08-16T08:11:47Z,ছি ছি! ভাগিনার সাথে পরকীয়ার সময় হাতেনাতে ধরা খ...,ছি ছি! ভাগিনার সাথে পরকীয়ার সময় হাতেনাতে ধরা খ...,ভিডিওটি ভাল লাগলে লাইক দিন \r\nও সবাইকে দেখার ...,ভিডিওটি লাগলে লাইক দিন ও সবাইকে দেখার সুযোগ কর...,https://www.youtube.com/watch?v=rwxsUpXAozk,50177,23,558,40,https://i.ytimg.com/vi/rwxsUpXAozk/default.jpg,Clickbait,Clickbait,Clickbait
4,UCw4gfo5oaGPkHwarenuewAg,Ruposhi Bangla Tv,https://www.youtube.com/c/RuposhiBanglaTvtopvi...,hyBKq5IBras,2021-08-16T03:45:16Z,হায়রে পরীমনি! কারাগারে গিয়েও ভালো হলোনা! কারাগ...,হায়রে পরীমনি! কারাগারে গিয়েও ভালো হলোনা! কারাগ...,ভিডিওটি ভাল লাগলে লাইক দিন \r\nও সবাইকে দেখার ...,ভিডিওটি লাগলে লাইক দিন ও সবাইকে দেখার সুযোগ কর...,https://www.youtube.com/watch?v=hyBKq5IBras,114242,59,1352,122,https://i.ytimg.com/vi/hyBKq5IBras/default.jpg,Clickbait,Clickbait,Clickbait



Text columns present: ['title', 'title_debiased', 'description', 'description_debiased']
Label columns present: ['auto_labeled', 'human_labeled', 'ai_labeled']

Text columns: non-null count and sample lengths (first 3 non-null):
  - title: 253070 non-null, sample length values (smallest) [17, 18, 19]
  - title_debiased: 253070 non-null, sample length values (smallest) [15, 16, 17]
  - description: 253070 non-null, sample length values (smallest) [48, 49, 51]
  - description_debiased: 253070 non-null, sample length values (smallest) [48, 49, 50]

--- Inspecting label column: auto_labeled ---
dtype: object
sample unique values (up to 20): ['Clickbait', 'Not Clickbait']
value counts (top 10):
auto_labeled
Not Clickbait    223758
Clickbait         29312
Name: count, dtype: int64

--- Inspecting label column: human_labeled ---
dtype: object
sample unique values (up to 20): ['Clickbait', 'Not Clickbait']
value counts (top 10):
human_labeled
None             243070
Clickbait          5644
No

In [None]:
# Step 2: Create frozen BanglaBERT embeddings, prepare features, train logistic regression across configs
# Run in Google Colab (assumes df variable already loaded from Step 1)

# Install deps
!pip install -q transformers accelerate sentencepiece scikit-learn numpy pandas tqdm

import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

# ----------------- USER CONFIG -----------------
# Choose a frozen Bangla encoder.
#  - "sagorsarker/bangla-bert-base"  (BERT-base for Bangla).
#  - "csebuetnlp/banglabert"        (BanglaBERT ELECTRA discriminator).
model_name = "csebuetnlp/banglabert"
# ------------------------------------------------

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# This cell needs `df` from Step 1. In a fresh session, reload it first:
# file_path = 'your_dataset.parquet'
# df = pd.read_parquet(file_path)

# Keep only the text / label columns that exist in the loaded frame.
columns_in_df = set(df.columns)
text_cols = ['title', 'title_debiased', 'description', 'description_debiased']
present_text = [name for name in text_cols if name in columns_in_df]
print("Text columns present:", present_text)

label_cols = [name for name in ('auto_labeled', 'human_labeled', 'ai_labeled') if name in columns_in_df]
print("Label columns present:", label_cols)

# ----------------- Embedding utilities -----------------
# Load the tokenizer and encoder once; the encoder is moved to `device` and
# switched to eval mode.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device).eval()

def mean_pooling(model_output, attention_mask):
    """Average the last hidden states over the tokens selected by the mask.

    model_output: object exposing ``last_hidden_state`` of shape
        (batch, seq_len, hidden).
    attention_mask: (batch, seq_len) tensor; positions with 0 are excluded.
    returns: (batch, hidden) tensor of masked means.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask across the hidden dimension so it can weight tokens.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    # Clamp the token count so an all-zero mask cannot divide by zero.
    token_counts = weights.sum(1).clamp(min=1e-9)
    masked_total = (hidden_states * weights).sum(1)
    return masked_total / token_counts

def encode_texts(texts, batch_size=32, cache_path=None):
    """
    Embed texts with the module-level tokenizer/model using masked mean pooling.

    texts: iterable of strings; missing values (None/NaN) are embedded as "".
    batch_size: number of texts sent through the encoder per forward pass.
    cache_path: optional ``.npz`` path. A cached array whose row count equals
        ``len(texts)`` is returned as-is. A cache with a different row count is
        treated as stale, recomputed and overwritten, so embeddings cannot
        silently fall out of alignment with the rows they describe.
    returns: numpy array shape (len(texts), hidden_size)
    """
    texts = list(texts)  # accept any iterable, not only sliceable sequences
    if cache_path and os.path.exists(cache_path):
        # The context manager closes the archive's file handle after reading.
        with np.load(cache_path) as cached:
            cached_embs = cached["arr_0"]
        if cached_embs.shape[0] == len(texts):
            print("Loading embeddings from cache:", cache_path)
            return cached_embs
        print(f"Ignoring stale cache {cache_path}: it holds {cached_embs.shape[0]} rows "
              f"but {len(texts)} texts were given. Recomputing.")
    if not texts:
        # np.vstack([]) raises, so return an empty matrix of the right width.
        # NOTE(review): assumes the encoder exposes config.hidden_size - confirm for non-BERT-style models.
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)
    all_embs = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch_texts = [str(t) if pd.notna(t) else "" for t in texts[start:start + batch_size]]
        # Texts longer than 256 tokens are truncated by the tokenizer.
        encoded = tokenizer(batch_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
        for k in encoded:
            encoded[k] = encoded[k].to(device)
        with torch.no_grad():
            out = model(**encoded)
            emb = mean_pooling(out, encoded["attention_mask"])  # (batch, hidden)
            emb = emb.cpu().numpy()
        all_embs.append(emb)
    all_embs = np.vstack(all_embs)
    if cache_path:
        np.savez_compressed(cache_path, all_embs)
        print("Saved embeddings to", cache_path)
    return all_embs

# ----------------- Prepare feature texts -----------------
def safe_join(a, b):
    """Join two scalar values with a single space, skipping missing ones.

    Values for which ``pd.isna`` is true are dropped. Returns "" when both
    are missing and the lone stringified value when only one is present.
    """
    present_parts = [str(value) for value in (a, b) if not pd.isna(value)]
    return " ".join(present_parts)

# One configuration per available text column: missing values become "" and
# everything is stringified before embedding.
configs = {col: df[col].fillna("").astype(str).tolist() for col in present_text}
# # concatenations
# if 'title' in present_text and 'description' in present_text:
#     configs['title__description'] = [safe_join(a,b) for a,b in zip(df['title'], df['description'])]
# if 'title_debiased' in present_text and 'description_debiased' in present_text:
#     configs['title_deb__description_deb'] = [safe_join(a,b) for a,b in zip(df['title_debiased'], df['description_debiased'])]

print("Text configurations to embed:", list(configs))

# ----------------- Compute (or load cached) embeddings for each config -----------------
os.makedirs("embeddings_cache", exist_ok=True)
model_tag = model_name.replace('/', '_')  # "/" in the hub id would otherwise create sub-directories
embeddings = {}
for name, texts in configs.items():
    config_tag = name.replace('/', '_')
    cache_file = f"embeddings_cache/{config_tag}_{model_tag}.npz"
    # encode_texts returns the cached array when cache_file already exists
    embeddings[name] = encode_texts(texts, batch_size=32, cache_path=cache_file)
    print(f"{name}: embeddings shape = {embeddings[name].shape}")

# ----------------- Label handling: auto-detect & binarize -----------------
def make_binary_labels(series):
    """
    Map a label column to binary targets (1 = clickbait, 0 = not clickbait).

    series: pd.Series holding 0/1 numbers, booleans, probabilities in [0, 1],
        or strings such as "Clickbait" / "Not Clickbait".
    returns: pd.Series aligned to ``series.index``. Rows whose label is missing
        (None/NaN) or cannot be recognised stay NaN, so callers can drop them
        with ``~pd.isna(...)`` instead of training on invented negatives. The
        result has int dtype when every row is labeled and float dtype
        (1.0 / 0.0 / NaN) otherwise.

    If more than half of the *labeled* values are not recognised keywords, the
    labeled rows are encoded with LabelEncoder instead (integer class ids,
    possibly more than two classes). Missing rows still stay NaN.
    """
    ser = series.copy()
    ser_name = ser.name
    missing = ser.isna().to_numpy()

    def _finalize(values):
        # Never turn an unlabeled row into a class: force it back to NaN.
        arr = np.array(values, dtype=float)
        arr[missing] = np.nan
        out = pd.Series(arr, index=ser.index, name=ser_name)
        return out if np.isnan(arr).any() else out.astype(int)

    # pandas counts bool as a numeric dtype, so boolean columns land here too.
    if pd.api.types.is_numeric_dtype(ser):
        observed = ser[~missing]
        # Already 0/1 -> use as is
        if set(np.unique(observed)).issubset({0, 1}):
            return _finalize(ser)
        # Probabilities in [0, 1] -> threshold at 0.5
        if (observed >= 0).all() and (observed <= 1).all():
            print(f"Warning: {ser_name} looks like probabilities, thresholding at 0.5")
            return _finalize(ser >= 0.5)

    # Strings (and any other numeric values): match common label spellings.
    pos_keys = {'clickbait','click-bait','click bait','yes','1','true','y','cb'}
    neg_keys = {'not_clickbait','not-clickbait','not clickbait','not','no','0','false','n','non-cb','nonclickbait','non_clickbait'}
    normalized = ser.astype(str).str.strip().str.lower()
    mapped = np.full(len(ser), np.nan)
    mapped[normalized.isin(pos_keys).to_numpy()] = 1.0
    mapped[normalized.isin(neg_keys).to_numpy()] = 0.0
    mapped[missing] = np.nan

    # Judge the mapping on labeled rows only; a mostly-unlabeled column
    # (e.g. a small human-annotated subset) must not trigger the fallback.
    unmapped = np.isnan(mapped) & ~missing
    n_labeled = int((~missing).sum())
    if n_labeled and unmapped.sum() / n_labeled > 0.5:
        print(f"Label column {ser_name}: many unmapped textual values — using LabelEncoder fallback.")
        codes = np.full(len(ser), np.nan)
        codes[~missing] = LabelEncoder().fit_transform(ser[~missing].astype(str))
        return _finalize(codes)
    if unmapped.any():
        print(f"Warning: {ser_name}: {int(unmapped.sum())} unrecognised label value(s) left unlabeled (NaN).")
    return _finalize(mapped)

# Binarize every available label column and show the resulting class balance.
bin_labels = {}
for lab in label_cols:
    binarized = make_binary_labels(df[lab])
    bin_labels[lab] = binarized
    print(f"{lab}: value counts ->")
    print(binarized.value_counts(dropna=False))

# ----------------- Training + evaluation across configs & labels -----------------
# For every (label column, text config) pair: hold out a stratified 20% test
# split, standardize with statistics from the training split only, fit a
# logistic regression and record test-set metrics. Fitted models are saved
# under models/.
results = []
os.makedirs("models", exist_ok=True)

for lab in label_cols:
    y = bin_labels[lab].values
    # Drop rows without a usable label (NaN after mapping).
    valid_idx = ~pd.isna(y)
    if valid_idx.sum() == 0:
        print(f"Skipping {lab}: no valid labels after mapping.")
        continue
    y = y[valid_idx].astype(int)
    # The metrics below are binary. A single-class or multi-class target would
    # raise inside fit() / precision_score(), so skip it with a message instead.
    classes = np.unique(y)
    if len(classes) != 2 or not set(classes).issubset({0, 1}):
        print(f"Skipping {lab}: expected binary 0/1 labels, found classes {classes.tolist()}.")
        continue
    print(f"\n=== Label: {lab}  (n={len(y)}) ===")
    for cfg_name, X_full in embeddings.items():
        X = X_full[valid_idx]
        # Split first, then scale: fitting the scaler on all rows would leak
        # test-set statistics into the training features.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
        X_test = scaler.transform(X_test_raw)
        # logistic regression (liblinear for small samples; saga for larger)
        solver = "liblinear" if X_train.shape[0] < 5000 else "saga"
        clf = LogisticRegression(max_iter=1000, solver=solver)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # AUC is best-effort: report NaN (and say why) when it cannot be computed.
        # Named roc_auc so the `auc` function Step 3 imports is not overwritten.
        try:
            y_prob = clf.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_prob)
        except Exception as exc:
            print(f"{lab} | {cfg_name}: ROC AUC unavailable ({exc})")
            roc_auc = np.nan
        res = {
            "label_col": lab,
            "text_config": cfg_name,
            "n_train": X_train.shape[0],
            "n_test": X_test.shape[0],
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, zero_division=0),
            "recall": recall_score(y_test, y_pred, zero_division=0),
            "f1": f1_score(y_test, y_pred, zero_division=0),
            "roc_auc": roc_auc
        }
        print(f"{lab} | {cfg_name} -> acc {res['accuracy']:.3f} prec {res['precision']:.3f} rec {res['recall']:.3f} f1 {res['f1']:.3f} auc {res['roc_auc']:.3f}")
        results.append(res)
        # Save model and scaler. Only the file name is sanitized; replacing "/"
        # in the whole path would drop the file outside the models/ directory.
        safe_name = f"logreg_{lab}_{cfg_name}".replace("/", "_")
        model_fname = os.path.join("models", f"{safe_name}.joblib")
        joblib.dump({"clf": clf, "scaler": scaler, "model_name": model_name}, model_fname)

# Save results
results_df = pd.DataFrame(results)
results_df.to_csv("embedding_logreg_results.csv", index=False)
print("\nSaved results to embedding_logreg_results.csv")
if not results_df.empty:
    display(results_df.sort_values(['label_col','f1'], ascending=[True, False]).head(20))


Device: cuda
Text columns present: ['title', 'title_debiased', 'description', 'description_debiased']
Label columns present: ['auto_labeled', 'human_labeled', 'ai_labeled']
Text configurations to embed: ['title', 'title_debiased', 'description', 'description_debiased']
Loading embeddings from cache: embeddings_cache/title_csebuetnlp_banglabert.npz
title: embeddings shape = (253070, 768)
Loading embeddings from cache: embeddings_cache/title_debiased_csebuetnlp_banglabert.npz
title_debiased: embeddings shape = (253070, 768)


Embedding batches:   0%|          | 0/7909 [00:00<?, ?it/s]

In [None]:
# Step 3: Robust evaluation (Stratified K-Fold CV, confusion matrices, ROC/PR curves, optional bootstrap CIs)
# Run in Google Colab (expects embeddings cached in embeddings_cache/ OR `embeddings` dict present)

!pip install -q scikit-learn matplotlib seaborn numpy pandas joblib

import os, glob, math
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
                             confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score)
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

# --------- User options ----------
n_splits = 5
random_state = 42
compute_bootstrap = True   # set False to skip bootstrap CIs
n_bootstrap = 1000
bootstrap_sample_size = 0.8  # proportion of test sample for each bootstrap draw
# ---------------------------------

os.makedirs("cv_results", exist_ok=True)
os.makedirs("cv_plots", exist_ok=True)

# Reuse the in-session `embeddings` dict when it exists; otherwise rebuild it
# from the .npz files in embeddings_cache/.
try:
    embeddings  # if variable defined, use it
except NameError:
    # Named `cache_files` so the `files` module imported from google.colab
    # in Step 1 is not shadowed.
    cache_files = sorted(glob.glob("embeddings_cache/*.npz"))
    if not cache_files:
        raise FileNotFoundError("No `embeddings` variable and no embeddings_cache/*.npz found. Rerun Step 2 to produce embeddings.")
    # Build into a local dict and publish it only once every file has loaded,
    # so a failed run never leaves an empty or partial `embeddings` behind
    # that a rerun of this cell would silently accept.
    loaded = {}
    for fpath in cache_files:
        # The key is the file stem, which still carries the encoder suffix
        # (e.g. "title_csebuetnlp_banglabert").
        name = os.path.splitext(os.path.basename(fpath))[0]
        with np.load(fpath) as cached:  # closes the archive handle after reading
            loaded[name] = cached["arr_0"]
    embeddings = loaded
    print("Loaded embeddings:", list(embeddings.keys()))

# Load labels (bin_labels) if available in session, otherwise rebuild them from `df`.
try:
    bin_labels
except NameError:
    try:
        from sklearn.preprocessing import LabelEncoder

        def make_binary_labels(series):
            """
            Map a label column to binary targets (1 = clickbait, 0 = not clickbait).

            Rows whose label is missing (None/NaN) or not recognised stay NaN so
            the CV loop can drop them with ``~pd.isna(...)``. The result is int
            dtype when every row is labeled, float (1.0 / 0.0 / NaN) otherwise.
            If more than half of the labeled values are unrecognised, the labeled
            rows are encoded with LabelEncoder instead.
            """
            ser = series.copy()
            missing = ser.isna().to_numpy()

            def _finalize(values):
                # Never turn an unlabeled row into a class: force it back to NaN.
                arr = np.array(values, dtype=float)
                arr[missing] = np.nan
                out = pd.Series(arr, index=ser.index, name=ser.name)
                return out if np.isnan(arr).any() else out.astype(int)

            # pandas counts bool as a numeric dtype, so boolean columns land here too.
            if pd.api.types.is_numeric_dtype(ser):
                observed = ser[~missing]
                if set(np.unique(observed)).issubset({0, 1}):
                    return _finalize(ser)
                if (observed >= 0).all() and (observed <= 1).all():
                    return _finalize(ser >= 0.5)

            pos_keys = {'clickbait','click-bait','click bait','yes','1','true','y','cb'}
            neg_keys = {'not_clickbait','not-clickbait','not clickbait','not','no','0','false','n','non-cb','nonclickbait','non_clickbait'}
            normalized = ser.astype(str).str.strip().str.lower()
            mapped = np.full(len(ser), np.nan)
            mapped[normalized.isin(pos_keys).to_numpy()] = 1.0
            mapped[normalized.isin(neg_keys).to_numpy()] = 0.0
            mapped[missing] = np.nan

            # Judge the mapping on labeled rows only, so a mostly-unlabeled
            # column does not trigger the LabelEncoder fallback.
            unmapped = np.isnan(mapped) & ~missing
            n_labeled = int((~missing).sum())
            if n_labeled and unmapped.sum() / n_labeled > 0.5:
                codes = np.full(len(ser), np.nan)
                codes[~missing] = LabelEncoder().fit_transform(ser[~missing].astype(str))
                return _finalize(codes)
            if unmapped.any():
                print(f"Warning: {ser.name}: {int(unmapped.sum())} unrecognised label value(s) left unlabeled (NaN).")
            return _finalize(mapped)

        # build bin_labels for available label columns in df
        label_cols = [c for c in ['auto_labeled','human_labeled','ai_labeled'] if c in df.columns]
        bin_labels = {lab: make_binary_labels(df[lab]) for lab in label_cols}
        print("Reconstructed bin_labels for:", list(bin_labels.keys()))
    except Exception as e:
        raise RuntimeError("Could not find `bin_labels` or construct from df. Ensure Step 2 ran and `bin_labels` variable exists.") from e

# Utility: compute metrics
def compute_metrics(y_true, y_pred, y_prob=None):
    """Return a dict of binary classification metrics.

    y_true, y_pred: true and predicted class labels.
    y_prob: optional positive-class scores. When given and ``y_true`` holds
        more than one class, ROC AUC and average precision are computed;
        otherwise both are NaN. A scorer that raises also yields NaN.
    returns: dict with keys accuracy, precision, recall, f1, roc_auc,
        avg_precision (in that order).
    """
    metrics = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1': f1_score(y_true, y_pred, zero_division=0),
        'roc_auc': np.nan,
        'avg_precision': np.nan,
    }
    # Ranking metrics need scores and at least two classes in y_true.
    if y_prob is None or len(np.unique(y_true)) <= 1:
        return metrics
    ranking_scorers = (('roc_auc', roc_auc_score), ('avg_precision', average_precision_score))
    for key, scorer in ranking_scorers:
        try:
            metrics[key] = scorer(y_true, y_prob)
        except Exception:
            metrics[key] = np.nan
    return metrics

# Main CV loop
# For every (label column, text config) pair: run stratified K-fold CV with the
# scaler fit inside each fold, aggregate metrics, the confusion matrix and the
# ROC / PR curves across folds, and optionally bootstrap a 95% interval for F1.
summary_rows = []
detailed_rows = []  # store per-fold metrics
plot_index = 0
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'avg_precision']

for lab, y_series in bin_labels.items():
    print(f"\n==== Label: {lab} ====")
    label_name = lab
    y_all = np.array(y_series)
    valid_idx = ~pd.isna(y_all)
    y_all = y_all[valid_idx].astype(int)
    # The metrics and the 2x2 confusion matrix below are binary: require
    # exactly the classes 0 and 1.
    classes = np.unique(y_all)
    if len(classes) < 2:
        print(f"Skipping {lab}: only one class present after binarization.")
        continue
    if not set(classes).issubset({0, 1}):
        print(f"Skipping {lab}: labels are not binary 0/1 (found classes {classes.tolist()}).")
        continue

    for cfg_name, X_full in embeddings.items():
        X = X_full[valid_idx]
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        # Sanitize only the file-name part; applying replace("/", "_") to the
        # whole path would write outside cv_plots/ and cv_results/.
        file_tag = f"{label_name}_{cfg_name}".replace("/", "_")

        # containers for aggregating metrics & curves
        metrics_list = []
        pair_rows = []          # per-fold rows of this (label, config) pair
        fold_predictions = []   # (y_test, y_pred) per fold, reused by the bootstrap
        cm_sum = np.zeros((2, 2), dtype=int)
        # ROC / PR aggregation: interpolate each fold onto a fixed grid
        mean_fpr = np.linspace(0, 1, 200)
        tprs = []
        mean_recall = np.linspace(0, 1, 200)
        precisions_interp = []

        for fold, (train_idx, test_idx) in enumerate(skf.split(X, y_all), start=1):
            X_train_raw, X_test_raw = X[train_idx], X[test_idx]
            y_train, y_test = y_all[train_idx], y_all[test_idx]

            # Scaler is fit on the training fold only (no test leakage).
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train_raw)
            X_test = scaler.transform(X_test_raw)

            solver = "liblinear" if X_train.shape[0] < 5000 else "saga"
            clf = LogisticRegression(max_iter=1000, solver=solver)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            try:
                y_prob = clf.predict_proba(X_test)[:, 1]
            except Exception:
                y_prob = None

            # metrics
            fold_metrics = compute_metrics(y_test, y_pred, y_prob)
            fold_metrics['fold'] = fold
            metrics_list.append(fold_metrics)
            detailed_row = {
                "label_col": label_name, "text_config": cfg_name, "fold": fold,
                **fold_metrics
            }
            detailed_rows.append(detailed_row)
            pair_rows.append(detailed_row)
            fold_predictions.append((y_test, y_pred))

            # confusion matrix (rows = true class, columns = predicted class)
            cm_sum += confusion_matrix(y_test, y_pred, labels=[0, 1])

            if y_prob is not None:
                # ROC: interpolate TPR at the fixed FPR grid
                fpr, tpr, _ = roc_curve(y_test, y_prob)
                tpr_interp = np.interp(mean_fpr, fpr, tpr)
                tpr_interp[0] = 0.0
                tprs.append(tpr_interp)
                # PR: recall comes back decreasing, so reverse both arrays to
                # give np.interp increasing x values
                precision, recall, _ = precision_recall_curve(y_test, y_prob)
                recall_rev = recall[::-1]
                precision_rev = precision[::-1]
                prec_interp = np.interp(mean_recall, recall_rev, precision_rev, left=precision_rev[0], right=precision_rev[-1])
                precisions_interp.append(prec_interp)

        # aggregate metrics (mean / std across folds)
        metrics_df = pd.DataFrame(metrics_list)
        agg = metrics_df[metric_names].agg(['mean', 'std']).T
        summary = {
            "label_col": label_name,
            "text_config": cfg_name,
            "n_folds": n_splits,
            "n_samples": len(y_all),
            "class_counts_0": int((y_all == 0).sum()),
            "class_counts_1": int((y_all == 1).sum()),
        }
        for metric in metric_names:
            summary[f"{metric}_mean"] = agg.loc[metric, 'mean']
            summary[f"{metric}_std"] = agg.loc[metric, 'std']

        # Save confusion matrix heatmap. Axis labels follow sklearn's layout:
        # rows (y axis) are true classes, columns (x axis) are predictions.
        plt.figure(figsize=(4, 3))
        sns.heatmap(cm_sum, annot=True, fmt="d", cmap="Blues",
                    xticklabels=["pred_0", "pred_1"], yticklabels=["true_0", "true_1"])
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title(f"Confusion Matrix: {label_name} | {cfg_name}")
        cm_path = os.path.join("cv_plots", f"cm_{file_tag}.png")
        plt.savefig(cm_path, bbox_inches="tight", dpi=150)
        plt.close()

        # Plot mean ROC curve across folds
        roc_path = None
        if tprs:
            mean_tpr = np.mean(tprs, axis=0)
            mean_tpr[-1] = 1.0
            mean_auc = auc(mean_fpr, mean_tpr)
            std_tpr = np.std(tprs, axis=0)

            plt.figure(figsize=(6, 5))
            plt.plot(mean_fpr, mean_tpr, lw=2, label=f"Mean ROC (AUC = {mean_auc:.3f})")
            tpr_upper = np.minimum(mean_tpr + std_tpr, 1)
            tpr_lower = np.maximum(mean_tpr - std_tpr, 0)
            plt.fill_between(mean_fpr, tpr_lower, tpr_upper, alpha=0.2, label="±1 std")
            plt.plot([0, 1], [0, 1], linestyle="--", lw=1)
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.title(f"ROC: {label_name} | {cfg_name}")
            plt.legend(loc="lower right")
            roc_path = os.path.join("cv_plots", f"roc_{file_tag}.png")
            plt.savefig(roc_path, bbox_inches="tight", dpi=150)
            plt.close()

        # Plot mean PR curve across folds
        pr_path = None
        if precisions_interp:
            mean_prec = np.mean(precisions_interp, axis=0)
            std_prec = np.std(precisions_interp, axis=0)
            mean_ap = np.nanmean([r['avg_precision'] for r in metrics_list if not np.isnan(r['avg_precision'])])
            plt.figure(figsize=(6, 5))
            plt.plot(mean_recall, mean_prec, lw=2, label=f"Mean PR (AP ≈ {mean_ap:.3f})")
            prec_upper = np.minimum(mean_prec + std_prec, 1)
            prec_lower = np.maximum(mean_prec - std_prec, 0)
            plt.fill_between(mean_recall, prec_lower, prec_upper, alpha=0.2)
            plt.xlabel("Recall")
            plt.ylabel("Precision")
            plt.title(f"Precision-Recall: {label_name} | {cfg_name}")
            plt.legend(loc="upper right")
            pr_path = os.path.join("cv_plots", f"pr_{file_tag}.png")
            plt.savefig(pr_path, bbox_inches="tight", dpi=150)
            plt.close()

        # Optional bootstrap CI for F1: resample each fold's held-out
        # predictions (n_bootstrap // n_splits draws per fold) and pool the
        # F1 values. The predictions stored during CV are reused, so the
        # interval describes the same models as the reported metrics and no
        # model is trained twice. A seeded generator makes the interval repeatable.
        lo, hi = (np.nan, np.nan)
        if compute_bootstrap:
            rng = np.random.default_rng(random_state)
            draws_per_fold = n_bootstrap // n_splits
            boot_f1s = []
            for fold_true, fold_pred in fold_predictions:
                sample_size = max(1, int(len(fold_true) * bootstrap_sample_size))
                for _ in range(draws_per_fold):
                    # sample positions with replacement from this fold's test set
                    sample_idx = rng.integers(0, len(fold_true), size=sample_size)
                    boot_f1s.append(f1_score(fold_true[sample_idx], fold_pred[sample_idx], zero_division=0))
            if boot_f1s:
                lo = np.percentile(boot_f1s, 2.5)
                hi = np.percentile(boot_f1s, 97.5)

        # record summary, include paths to plots
        summary.update({
            "confusion_matrix_path": cm_path,
            "roc_plot_path": roc_path,
            "pr_plot_path": pr_path,
            "f1_bootstrap_2.5pct": lo,
            "f1_bootstrap_97.5pct": hi
        })
        summary_rows.append(summary)

        # Save detailed per-fold metrics to CSV for this pair
        if pair_rows:
            pair_name = f"{label_name}__{cfg_name}__folds.csv".replace("/", "_")
            pair_csv = os.path.join("cv_results", pair_name)
            pd.DataFrame(pair_rows).to_csv(pair_csv, index=False)

# Persist the per-pair summary and the per-fold detail tables as CSV files.
summary_df, detailed_df = pd.DataFrame(summary_rows), pd.DataFrame(detailed_rows)
for frame, csv_name in (
    (summary_df, "cv_results_summary.csv"),
    (detailed_df, "cv_results_detailed.csv"),
):
    frame.to_csv(csv_name, index=False)

for message in (
    "\nSaved summary CSV: cv_results_summary.csv",
    "Saved detailed fold metrics: cv_results_detailed.csv",
    "Saved confusion matrices and curves under cv_plots/ and per-pair folds under cv_results/",
):
    print(message)

# Show the 20 best pairs by mean F1 (skipped when nothing was evaluated).
if not summary_df.empty:
    ranked = summary_df.sort_values("f1_mean", ascending=False)
    display(ranked.head(20))
