# Concept_Format

In [None]:
#loading libraries
import os
import random
import time
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from datasets import Dataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize

from setfit import SetFitModel, SetFitTrainer


# Environment switches: no WandB logging, fewer side messages.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Input workbooks (labeled sample, full dataset) and the output directory,
# which is created right away if it does not exist yet.
LABELED_PATH = "/content/Sample_500_stratified_Concept-Format.xlsx"
FULL_DATA_PATH = "/content/Restaurants_Data.xlsx"
OUT_DIR = "/content/results_concept_format_final"
os.makedirs(OUT_DIR, exist_ok=True)


# Checkpoint name handed to SetFitModel.from_pretrained.
BASE_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Forwarded to SetFitTrainer as num_iterations / num_epochs / batch_size /
# learning_rate.
NUM_ITERATIONS = 20
NUM_EPOCHS = 2
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

# Upper bound on CV folds; the effective number is also capped by the size of
# the smallest class. Seeds for the data split and for the final training run.
CV_FOLDS_MAX = 3
SPLIT_SEED = 42
FINAL_TRAIN_SEED = 42

# Classes with fewer than MIN_SAMPLES_PER_CLASS rows are relabeled to
# RARE_CLASS_LABEL by bundle_rare_classes.
MIN_SAMPLES_PER_CLASS = 2
RARE_CLASS_LABEL = "Other / very rare"

# Task name (used in output file names), label column and the text columns
# that build_text concatenates into the model input.
TASK_NAME = "concept_format"
LABEL_COL = "concept_format"
FEATURE_COLS = ["desc_1", "desc_2", "title"]


def set_all_seeds(seed: int) -> None:
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    The CUDA generators are seeded as well when a GPU is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def build_text(df: pd.DataFrame, feature_cols: List[str]) -> pd.Series:
    """Concatenate the given text columns into one string per row.

    Missing values become empty strings, every field is stripped, and only
    the non-empty fields are joined with ``" [SEP] "``. Skipping empty fields
    means a row without any text yields ``""`` (not a string of bare
    separators), so callers can detect and drop it. Whitespace runs are
    collapsed to single spaces.

    Args:
        df: Frame containing at least ``feature_cols``.
        feature_cols: Columns to concatenate, in order.

    Returns:
        Series of combined texts, indexed like ``df``.
    """
    X = df[feature_cols].copy()
    for c in feature_cols:
        X[c] = X[c].fillna("").astype(str).str.strip()

    # Row-wise join over plain tuples; also well-defined for an empty frame.
    joined = [
        " [SEP] ".join(part for part in row if part)
        for row in X.itertuples(index=False, name=None)
    ]

    return (
        pd.Series(joined, index=X.index, dtype=object)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )


def bundle_rare_classes(y: pd.Series, min_count: int, rare_label: str) -> pd.Series:
    """Relabel every class occurring fewer than ``min_count`` times.

    Returns ``y`` itself when no class is that rare; otherwise a new Series
    in which the rare values are replaced by ``rare_label``.
    """
    counts = y.value_counts(dropna=False)
    rare_values = counts.loc[counts < min_count].index
    if rare_values.empty:
        return y
    is_rare = y.isin(rare_values)
    return y.mask(is_rare, rare_label)


def drop_split_impossible_singletons(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Remove rows whose class has fewer than two samples.

    Stratified splitting needs at least two rows per class. The offending
    classes and the number of removed rows are printed. When nothing has to
    be removed the input frame is returned unchanged; otherwise the result
    has a fresh 0..n-1 index.
    """
    counts = df[label_col].value_counts()
    singletons = counts[counts < 2]
    if singletons.empty:
        return df

    print("\nWarning: Some classes still have only 1 sample. These rows will be removed:")
    print(singletons.to_string())
    is_singleton = df[label_col].isin(singletons.index)
    dropped = is_singleton.sum()
    print(f"Removed rows: {dropped}\n")
    return df.loc[~is_singleton].reset_index(drop=True)


def compute_macro_metrics(y_true: List[str], y_pred: List[str], labels: List[str]) -> Dict[str, float]:
    """Return accuracy plus macro-averaged precision, recall and F1.

    ``labels`` fixes the set of classes averaged over; undefined per-class
    scores count as 0 (``zero_division=0``).
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    scores = {
        "accuracy": accuracy,
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f_score,
    }
    return {name: float(value) for name, value in scores.items()}


def to_numpy_proba(x) -> Optional[np.ndarray]:
    """Convert probabilities to a NumPy array; ``None`` passes through.

    Torch tensors are detached and moved to the CPU first, anything else goes
    through ``np.asarray``.
    """
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    return None if x is None else np.asarray(x)


def train_setfit_one_fold(
    X_train: List[str],
    y_train: List[str],
    X_eval: List[str],
    class_list: List[str],
    seed: int,
) -> Tuple[np.ndarray, Optional[np.ndarray], float]:
    """Train a fresh SetFit model on one split and predict the evaluation texts.

    Args:
        X_train: Training texts.
        y_train: Training labels, aligned with ``X_train``.
        X_eval: Texts to predict once training has finished.
        class_list: All class names; passed to the model as ``labels``.
        seed: Seed for ``set_all_seeds`` and for the trainer.

    Returns:
        ``(y_pred, y_prob, train_time)``: the predictions as an object array,
        the ``predict_proba`` output as a NumPy array (``None`` if that call
        failed), and the wall-clock duration of ``trainer.train()`` in seconds.
    """
    set_all_seeds(seed)

    train_ds = Dataset.from_dict({"text": X_train, "label": y_train})

    model = SetFitModel.from_pretrained(
        BASE_MODEL,
        labels=class_list,
        head_params={"class_weight": "balanced", "max_iter": 2000},
    )

    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        column_mapping={"text": "text", "label": "label"},
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        num_iterations=NUM_ITERATIONS,
        learning_rate=LEARNING_RATE,
        seed=seed,
    )

    t0 = time.time()
    trainer.train()
    train_time = time.time() - t0

    y_pred = np.asarray(model.predict(X_eval), dtype=object)

    # Probabilities are optional (callers cope with None), so a failure here
    # must not abort the fold -- but it is reported instead of being hidden.
    try:
        y_prob = to_numpy_proba(model.predict_proba(X_eval))
    except Exception as exc:
        print(f"Warning: predict_proba failed ({type(exc).__name__}: {exc}); continuing without probabilities.")
        y_prob = None

    # Drop the references before the next fold and release cached GPU memory.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return y_pred, y_prob, float(train_time)


def plot_and_save_confusion_matrix(cm_norm: np.ndarray, labels: List[str], title: str, out_png: str) -> None:
    """Draw a normalized confusion matrix as an annotated heatmap.

    Every cell is annotated with its value (two decimals); the figure is
    written to ``out_png`` at 200 dpi and then shown.
    """
    tick_positions = range(len(labels))

    plt.figure(figsize=(10, 8))
    plt.imshow(cm_norm, aspect="auto")
    plt.colorbar()
    plt.xticks(tick_positions, labels, rotation=45, ha="right")
    plt.yticks(tick_positions, labels)
    plt.title(title)
    plt.xlabel("Predicted class")
    plt.ylabel("True class")

    # x is the column (predicted class), y is the row (true class)
    n_rows, n_cols = cm_norm.shape[0], cm_norm.shape[1]
    for row in range(n_rows):
        for col in range(n_cols):
            cell = cm_norm[row, col]
            plt.text(col, row, f"{cell:.2f}", ha="center", va="center")

    plt.tight_layout()
    plt.savefig(out_png, dpi=200)
    plt.show()


def plot_roc_pr_micro(task_name: str, y_true: List[str], y_prob: Optional[np.ndarray], class_list: List[str]) -> None:
    """Plot, save and report micro-averaged ROC and precision-recall curves.

    Args:
        task_name: Name used in plot titles and console messages.
        y_true: True class labels.
        y_prob: Class probabilities with one column per entry of
            ``class_list``; ``None`` skips the plots.
        class_list: Class names in the column order of ``y_prob``.

    The PNG files are written to ``OUT_DIR`` and named after the module-level
    ``TASK_NAME``. Nothing is plotted when the probabilities are missing or
    their shape does not match the binarized labels.
    """
    y_prob = to_numpy_proba(y_prob)
    if y_prob is None:
        print(f"{task_name}: no predict_proba values -> skipping ROC/PR.")
        return

    y_true_arr = np.asarray(y_true, dtype=object)
    y_true_bin = label_binarize(y_true_arr, classes=class_list)

    # For two classes label_binarize returns a single column (1 marks
    # class_list[1]); expand it so it lines up with two probability columns.
    if y_true_bin.shape[1] == 1 and len(class_list) == 2:
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    if y_prob.shape != y_true_bin.shape:
        print(
            f"{task_name}: probability shape {y_prob.shape} does not match "
            f"label shape {y_true_bin.shape} -> skipping ROC/PR."
        )
        return

    # Micro-averaging: every (sample, class) cell is one binary decision.
    fpr, tpr, _ = roc_curve(y_true_bin.ravel(), y_prob.ravel())
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC (micro) — {task_name} (AUC={roc_auc:.3f})")
    roc_png = os.path.join(OUT_DIR, f"{TASK_NAME}_roc_micro.png")
    plt.tight_layout()
    plt.savefig(roc_png, dpi=200)
    plt.show()

    prec, rec, _ = precision_recall_curve(y_true_bin.ravel(), y_prob.ravel())
    ap = average_precision_score(y_true_bin, y_prob, average="micro")

    plt.figure()
    plt.plot(rec, prec)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"PR (micro) — {task_name} (AP={ap:.3f})")
    pr_png = os.path.join(OUT_DIR, f"{TASK_NAME}_pr_micro.png")
    plt.tight_layout()
    plt.savefig(pr_png, dpi=200)
    plt.show()

    print(f"{task_name}: ROC-AUC(micro)={roc_auc:.4f} | AP(micro)={ap:.4f}")
    print(f"Plots saved: {roc_png} and {pr_png}")


t_all = time.time()

# Read the labeled sample and the full dataset; strip whitespace from headers.
df_labeled = pd.read_excel(LABELED_PATH)
df_labeled.columns = df_labeled.columns.astype(str).str.strip()

df_full = pd.read_excel(FULL_DATA_PATH)
df_full.columns = df_full.columns.astype(str).str.strip()

# Fail early when a required column is absent from either workbook.
missing_labeled = [c for c in ([LABEL_COL] + FEATURE_COLS) if c not in df_labeled.columns]
if missing_labeled:
    raise ValueError(f"Missing columns in labeled file: {missing_labeled}")

missing_full = [c for c in FEATURE_COLS if c not in df_full.columns]
if missing_full:
    raise ValueError(f"Missing feature columns in full file: {missing_full}")

d = df_labeled[FEATURE_COLS + [LABEL_COL]].copy()
# Drop missing labels BEFORE the string cast: astype(str) turns NaN into the
# literal "nan", after which dropna can no longer find anything.
d = d.dropna(subset=[LABEL_COL])
if d.empty:
    raise ValueError("Too few samples/classes after cleaning: n=0, k=0")
d[LABEL_COL] = d[LABEL_COL].astype(str).str.strip()

d["text"] = build_text(d, FEATURE_COLS)
d["label"] = d[LABEL_COL]

# remove rows without usable text or label, plus textual nan/none labels;
# a text made up of nothing but separator tokens counts as empty
text_content = d["text"].str.replace("[SEP]", "", regex=False).str.strip()
d = d[(text_content != "") & (d["label"] != "")]
d = d[~d["label"].str.lower().isin(["nan", "none"])].reset_index(drop=True)

if len(d) < 30 or d["label"].nunique() < 2:
    raise ValueError(f"Too few samples/classes after cleaning: n={len(d)}, k={d['label'].nunique()}")

# Merge rare classes, then drop whatever still cannot be split.
d["label"] = bundle_rare_classes(d["label"], MIN_SAMPLES_PER_CLASS, RARE_CLASS_LABEL)
d = drop_split_impossible_singletons(d, "label")

class_list = sorted(d["label"].unique().tolist())
X_all = d["text"].tolist()
y_all = d["label"].tolist()

# Print the run configuration.
print(f"\nTask: {TASK_NAME}")
print(f"Samples (cleaned): {len(d)} | Classes: {len(class_list)}")
print(f"Features: {FEATURE_COLS} | Label: {LABEL_COL}")
print(f"Model: {BASE_MODEL}")
print(f"Config: iters={NUM_ITERATIONS}, epochs={NUM_EPOCHS}, bs={BATCH_SIZE}, lr={LEARNING_RATE}")
print(f"Max CV folds: {CV_FOLDS_MAX}")

# The number of folds cannot exceed the size of the smallest class.
min_count = pd.Series(y_all).value_counts().min()
folds_eff = min(CV_FOLDS_MAX, int(min_count))

# Out-of-fold buffers: one prediction and one probability row per sample.
# Probability rows stay NaN until a fold fills them.
N = len(y_all)
K = len(class_list)
oof_pred = np.empty(N, dtype=object)
oof_prob = np.full((N, K), np.nan, dtype=float)
fold_rows = []

X_arr = np.asarray(X_all, dtype=object)
y_arr = np.asarray(y_all, dtype=object)

if folds_eff < 2:
    # Fewer than two folds possible: evaluate on a single 20% holdout.
    print(f"\nCV not meaningful here (folds_eff={folds_eff}). Using a 20% holdout split instead.")
    try:
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED, stratify=y_arr
        )
    except Exception:
        # Stratified split failed; fall back to an unstratified one.
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED, stratify=None
        )

    pred_te, prob_te, tsec = train_setfit_one_fold(
        X_train=X_tr.tolist(),
        y_train=y_tr.tolist(),
        X_eval=X_te.tolist(),
        class_list=class_list,
        seed=SPLIT_SEED,
    )

    oof_pred[idx_te] = pred_te
    prob_te_np = to_numpy_proba(prob_te)
    # Only store probabilities that have exactly one column per class.
    if prob_te_np is not None and prob_te_np.shape == (len(idx_te), K):
        oof_prob[idx_te, :] = prob_te_np

    m = compute_macro_metrics(y_te.tolist(), pred_te.tolist(), class_list)
    fold_rows.append({"fold": 1, "train_time_sec": tsec, **m})

    # Restrict the evaluation arrays to the holdout rows.
    eval_mask = np.zeros(N, dtype=bool)
    eval_mask[idx_te] = True
    y_eval_true = y_arr[eval_mask].tolist()
    y_eval_pred = oof_pred[eval_mask].tolist()
    y_eval_prob = oof_prob[eval_mask, :] if not np.isnan(oof_prob[eval_mask, :]).any() else None

else:
    print(f"\nStarting StratifiedKFold CV with folds_eff={folds_eff}")
    skf = StratifiedKFold(n_splits=folds_eff, shuffle=True, random_state=SPLIT_SEED)

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_arr, y_arr), start=1):
        X_tr = X_arr[tr_idx].tolist()
        y_tr = y_arr[tr_idx].tolist()
        X_te = X_arr[te_idx].tolist()
        y_te = y_arr[te_idx].tolist()

        # Each fold trains with its own seed (SPLIT_SEED + fold number).
        pred_te, prob_te, tsec = train_setfit_one_fold(
            X_train=X_tr,
            y_train=y_tr,
            X_eval=X_te,
            class_list=class_list,
            seed=SPLIT_SEED + fold,
        )

        oof_pred[te_idx] = pred_te

        prob_te_np = to_numpy_proba(prob_te)
        # Only store probabilities that have exactly one column per class.
        if prob_te_np is not None and prob_te_np.shape == (len(te_idx), K):
            oof_prob[te_idx, :] = prob_te_np

        m = compute_macro_metrics(y_te, pred_te.tolist(), class_list)
        fold_rows.append({"fold": fold, "train_time_sec": tsec, **m})

    # All rows are evaluated; probabilities are used only if no NaN is left.
    y_eval_true = y_arr.tolist()
    y_eval_pred = oof_pred.tolist()
    y_eval_prob = None if np.isnan(oof_prob).any() else oof_prob

# Per-fold metric table, printed and saved as CSV below.
fold_df = pd.DataFrame(fold_rows)
print("\nResults per fold:")
print(fold_df[["fold", "accuracy", "precision_macro", "recall_macro", "f1_macro", "train_time_sec"]].to_string(index=False))

# Mean and population standard deviation (ddof=0) across folds.
print("\nMean ± Std:")
print(
    f"Accuracy:  {fold_df['accuracy'].mean():.4f} ± {fold_df['accuracy'].std(ddof=0):.4f}\n"
    f"Precision: {fold_df['precision_macro'].mean():.4f} ± {fold_df['precision_macro'].std(ddof=0):.4f}\n"
    f"Recall:    {fold_df['recall_macro'].mean():.4f} ± {fold_df['recall_macro'].std(ddof=0):.4f}\n"
    f"F1 (macro): {fold_df['f1_macro'].mean():.4f} ± {fold_df['f1_macro'].std(ddof=0):.4f}"
)

fold_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_folds.csv")
fold_df.to_csv(fold_csv, index=False)
print(f"\nFold table saved: {fold_csv}")

# Classification report on the evaluation predictions: text version first.
print("\nClassification Report (OOF):")
report_txt = classification_report(
    y_eval_true, y_eval_pred, labels=class_list, zero_division=0
)
print(report_txt)

report_path = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_txt)
print(f"Report saved: {report_path}")

# Same report again as a dict, so it can be written as a CSV table.
report_dict = classification_report(
    y_eval_true, y_eval_pred, labels=class_list, output_dict=True, zero_division=0
)
report_df = pd.DataFrame(report_dict).transpose()
report_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.csv")
report_df.to_csv(report_csv, index=True)
print(f"Report (CSV) saved: {report_csv}")

# Confusion matrix normalized over the true labels (normalize="true").
cm_norm = confusion_matrix(y_eval_true, y_eval_pred, labels=class_list, normalize="true")
cm_png = os.path.join(OUT_DIR, f"{TASK_NAME}_confusion_matrix_normalized.png")
plot_and_save_confusion_matrix(
    cm_norm=cm_norm,
    labels=class_list,
    title=f"Confusion Matrix (normalized) — {TASK_NAME}",
    out_png=cm_png,
)
print(f"Confusion matrix saved: {cm_png}")

# ROC / PR curves; skipped inside the helper when y_eval_prob is None.
plot_roc_pr_micro(
    task_name=f"{TASK_NAME} (OOF)",
    y_true=y_eval_true,
    y_prob=y_eval_prob,
    class_list=class_list,
)

# Retrain on every cleaned labeled row, then label the full dataset.
print(f"\nFinal training on all labeled data for '{TASK_NAME}' ...")

set_all_seeds(FINAL_TRAIN_SEED)
train_ds_all = Dataset.from_dict({"text": X_all, "label": y_all})

# Same model and trainer settings as in the per-fold training.
model_final = SetFitModel.from_pretrained(
    BASE_MODEL,
    labels=class_list,
    head_params={"class_weight": "balanced", "max_iter": 2000},
)

trainer_final = SetFitTrainer(
    model=model_final,
    train_dataset=train_ds_all,
    column_mapping={"text": "text", "label": "label"},
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    num_iterations=NUM_ITERATIONS,
    learning_rate=LEARNING_RATE,
    seed=FINAL_TRAIN_SEED,
)

t0 = time.time()
trainer_final.train()
print(f"Final training finished after {time.time() - t0:.1f} seconds")

# Build the model input for every row of the full dataset and predict.
X_full = build_text(df_full, FEATURE_COLS).tolist()
preds_full = model_final.predict(X_full)

# Predictions go into a new "<label column>_pred" column.
pred_col = f"{LABEL_COL}_pred"
df_full[pred_col] = np.asarray(preds_full, dtype=object)

full_out = os.path.join(OUT_DIR, "FULL_60000_with_concept_format_predictions.xlsx")
df_full.to_excel(full_out, index=False)

print(f"\nFull predictions saved: {full_out}")
print(f"Total runtime: {time.time() - t_all:.1f} seconds")

# Cuisine Region

In [None]:
# Loading Libraries
import os
import random
import time
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from datasets import Dataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize

from setfit import SetFitModel, SetFitTrainer


# Environment switches: disable wandb.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Input workbooks (labeled sample, full dataset) and the output directory,
# which is created right away if it does not exist yet.
LABELED_PATH = "/content/Sample_500_stratified.xlsx"
FULL_DATA_PATH = "/content/Restaurants_Data.xlsx"
OUT_DIR = "/content/results_cuisine_region_final"
os.makedirs(OUT_DIR, exist_ok=True)


# Checkpoint name handed to SetFitModel.from_pretrained.
BASE_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Forwarded to SetFitTrainer as num_iterations / num_epochs / batch_size /
# learning_rate.
NUM_ITERATIONS = 20
NUM_EPOCHS = 3
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

# Upper bound on CV folds; the effective number is also capped by the size of
# the smallest class. Seeds for the data split and for the final training run.
CV_FOLDS_MAX = 3
SPLIT_SEED = 42
FINAL_TRAIN_SEED = 42

# Classes with fewer than MIN_SAMPLES_PER_CLASS rows are relabeled to
# RARE_CLASS_LABEL by bundle_rare_classes.
MIN_SAMPLES_PER_CLASS = 2
RARE_CLASS_LABEL = "Other / very rare"

# Task name (used in output file names), label column and the text columns
# that build_text concatenates into the model input.
TASK_NAME = "cuisine_region"
LABEL_COL = "cuisine_region"
FEATURE_COLS = ["desc_1", "desc_2"]


def set_all_seeds(seed: int) -> None:
    """Make runs repeatable by seeding ``random``, NumPy and PyTorch alike.

    CUDA generators are included when a GPU is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    gpu_present = torch.cuda.is_available()
    if gpu_present:
        torch.cuda.manual_seed_all(seed)


def build_text(df: pd.DataFrame, feature_cols: List[str]) -> pd.Series:
    """Concatenate the selected text fields into one string per row.

    Missing values become empty strings, each field is stripped, and only
    non-empty fields are joined with ``" [SEP] "``. Because empty fields are
    skipped, a row without any text yields ``""`` rather than bare separator
    tokens, which lets callers filter such rows out. Whitespace runs are
    collapsed to single spaces.

    Args:
        df: Frame containing at least ``feature_cols``.
        feature_cols: Columns to concatenate, in order.

    Returns:
        Series of combined texts, indexed like ``df``.
    """
    X = df[feature_cols].copy()
    for c in feature_cols:
        X[c] = X[c].fillna("").astype(str).str.strip()

    # Row-wise join over plain tuples; also well-defined for an empty frame.
    joined = [
        " [SEP] ".join(part for part in row if part)
        for row in X.itertuples(index=False, name=None)
    ]

    return (
        pd.Series(joined, index=X.index, dtype=object)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )


def bundle_rare_classes(y: pd.Series, min_count: int, rare_label: str) -> pd.Series:
    """Collapse classes with fewer than ``min_count`` rows into ``rare_label``.

    ``y`` is returned as-is when every class is frequent enough.
    """
    frequencies = y.value_counts(dropna=False)
    too_small = frequencies.loc[frequencies < min_count].index
    if too_small.empty:
        return y
    return y.mask(y.isin(too_small), rare_label)


def drop_split_impossible_singletons(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Drop rows belonging to classes with fewer than two samples.

    Stratified CV requires at least 2 samples per class. The affected classes
    and the number of removed rows are printed. The input frame is returned
    unchanged if there is nothing to remove; otherwise the result is
    re-indexed from 0.
    """
    class_sizes = df[label_col].value_counts()
    too_small = class_sizes[class_sizes < 2]
    if too_small.empty:
        return df

    print("\nNote: Some classes still have only 1 sample. These rows will be removed; otherwise CV will fail.")
    print(too_small.to_string())
    to_drop = df[label_col].isin(too_small.index)
    dropped = int(to_drop.sum())
    print(f"Removed rows: {dropped}\n")
    return df.loc[~to_drop].reset_index(drop=True)


def compute_macro_metrics(y_true: List[str], y_pred: List[str], labels: List[str]) -> Dict[str, float]:
    """Compute accuracy and macro-averaged precision / recall / F1.

    The macro average runs over ``labels``; undefined per-class scores are
    counted as 0 (``zero_division=0``).
    """
    accuracy = accuracy_score(y_true, y_pred)
    prf = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    metric_names = ("precision_macro", "recall_macro", "f1_macro")
    result = {"accuracy": float(accuracy)}
    # prf is (precision, recall, fscore, support); support is not reported
    for name, value in zip(metric_names, prf[:3]):
        result[name] = float(value)
    return result


def to_numpy_proba(x) -> Optional[np.ndarray]:
    """Turn ``predict_proba`` output into a NumPy array (``None`` stays ``None``).

    Torch tensors are detached and copied to the CPU; other inputs are passed
    through ``np.asarray``.
    """
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    if x is not None:
        return np.asarray(x)
    return None


def train_setfit_one_fold(
    X_train: List[str],
    y_train: List[str],
    X_eval: List[str],
    class_list: List[str],
    seed: int,
) -> Tuple[np.ndarray, Optional[np.ndarray], float]:
    """Train a fresh SetFit model on one CV fold and predict the held-out texts.

    Args:
        X_train: Training texts.
        y_train: Training labels, aligned with ``X_train``.
        X_eval: Texts to predict once training has finished.
        class_list: All class names; passed to the model as ``labels``.
        seed: Seed for ``set_all_seeds`` and for the trainer.

    Returns:
        ``(y_pred, y_prob, train_time)``: the predictions as an object array,
        the ``predict_proba`` output as a NumPy array (``None`` if that call
        failed), and the wall-clock duration of ``trainer.train()`` in seconds.
    """
    set_all_seeds(seed)

    train_ds = Dataset.from_dict({"text": X_train, "label": y_train})

    model = SetFitModel.from_pretrained(
        BASE_MODEL,
        labels=class_list,
        head_params={"class_weight": "balanced", "max_iter": 2000},
    )

    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        column_mapping={"text": "text", "label": "label"},
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        num_iterations=NUM_ITERATIONS,
        learning_rate=LEARNING_RATE,
        seed=seed,
    )

    t0 = time.time()
    trainer.train()
    train_time = time.time() - t0

    y_pred = np.asarray(model.predict(X_eval), dtype=object)

    # Probabilities are optional (callers cope with None), so a failure here
    # must not abort the fold -- but it is reported instead of being hidden.
    try:
        y_prob = to_numpy_proba(model.predict_proba(X_eval))
    except Exception as exc:
        print(f"Warning: predict_proba failed ({type(exc).__name__}: {exc}); continuing without probabilities.")
        y_prob = None

    # Drop the references before the next fold and release cached GPU memory.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return y_pred, y_prob, float(train_time)


def plot_and_save_confusion_matrix(cm_norm: np.ndarray, labels: List[str], title: str, out_png: str) -> None:
    """Plot a normalized confusion matrix as an annotated heatmap.

    Each cell shows its value with two decimals; the figure is saved to
    ``out_png`` at 200 dpi and then displayed.
    """
    positions = range(len(labels))

    plt.figure(figsize=(10, 8))
    plt.imshow(cm_norm, aspect="auto")
    plt.colorbar()
    plt.xticks(positions, labels, rotation=45, ha="right")
    plt.yticks(positions, labels)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")

    # x is the column (predicted class), y is the row (true class)
    row_count, col_count = cm_norm.shape[0], cm_norm.shape[1]
    for r in range(row_count):
        for c in range(col_count):
            plt.text(c, r, f"{cm_norm[r, c]:.2f}", ha="center", va="center")

    plt.tight_layout()
    plt.savefig(out_png, dpi=200)
    plt.show()


def plot_roc_pr_micro(task_name: str, y_true: List[str], y_prob: Optional[np.ndarray], class_list: List[str]) -> None:
    """Plot, save and report micro-averaged ROC and precision-recall curves.

    Args:
        task_name: Name used in plot titles and console messages.
        y_true: True class labels.
        y_prob: Class probabilities with one column per entry of
            ``class_list``; ``None`` skips the plots.
        class_list: Class names in the column order of ``y_prob``.

    The PNG files are written to ``OUT_DIR`` and named after the module-level
    ``TASK_NAME``. Nothing is plotted when the probabilities are missing or
    their shape does not match the binarized labels.
    """
    y_prob = to_numpy_proba(y_prob)
    if y_prob is None:
        print(f"{task_name}: no probabilities -> skipping ROC/PR.")
        return

    y_true_arr = np.asarray(y_true, dtype=object)
    y_true_bin = label_binarize(y_true_arr, classes=class_list)

    # For two classes label_binarize returns a single column (1 marks
    # class_list[1]); expand it so it lines up with two probability columns.
    if y_true_bin.shape[1] == 1 and len(class_list) == 2:
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    if y_prob.shape != y_true_bin.shape:
        print(
            f"{task_name}: probability shape {y_prob.shape} does not match "
            f"label shape {y_true_bin.shape} -> skipping ROC/PR."
        )
        return

    # Micro-averaging: every (sample, class) cell is one binary decision.
    fpr, tpr, _ = roc_curve(y_true_bin.ravel(), y_prob.ravel())
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC (micro) — {task_name} (AUC={roc_auc:.3f})")
    roc_png = os.path.join(OUT_DIR, f"{TASK_NAME}_roc_micro.png")
    plt.tight_layout()
    plt.savefig(roc_png, dpi=200)
    plt.show()

    prec, rec, _ = precision_recall_curve(y_true_bin.ravel(), y_prob.ravel())
    ap = average_precision_score(y_true_bin, y_prob, average="micro")

    plt.figure()
    plt.plot(rec, prec)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"PR (micro) — {task_name} (AP={ap:.3f})")
    pr_png = os.path.join(OUT_DIR, f"{TASK_NAME}_pr_micro.png")
    plt.tight_layout()
    plt.savefig(pr_png, dpi=200)
    plt.show()

    print(f"{task_name}: ROC-AUC(micro)={roc_auc:.4f} | AP(micro)={ap:.4f}")
    print(f"Plots saved:\n- {roc_png}\n- {pr_png}")


t_all = time.time()

# Read the labeled sample and the full dataset; strip whitespace from headers.
df_labeled = pd.read_excel(LABELED_PATH)
df_labeled.columns = df_labeled.columns.astype(str).str.strip()

df_full = pd.read_excel(FULL_DATA_PATH)
df_full.columns = df_full.columns.astype(str).str.strip()

# Fail early when a required column is absent from either workbook.
missing_labeled = [c for c in ([LABEL_COL] + FEATURE_COLS) if c not in df_labeled.columns]
if missing_labeled:
    raise ValueError(f"Missing columns in labeled file: {missing_labeled}")

missing_full = [c for c in FEATURE_COLS if c not in df_full.columns]
if missing_full:
    raise ValueError(f"Missing feature columns in full file: {missing_full}")

d = df_labeled[FEATURE_COLS + [LABEL_COL]].copy()
# Drop missing labels BEFORE the string cast: astype(str) turns NaN into the
# literal "nan", after which dropna can no longer find anything.
d = d.dropna(subset=[LABEL_COL])
if d.empty:
    raise ValueError("Too few samples/classes after cleaning: n=0, k=0")
d[LABEL_COL] = d[LABEL_COL].astype(str).str.strip()

d["text"] = build_text(d, FEATURE_COLS)
d["label"] = d[LABEL_COL]

# remove rows without usable text or label, plus textual nan/none labels;
# a text made up of nothing but separator tokens counts as empty
text_content = d["text"].str.replace("[SEP]", "", regex=False).str.strip()
d = d[(text_content != "") & (d["label"] != "")]
d = d[~d["label"].str.lower().isin(["nan", "none"])].reset_index(drop=True)

if len(d) < 30 or d["label"].nunique() < 2:
    raise ValueError(f"Too few samples/classes after cleaning: n={len(d)}, k={d['label'].nunique()}")

# Merge rare classes, then drop whatever still cannot be split.
d["label"] = bundle_rare_classes(d["label"], MIN_SAMPLES_PER_CLASS, RARE_CLASS_LABEL)
d = drop_split_impossible_singletons(d, "label")

class_list = sorted(d["label"].unique().tolist())
X_all = d["text"].tolist()
y_all = d["label"].tolist()

# Print the run configuration.
print(f"\nTask: {TASK_NAME}")
print(f"Samples (cleaned): {len(d)} | Classes: {len(class_list)}")
print(f"Features: {FEATURE_COLS} | Label: {LABEL_COL}")
print(f"Model: {BASE_MODEL}")
print(f"Config: iters={NUM_ITERATIONS}, epochs={NUM_EPOCHS}, bs={BATCH_SIZE}, lr={LEARNING_RATE}")

# The number of folds cannot exceed the size of the smallest class.
min_count = pd.Series(y_all).value_counts().min()
folds_eff = min(CV_FOLDS_MAX, int(min_count))

# Out-of-fold buffers: one prediction and one probability row per sample.
# Probability rows stay NaN until a fold fills them.
N = len(y_all)
K = len(class_list)
oof_pred = np.empty(N, dtype=object)
oof_prob = np.full((N, K), np.nan, dtype=float)
fold_rows = []

X_arr = np.asarray(X_all, dtype=object)
y_arr = np.asarray(y_all, dtype=object)

if folds_eff < 2:
    # Fewer than two folds possible: evaluate on a single 20% holdout.
    print(f"\nCV not feasible here (folds_eff={folds_eff}). Using 20% holdout split.")
    try:
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED, stratify=y_arr
        )
    except Exception:
        # Stratified split failed; fall back to an unstratified one.
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED
        )

    pred_te, prob_te, tsec = train_setfit_one_fold(
        X_train=X_tr.tolist(),
        y_train=y_tr.tolist(),
        X_eval=X_te.tolist(),
        class_list=class_list,
        seed=SPLIT_SEED,
    )

    oof_pred[idx_te] = pred_te
    prob_te_np = to_numpy_proba(prob_te)
    # Only store probabilities that have exactly one column per class.
    if prob_te_np is not None and prob_te_np.shape == (len(idx_te), K):
        oof_prob[idx_te, :] = prob_te_np

    m = compute_macro_metrics(y_te.tolist(), pred_te.tolist(), class_list)
    fold_rows.append({"fold": 1, "train_time_sec": tsec, **m})

    # Restrict the evaluation arrays to the holdout rows.
    eval_mask = np.zeros(N, dtype=bool)
    eval_mask[idx_te] = True
    y_eval_true = y_arr[eval_mask].tolist()
    y_eval_pred = oof_pred[eval_mask].tolist()
    y_eval_prob = oof_prob[eval_mask, :] if not np.isnan(oof_prob[eval_mask, :]).any() else None

else:
    print(f"\nStarting StratifiedKFold CV with folds_eff={folds_eff}")
    skf = StratifiedKFold(n_splits=folds_eff, shuffle=True, random_state=SPLIT_SEED)

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_arr, y_arr), start=1):
        X_tr = X_arr[tr_idx].tolist()
        y_tr = y_arr[tr_idx].tolist()
        X_te = X_arr[te_idx].tolist()
        y_te = y_arr[te_idx].tolist()

        # Each fold trains with its own seed (SPLIT_SEED + fold number).
        pred_te, prob_te, tsec = train_setfit_one_fold(
            X_train=X_tr,
            y_train=y_tr,
            X_eval=X_te,
            class_list=class_list,
            seed=SPLIT_SEED + fold,
        )

        oof_pred[te_idx] = pred_te

        prob_te_np = to_numpy_proba(prob_te)
        # Only store probabilities that have exactly one column per class.
        if prob_te_np is not None and prob_te_np.shape == (len(te_idx), K):
            oof_prob[te_idx, :] = prob_te_np

        m = compute_macro_metrics(y_te, pred_te.tolist(), class_list)
        fold_rows.append({"fold": fold, "train_time_sec": tsec, **m})

    # All rows are evaluated; probabilities are used only if no NaN is left.
    y_eval_true = y_arr.tolist()
    y_eval_pred = oof_pred.tolist()
    y_eval_prob = None if np.isnan(oof_prob).any() else oof_prob

# Per-fold metric table, printed and saved as CSV below.
fold_df = pd.DataFrame(fold_rows)
print("\nResults per fold:")
print(fold_df[["fold", "accuracy", "precision_macro", "recall_macro", "f1_macro", "train_time_sec"]].to_string(index=False))

# Mean and population standard deviation (ddof=0) across folds.
print("\nMean ± Std:")
print(
    f"Accuracy:  {fold_df['accuracy'].mean():.4f} ± {fold_df['accuracy'].std(ddof=0):.4f}\n"
    f"Precision: {fold_df['precision_macro'].mean():.4f} ± {fold_df['precision_macro'].std(ddof=0):.4f}\n"
    f"Recall:    {fold_df['recall_macro'].mean():.4f} ± {fold_df['recall_macro'].std(ddof=0):.4f}\n"
    f"F1 (macro): {fold_df['f1_macro'].mean():.4f} ± {fold_df['f1_macro'].std(ddof=0):.4f}"
)

fold_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_folds.csv")
fold_df.to_csv(fold_csv, index=False)
print(f"\nFold table saved: {fold_csv}")

# Classification report on the evaluation predictions: text version first.
print("\nClassification Report (OOF):")
report_txt = classification_report(y_eval_true, y_eval_pred, labels=class_list, zero_division=0)
print(report_txt)

report_path = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_txt)
print(f"Report saved: {report_path}")

# Same report again as a dict, so it can be written as a CSV table.
report_dict = classification_report(
    y_eval_true, y_eval_pred, labels=class_list, output_dict=True, zero_division=0
)
report_df = pd.DataFrame(report_dict).transpose()
report_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.csv")
report_df.to_csv(report_csv, index=True)
print(f"Report (CSV) saved: {report_csv}")

# Confusion matrix normalized over the true labels (normalize="true").
cm_norm = confusion_matrix(y_eval_true, y_eval_pred, labels=class_list, normalize="true")
cm_png = os.path.join(OUT_DIR, f"{TASK_NAME}_confusion_matrix_normalized.png")
plot_and_save_confusion_matrix(
    cm_norm=cm_norm,
    labels=class_list,
    title=f"Confusion Matrix (normalized) — {TASK_NAME}",
    out_png=cm_png,
)
print(f"Confusion matrix saved: {cm_png}")

# ROC / PR curves; skipped inside the helper when y_eval_prob is None.
plot_roc_pr_micro(
    task_name=f"{TASK_NAME} (OOF)",
    y_true=y_eval_true,
    y_prob=y_eval_prob,
    class_list=class_list,
)

# Retrain on every cleaned labeled row, then label the full dataset.
print(f"\nFinal training on all labeled data for '{TASK_NAME}' ...")

set_all_seeds(FINAL_TRAIN_SEED)
train_ds_all = Dataset.from_dict({"text": X_all, "label": y_all})

# Same model and trainer settings as in the per-fold training.
model_final = SetFitModel.from_pretrained(
    BASE_MODEL,
    labels=class_list,
    head_params={"class_weight": "balanced", "max_iter": 2000},
)

trainer_final = SetFitTrainer(
    model=model_final,
    train_dataset=train_ds_all,
    column_mapping={"text": "text", "label": "label"},
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    num_iterations=NUM_ITERATIONS,
    learning_rate=LEARNING_RATE,
    seed=FINAL_TRAIN_SEED,
)

t0 = time.time()
trainer_final.train()
print(f"Final training finished after {time.time() - t0:.1f} seconds")

# Build the model input for every row of the full dataset and predict.
X_full = build_text(df_full, FEATURE_COLS).tolist()
preds_full = model_final.predict(X_full)

# Predictions go into a new "<label column>_pred" column.
pred_col = f"{LABEL_COL}_pred"
df_full[pred_col] = np.asarray(preds_full, dtype=object)

full_out = os.path.join(OUT_DIR, f"FULL_60000_with_{TASK_NAME}_predictions.xlsx")
df_full.to_excel(full_out, index=False)

print(f"\nFull predictions saved: {full_out}")
print(f"Total runtime: {time.time() - t_all:.1f} seconds")

# Opening_Hours

In [None]:
# Loading libraries
import os
import random
import time
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from datasets import Dataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize

from setfit import SetFitModel, SetFitTrainer


# Environment switches: disable wandb prompts and tokenizer warnings.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Input workbooks (labeled sample, full dataset) and the output directory,
# which is created right away if it does not exist yet.
LABELED_PATH = "/content/Sample_500_stratified.xlsx"
FULL_DATA_PATH = "/content/Restaurants_Data.xlsx"
OUT_DIR = "/content/results_Opening_hours_final"
os.makedirs(OUT_DIR, exist_ok=True)


# Checkpoint name handed to SetFitModel.from_pretrained.
BASE_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Forwarded to SetFitTrainer as num_iterations / num_epochs / batch_size /
# learning_rate.
NUM_ITERATIONS = 20
NUM_EPOCHS = 3
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

# CV fold limit and seeds.
# NOTE(review): their consumers are not visible in this part of the file;
# presumably used like in the sections above -- confirm.
CV_FOLDS_MAX = 3
SPLIT_SEED = 42
FINAL_TRAIN_SEED = 42

# Threshold and fallback label, presumably passed to bundle_rare_classes.
MIN_SAMPLES_PER_CLASS = 2
RARE_CLASS_LABEL = "Other / very rare"

# Task name, label column and the single text column used as model input.
TASK_NAME = "opening_Hours"
LABEL_COL = "opening_class_label"
FEATURE_COLS = ["opening_hours"]


def set_all_seeds(seed: int) -> None:
    """Apply one seed to ``random``, NumPy and PyTorch (CUDA included if present)."""
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def build_text(df: pd.DataFrame, feature_cols: List[str]) -> pd.Series:
    """Concatenate the selected feature columns into one text string per row.

    Missing values become empty strings, each field is stripped, and only
    non-empty fields are joined with ``" [SEP] "``. Because empty fields are
    skipped, a row without any text yields ``""`` rather than bare separator
    tokens. Whitespace runs are collapsed to single spaces.

    Args:
        df: Frame containing at least ``feature_cols``.
        feature_cols: Columns to concatenate, in order.

    Returns:
        Series of combined texts, indexed like ``df``.
    """
    X = df[feature_cols].copy()
    for c in feature_cols:
        X[c] = X[c].fillna("").astype(str).str.strip()

    # Row-wise join over plain tuples; also well-defined for an empty frame.
    joined = [
        " [SEP] ".join(part for part in row if part)
        for row in X.itertuples(index=False, name=None)
    ]

    return (
        pd.Series(joined, index=X.index, dtype=object)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )


def bundle_rare_classes(y: pd.Series, min_count: int, rare_label: str) -> pd.Series:
    """Replace classes seen fewer than ``min_count`` times with ``rare_label``.

    When no class falls below the threshold, ``y`` is returned untouched.
    """
    class_counts = y.value_counts(dropna=False)
    small_classes = class_counts.loc[class_counts < min_count].index
    if small_classes.empty:
        return y
    belongs_to_small = y.isin(small_classes)
    return y.mask(belongs_to_small, rare_label)


def drop_split_impossible_singletons(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Remove rows whose class has fewer than two samples.

    StratifiedKFold requires at least 2 samples per class. The affected
    classes and the number of removed rows are printed. The input frame is
    returned unchanged if nothing has to go; otherwise the result is
    re-indexed from 0.
    """
    per_class = df[label_col].value_counts()
    undersized = per_class[per_class < 2]
    if undersized.empty:
        return df

    print("\nThere are classes with fewer than 2 samples. These rows will be removed for CV/training:")
    print(undersized.to_string())
    removable = df[label_col].isin(undersized.index)
    dropped = int(removable.sum())
    print(f"Removed rows: {dropped}\n")
    return df.loc[~removable].reset_index(drop=True)


def compute_macro_metrics(y_true: List[str], y_pred: List[str], labels: List[str]) -> Dict[str, float]:
    """Return accuracy and macro-averaged precision, recall and F1 as floats.

    The macro average runs over ``labels``; undefined per-class scores are
    counted as 0 (``zero_division=0``).
    """
    overall_accuracy = accuracy_score(y_true, y_pred)
    macro_p, macro_r, macro_f1, _unused = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    metrics: Dict[str, float] = {}
    metrics["accuracy"] = float(overall_accuracy)
    metrics["precision_macro"] = float(macro_p)
    metrics["recall_macro"] = float(macro_r)
    metrics["f1_macro"] = float(macro_f1)
    return metrics


def to_numpy_proba(x) -> Optional[np.ndarray]:
    """Convert a ``predict_proba`` result to a NumPy array.

    Torch tensors are detached and moved to the CPU first; ``None`` is passed
    through unchanged; anything else goes through ``np.asarray``.
    """
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    return None if x is None else np.asarray(x)


def train_setfit_one_fold(
    X_train: List[str],
    y_train: List[str],
    X_eval: List[str],
    class_list: List[str],
    seed: int,
) -> Tuple[np.ndarray, Optional[np.ndarray], float]:
    """Train one SetFit model and score it on an evaluation split.

    Args:
        X_train: Training texts.
        y_train: Training labels, aligned with ``X_train``.
        X_eval: Texts to predict after training.
        class_list: Label names handed to the model.
        seed: Seed for the global RNGs and the trainer.

    Returns:
        ``(predictions, probabilities, train_seconds)``. ``probabilities`` is
        ``None`` when ``predict_proba`` raises.
    """
    set_all_seeds(seed)

    train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
    model = SetFitModel.from_pretrained(
        BASE_MODEL,
        labels=class_list,
        head_params={"class_weight": "balanced", "max_iter": 2000},
    )
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        column_mapping={"text": "text", "label": "label"},
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        num_iterations=NUM_ITERATIONS,
        learning_rate=LEARNING_RATE,
        seed=seed,
    )

    started = time.time()
    trainer.train()
    elapsed = time.time() - started

    predictions = np.asarray(model.predict(X_eval), dtype=object)

    # Probabilities are best-effort: any failure simply yields None.
    try:
        probabilities = to_numpy_proba(model.predict_proba(X_eval))
    except Exception:
        probabilities = None

    # Drop the references and release cached GPU memory before the next fit.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return predictions, probabilities, float(elapsed)



# Main Execution


t_all = time.time()

# Load the labeled sample and the full dataset that is scored at the end.
# Column names are cast to str and stripped so the checks below see clean names.
df_labeled = pd.read_excel(LABELED_PATH)
df_labeled.columns = df_labeled.columns.astype(str).str.strip()

df_full = pd.read_excel(FULL_DATA_PATH)
df_full.columns = df_full.columns.astype(str).str.strip()

# Fail fast when a required column is missing.
missing_labeled = [c for c in ([LABEL_COL] + FEATURE_COLS) if c not in df_labeled.columns]
if missing_labeled:
    raise ValueError(f"Missing columns in labeled file: {missing_labeled}")

missing_full = [c for c in FEATURE_COLS if c not in df_full.columns]
if missing_full:
    raise ValueError(f"Missing feature columns in full file: {missing_full}")

# Drop rows without a label BEFORE casting to str: astype(str) can turn NaN into
# the literal string "nan", after which dropna() finds nothing left to remove.
d = df_labeled[FEATURE_COLS + [LABEL_COL]].dropna(subset=[LABEL_COL]).copy()
d[LABEL_COL] = d[LABEL_COL].astype(str).str.strip()

d["text"] = build_text(d, FEATURE_COLS)
d["label"] = d[LABEL_COL].astype(str).str.strip()

# Remove empty strings and textual "nan"/"none"
d = d[(d["text"].str.strip() != "") & (d["label"].str.strip() != "")]
d = d[~d["label"].str.lower().isin(["nan", "none"])].reset_index(drop=True)

if len(d) < 30 or d["label"].nunique() < 2:
    raise ValueError(f"Too few samples/classes after cleaning: n={len(d)}, k={d['label'].nunique()}")

# Merge rare classes into one fallback label, then drop any class that still has a single row.
d["label"] = bundle_rare_classes(d["label"], MIN_SAMPLES_PER_CLASS, RARE_CLASS_LABEL)
d = drop_split_impossible_singletons(d, "label")

class_list = sorted(d["label"].unique().tolist())
X_all = d["text"].tolist()
y_all = d["label"].tolist()

print(f"\nTask: {TASK_NAME}")
print(f"Samples (cleaned): {len(d)} | Classes: {len(class_list)}")
print(f"Features: {FEATURE_COLS} | Label: {LABEL_COL}")
print(f"Model: {BASE_MODEL}")
print(f"Config: iters={NUM_ITERATIONS}, epochs={NUM_EPOCHS}, bs={BATCH_SIZE}, lr={LEARNING_RATE}")
print(f"Max CV folds: {CV_FOLDS_MAX}")

print(f"\nFinal training on all labeled data for '{TASK_NAME}' ...")

# Re-seed right before the final fit so it is reproducible.
set_all_seeds(FINAL_TRAIN_SEED)
train_ds_all = Dataset.from_dict({"text": X_all, "label": y_all})

model_final = SetFitModel.from_pretrained(
    BASE_MODEL,
    labels=class_list,
    head_params={"class_weight": "balanced", "max_iter": 2000},
)

trainer_final = SetFitTrainer(
    model=model_final,
    train_dataset=train_ds_all,
    column_mapping={"text": "text", "label": "label"},
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    num_iterations=NUM_ITERATIONS,
    learning_rate=LEARNING_RATE,
    seed=FINAL_TRAIN_SEED,
)

t0 = time.time()
trainer_final.train()
print(f"Final training finished after {time.time() - t0:.1f} seconds")

# Score every row of the full dataset with the final model.
X_full = build_text(df_full, FEATURE_COLS).tolist()
preds_full = model_final.predict(X_full)

pred_col = f"{LABEL_COL}_pred"
df_full[pred_col] = np.asarray(preds_full, dtype=object)

# Name the export after this task. The previous literal said "opening_class",
# a leftover from another task that mislabeled the concept_format predictions.
full_out = os.path.join(OUT_DIR, f"FULL_60000_with_{TASK_NAME}_predictions.xlsx")
df_full.to_excel(full_out, index=False)

print(f"\nFull predictions saved:\n- {full_out}")
print(f"Total runtime: {time.time() - t_all:.1f} seconds")

# Services

In [None]:
# Loading libraries
import os
import random
import time
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from datasets import Dataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize

from setfit import SetFitModel, SetFitTrainer


# Keep notebook output quiet: switch off Weights & Biases and the tokenizer parallelism setting
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "disabled",
        "WANDB_SILENT": "true",
        "TOKENIZERS_PARALLELISM": "false",
    }
)


# Where the data is read from and where results are written
LABELED_PATH = "/content/Sample_500_stratified.xlsx"
FULL_DATA_PATH = "/content/Restaurants_Data.xlsx"
OUT_DIR = "/content/results_services_label_final"
os.makedirs(OUT_DIR, exist_ok=True)


# Sentence-transformer backbone and SetFit training hyperparameters
BASE_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

NUM_ITERATIONS = 20
NUM_EPOCHS = 2
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

# Cross-validation fold cap and seeds
CV_FOLDS_MAX = 3
SPLIT_SEED = 42
FINAL_TRAIN_SEED = 42

# Classes below this count are merged into RARE_CLASS_LABEL
MIN_SAMPLES_PER_CLASS = 2
RARE_CLASS_LABEL = "Other / very rare"

# Which label to learn and which columns form the input text
TASK_NAME = "Services_Label"
LABEL_COL = "Services_Label"
FEATURE_COLS = ["services"]


def set_all_seeds(seed: int) -> None:
    """Apply one seed to Python's ``random``, NumPy and PyTorch.

    When CUDA is available, all CUDA devices are seeded too.

    Args:
        seed: Integer seed handed to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def build_text(df: pd.DataFrame, feature_cols: List[str]) -> pd.Series:
    """Build one model-input string per row from the selected columns.

    Each column is NaN-filled, cast to ``str`` and stripped; the fields are
    joined with ``" [SEP] "``, whitespace runs collapse to one space and the
    result is stripped.

    Args:
        df: Frame containing every column listed in ``feature_cols``.
        feature_cols: Columns to concatenate, in output order.

    Returns:
        A Series of strings aligned to ``df.index``.
    """
    cleaned = df[feature_cols].copy()
    for name in feature_cols:
        filled = cleaned[name].fillna("")
        cleaned[name] = filled.astype(str).str.strip()

    separator = " [SEP] "
    joined = cleaned.agg(separator.join, axis=1).astype(str)
    collapsed = joined.str.replace(r"\s+", " ", regex=True)
    return collapsed.str.strip()


def bundle_rare_classes(y: pd.Series, min_count: int, rare_label: str) -> pd.Series:
    """Collapse every label seen fewer than ``min_count`` times into ``rare_label``.

    Args:
        y: Label series; missing values are counted as their own class.
        min_count: Minimum number of occurrences a label needs to be kept.
        rare_label: Replacement used for every label below the threshold.

    Returns:
        ``y`` itself when no label is rare, otherwise a new series with the
        rare labels replaced.
    """
    counts = y.value_counts(dropna=False)
    rare_values = counts.index[(counts < min_count).to_numpy()]
    if rare_values.empty:
        return y
    is_rare = y.isin(rare_values)
    return y.mask(is_rare, rare_label)


def drop_split_impossible_singletons(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Drop rows whose label appears only once.

    StratifiedKFold needs at least two samples per class. The affected
    classes and the number of removed rows are printed.

    Args:
        df: Frame holding the labeled rows.
        label_col: Name of the label column in ``df``.

    Returns:
        ``df`` itself when every class has at least two rows, otherwise a
        filtered copy with a fresh 0..n-1 index.
    """
    counts = df[label_col].value_counts()
    singletons = counts[counts < 2]
    if singletons.empty:
        return df

    print("\nThere are classes with fewer than 2 samples. These rows must be removed, otherwise CV will fail:")
    print(singletons.to_string())

    is_singleton = df[label_col].isin(singletons.index)
    dropped = int(is_singleton.sum())
    print(f"Removed rows: {dropped}\n")
    return df.loc[~is_singleton].reset_index(drop=True)


def compute_macro_metrics(y_true: List[str], y_pred: List[str], labels: List[str]) -> Dict[str, float]:
    """Return accuracy and macro-averaged precision/recall/F1 as plain floats.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Label set over which the macro average is taken.

    Returns:
        Dict with the keys ``accuracy``, ``precision_macro``,
        ``recall_macro`` and ``f1_macro``.
    """
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f_score,
    }
    return {name: float(value) for name, value in scores.items()}


def to_numpy_proba(x) -> Optional[np.ndarray]:
    """Turn a ``predict_proba`` result into a NumPy array.

    Torch tensors are detached and moved to the CPU first; ``None`` is passed
    through unchanged; anything else goes through ``np.asarray``.
    """
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    return None if x is None else np.asarray(x)


def train_setfit_one_fold(
    X_train: List[str],
    y_train: List[str],
    X_eval: List[str],
    class_list: List[str],
    seed: int,
) -> Tuple[np.ndarray, Optional[np.ndarray], float]:
    """Fit a single SetFit model and predict the evaluation texts.

    Args:
        X_train: Training texts.
        y_train: Training labels, aligned with ``X_train``.
        X_eval: Texts to predict after training.
        class_list: Label names handed to the model.
        seed: Seed for the global RNGs and the trainer.

    Returns:
        ``(predictions, probabilities, train_seconds)``. ``probabilities`` is
        ``None`` when ``predict_proba`` raises.
    """
    set_all_seeds(seed)

    train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
    model = SetFitModel.from_pretrained(
        BASE_MODEL,
        labels=class_list,
        head_params={"class_weight": "balanced", "max_iter": 2000},
    )
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        column_mapping={"text": "text", "label": "label"},
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        num_iterations=NUM_ITERATIONS,
        learning_rate=LEARNING_RATE,
        seed=seed,
    )

    started = time.time()
    trainer.train()
    elapsed = time.time() - started

    predictions = np.asarray(model.predict(X_eval), dtype=object)

    # Probabilities are best-effort: any failure simply yields None.
    try:
        probabilities = to_numpy_proba(model.predict_proba(X_eval))
    except Exception:
        probabilities = None

    # Drop the references and release cached GPU memory before the next fit.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return predictions, probabilities, float(elapsed)


def plot_and_save_confusion_matrix(cm_norm: np.ndarray, labels: List[str], title: str, out_png: str) -> None:
    """Draw a confusion matrix as an annotated heatmap, save it and show it.

    Args:
        cm_norm: 2-D matrix of values; each cell is printed with two decimals.
        labels: Tick labels used on both axes.
        title: Figure title.
        out_png: Path the figure is written to (200 dpi).
    """
    plt.figure(figsize=(10, 8))
    plt.imshow(cm_norm, aspect="auto")
    plt.colorbar()

    tick_positions = range(len(labels))
    plt.xticks(tick_positions, labels, rotation=45, ha="right")
    plt.yticks(tick_positions, labels)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")

    # Annotate every cell; x is the column (predicted), y is the row (true).
    for row, col in np.ndindex(*cm_norm.shape[:2]):
        plt.text(col, row, f"{cm_norm[row, col]:.2f}", ha="center", va="center")

    plt.tight_layout()
    plt.savefig(out_png, dpi=200)
    plt.show()


def plot_roc_pr_micro(task_name: str, y_true: List[str], y_prob: Optional[np.ndarray], class_list: List[str]) -> None:
    """Plot micro-averaged ROC and precision-recall curves and save them as PNGs.

    The plots are written to ``OUT_DIR`` using ``TASK_NAME`` in the file name.
    Nothing is plotted when no probabilities are available or when their shape
    does not match the binarized labels.

    Args:
        task_name: Name used in plot titles and console messages.
        y_true: Ground-truth labels.
        y_prob: Class probabilities with one column per entry of
            ``class_list`` (assumed to be in the same order), or ``None``.
        class_list: Ordered class names used to binarize ``y_true``.
    """
    y_prob = to_numpy_proba(y_prob)
    if y_prob is None:
        print(f"{task_name}: no probabilities -> ROC/PR skipped.")
        return

    if y_prob.ndim == 1:
        y_prob = y_prob.reshape(-1, 1)

    y_true_arr = np.asarray(y_true, dtype=object)
    y_true_bin = label_binarize(y_true_arr, classes=class_list)

    # With exactly two classes label_binarize returns a single indicator column
    # (1 == class_list[1]). Expand it to two columns so it lines up with an
    # (n, 2) probability matrix instead of failing on mismatched lengths.
    if y_true_bin.shape[1] == 1 and y_prob.shape[1] == 2:
        y_true_bin = np.hstack([1 - y_true_bin, y_true_bin])

    if y_true_bin.shape != y_prob.shape:
        print(
            f"{task_name}: shape mismatch (y_true={y_true_bin.shape}, "
            f"y_prob={y_prob.shape}) -> ROC/PR skipped."
        )
        return

    # Micro-averaging: every (sample, class) pair is one binary decision.
    fpr, tpr, _ = roc_curve(y_true_bin.ravel(), y_prob.ravel())
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC (micro) — {task_name} (AUC={roc_auc:.3f})")
    roc_png = os.path.join(OUT_DIR, f"{TASK_NAME}_roc_micro.png")
    plt.tight_layout()
    plt.savefig(roc_png, dpi=200)
    plt.show()
    plt.close()  # release the figure so repeated calls do not accumulate open figures

    prec, rec, _ = precision_recall_curve(y_true_bin.ravel(), y_prob.ravel())
    ap = average_precision_score(y_true_bin, y_prob, average="micro")

    plt.figure()
    plt.plot(rec, prec)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(f"PR (micro) — {task_name} (AP={ap:.3f})")
    pr_png = os.path.join(OUT_DIR, f"{TASK_NAME}_pr_micro.png")
    plt.tight_layout()
    plt.savefig(pr_png, dpi=200)
    plt.show()
    plt.close()

    print(f"{task_name}: ROC-AUC(micro)={roc_auc:.4f} | AP(micro)={ap:.4f}")
    print(f"Plots saved:\n- {roc_png}\n- {pr_png}")


t_all = time.time()

# Load labeled training sample + full dataset (to run predictions later).
# Column names are cast to str and stripped so the checks below see clean names.
df_labeled = pd.read_excel(LABELED_PATH)
df_labeled.columns = df_labeled.columns.astype(str).str.strip()

df_full = pd.read_excel(FULL_DATA_PATH)
df_full.columns = df_full.columns.astype(str).str.strip()

# Basic schema validation early (fail fast if something is missing)
missing_labeled = [c for c in ([LABEL_COL] + FEATURE_COLS) if c not in df_labeled.columns]
if missing_labeled:
    raise ValueError(f"Missing columns in labeled file: {missing_labeled}")

missing_full = [c for c in FEATURE_COLS if c not in df_full.columns]
if missing_full:
    raise ValueError(f"Missing feature columns in full file: {missing_full}")

# Keep only the relevant columns. Rows without a label are dropped BEFORE the
# cast to str: astype(str) can turn NaN into the literal string "nan", after
# which dropna() finds nothing left to remove.
d = df_labeled[FEATURE_COLS + [LABEL_COL]].dropna(subset=[LABEL_COL]).copy()
d[LABEL_COL] = d[LABEL_COL].astype(str).str.strip()

d["text"] = build_text(d, FEATURE_COLS)
d["label"] = d[LABEL_COL].astype(str).str.strip()

# Remove empty strings and textual "nan/none"
d = d[(d["text"].str.strip() != "") & (d["label"].str.strip() != "")]
d = d[~d["label"].str.lower().isin(["nan", "none"])].reset_index(drop=True)

if len(d) < 30 or d["label"].nunique() < 2:
    raise ValueError(f"Too few samples/classes after cleaning: n={len(d)}, k={d['label'].nunique()}")

# Bundle rare classes to stabilize splitting and training
d["label"] = bundle_rare_classes(d["label"], MIN_SAMPLES_PER_CLASS, RARE_CLASS_LABEL)

# Remove any remaining singletons (stratified CV would fail otherwise)
d = drop_split_impossible_singletons(d, "label")

class_list = sorted(d["label"].unique().tolist())
X_all = d["text"].tolist()
y_all = d["label"].tolist()

print(f"\nTask: {TASK_NAME}")
print(f"Samples (cleaned): {len(d)} | Classes: {len(class_list)}")
print(f"Features: {FEATURE_COLS} | Label: {LABEL_COL}")
print(f"Model: {BASE_MODEL}")
print(f"Config: iters={NUM_ITERATIONS}, epochs={NUM_EPOCHS}, bs={BATCH_SIZE}, lr={LEARNING_RATE}")
print(f"Max CV folds: {CV_FOLDS_MAX}")

# Evaluation: out-of-fold (OOF) predictions via StratifiedKFold, or a single holdout split as fallback.
# The fold count is capped by the smallest class size so no class has fewer samples than folds.
min_count = pd.Series(y_all).value_counts().min()
folds_eff = min(CV_FOLDS_MAX, int(min_count))

# OOF buffers: one predicted label per sample and one probability row per sample.
# Probabilities start as NaN so rows that were never filled can be detected later.
N = len(y_all)
K = len(class_list)
oof_pred = np.empty(N, dtype=object)
oof_prob = np.full((N, K), np.nan, dtype=float)
fold_rows = []

# Object arrays allow index-based selection of the texts/labels per fold.
X_arr = np.asarray(X_all, dtype=object)
y_arr = np.asarray(y_all, dtype=object)

# If we cannot do meaningful CV, use a simple holdout split; otherwise run StratifiedKFold
# NOTE(review): drop_split_impossible_singletons is applied before this point, so min_count
# should already be >= 2 and this branch looks like a safety net only — confirm.
if folds_eff < 2:
    print(f"\nCV is not meaningful here (folds_eff={folds_eff}). Using a 20% holdout split.")
    # Try a stratified split first; on any error fall back to a plain random split.
    try:
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED, stratify=y_arr
        )
    except Exception:
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED
        )

    pred_te, prob_te, tsec = train_setfit_one_fold(
        X_train=X_tr.tolist(),
        y_train=y_tr.tolist(),
        X_eval=X_te.tolist(),
        class_list=class_list,
        seed=SPLIT_SEED,
    )

    # Store holdout predictions at their original positions.
    oof_pred[idx_te] = pred_te
    prob_te_np = to_numpy_proba(prob_te)
    # Only accept probabilities with exactly one column per class; otherwise the rows stay NaN.
    if prob_te_np is not None and prob_te_np.shape == (len(idx_te), K):
        oof_prob[idx_te, :] = prob_te_np

    m = compute_macro_metrics(y_te.tolist(), pred_te.tolist(), class_list)
    fold_rows.append({"fold": 1, "train_time_sec": tsec, **m})

    # Only the holdout rows have predictions, so evaluation is restricted to them.
    eval_mask = np.zeros(N, dtype=bool)
    eval_mask[idx_te] = True
    y_eval_true = y_arr[eval_mask].tolist()
    y_eval_pred = oof_pred[eval_mask].tolist()
    # Any remaining NaN means probabilities are incomplete -> ROC/PR is skipped downstream.
    y_eval_prob = oof_prob[eval_mask, :] if not np.isnan(oof_prob[eval_mask, :]).any() else None

else:
    print(f"\nStarting StratifiedKFold CV with folds_eff={folds_eff}")
    skf = StratifiedKFold(n_splits=folds_eff, shuffle=True, random_state=SPLIT_SEED)

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_arr, y_arr), start=1):
        X_tr = X_arr[tr_idx].tolist()
        y_tr = y_arr[tr_idx].tolist()
        X_te = X_arr[te_idx].tolist()
        y_te = y_arr[te_idx].tolist()

        # Each fold trains a fresh model with its own seed (SPLIT_SEED + fold number).
        pred_te, prob_te, tsec = train_setfit_one_fold(
            X_train=X_tr,
            y_train=y_tr,
            X_eval=X_te,
            class_list=class_list,
            seed=SPLIT_SEED + fold,
        )

        # Write this fold's predictions into the OOF buffer at the test indices.
        oof_pred[te_idx] = pred_te

        prob_te_np = to_numpy_proba(prob_te)
        # Only accept probabilities with exactly one column per class; otherwise the rows stay NaN.
        if prob_te_np is not None and prob_te_np.shape == (len(te_idx), K):
            oof_prob[te_idx, :] = prob_te_np

        m = compute_macro_metrics(y_te, pred_te.tolist(), class_list)
        fold_rows.append({"fold": fold, "train_time_sec": tsec, **m})

    # All samples are evaluated in the CV branch.
    y_eval_true = y_arr.tolist()
    y_eval_pred = oof_pred.tolist()
    # Probabilities are only used if every fold delivered a complete matrix.
    y_eval_prob = None if np.isnan(oof_prob).any() else oof_prob

# Per-fold metrics table (one row per fold, or a single row for the holdout split).
fold_df = pd.DataFrame(fold_rows)
print("\nResults per fold:")
print(fold_df[["fold", "accuracy", "precision_macro", "recall_macro", "f1_macro", "train_time_sec"]].to_string(index=False))

# Mean and population standard deviation (ddof=0) across folds.
print("\nMean ± Std:")
print(
    f"Accuracy:  {fold_df['accuracy'].mean():.4f} ± {fold_df['accuracy'].std(ddof=0):.4f}\n"
    f"Precision: {fold_df['precision_macro'].mean():.4f} ± {fold_df['precision_macro'].std(ddof=0):.4f}\n"
    f"Recall:    {fold_df['recall_macro'].mean():.4f} ± {fold_df['recall_macro'].std(ddof=0):.4f}\n"
    f"F1 (macro): {fold_df['f1_macro'].mean():.4f} ± {fold_df['f1_macro'].std(ddof=0):.4f}"
)

fold_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_folds.csv")
fold_df.to_csv(fold_csv, index=False)
print(f"\nFold table saved: {fold_csv}")

# Classification report over the evaluated predictions, saved as plain text ...
print("\nClassification Report (OOF):")
report_txt = classification_report(y_eval_true, y_eval_pred, labels=class_list, zero_division=0)
print(report_txt)

report_path = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report_txt)
print(f"Report saved: {report_path}")

# ... and once more as a dict, so it can be exported as a CSV table.
report_dict = classification_report(
    y_eval_true, y_eval_pred, labels=class_list, output_dict=True, zero_division=0
)
report_df = pd.DataFrame(report_dict).transpose()
report_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.csv")
report_df.to_csv(report_csv, index=True)
print(f"Report (CSV) saved: {report_csv}")

# Confusion matrix normalized over the true labels (normalize="true").
cm_norm = confusion_matrix(y_eval_true, y_eval_pred, labels=class_list, normalize="true")
cm_png = os.path.join(OUT_DIR, f"{TASK_NAME}_confusion_matrix_normalized.png")
plot_and_save_confusion_matrix(
    cm_norm=cm_norm,
    labels=class_list,
    title=f"Confusion Matrix (normalized) — {TASK_NAME}",
    out_png=cm_png,
)
print(f"Confusion matrix saved: {cm_png}")

# ROC/PR curves; the function skips plotting when y_eval_prob is None.
plot_roc_pr_micro(task_name=f"{TASK_NAME} (OOF)", y_true=y_eval_true, y_prob=y_eval_prob, class_list=class_list)

# Final model: trained on ALL cleaned labeled rows, then applied to the full dataset.
print(f"\nFinal training on all labeled data for '{TASK_NAME}' ...")

# Re-seed right before the final fit so it is reproducible.
set_all_seeds(FINAL_TRAIN_SEED)
train_ds_all = Dataset.from_dict({"text": X_all, "label": y_all})

# Same model/head configuration as in train_setfit_one_fold.
model_final = SetFitModel.from_pretrained(
    BASE_MODEL,
    labels=class_list,
    head_params={"class_weight": "balanced", "max_iter": 2000},
)

trainer_final = SetFitTrainer(
    model=model_final,
    train_dataset=train_ds_all,
    column_mapping={"text": "text", "label": "label"},
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    num_iterations=NUM_ITERATIONS,
    learning_rate=LEARNING_RATE,
    seed=FINAL_TRAIN_SEED,
)

t0 = time.time()
trainer_final.train()
print(f"Final training finished after {time.time() - t0:.1f} seconds")

# Predict the label for the full dataset and save to Excel
# The text is built with the same build_text helper that was used for training.
X_full = build_text(df_full, FEATURE_COLS).tolist()
preds_full = model_final.predict(X_full)

# Predictions go into a new column named "<LABEL_COL>_pred".
pred_col = f"{LABEL_COL}_pred"
df_full[pred_col] = np.asarray(preds_full, dtype=object)

full_out = os.path.join(OUT_DIR, "FULL_60000_with_services_predictions.xlsx")
df_full.to_excel(full_out, index=False)

print(f"\nFull predictions saved:\n- {full_out}")
print(f"Total runtime: {time.time() - t_all:.1f} seconds")

# Chain/Indep

In [None]:
# Loading libraries
import os
import random
import time
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from datasets import Dataset
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize

from setfit import SetFitModel, SetFitTrainer


# Keep notebook output quiet: switch off Weights & Biases and the tokenizer parallelism setting
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "WANDB_MODE": "disabled",
        "WANDB_SILENT": "true",
        "TOKENIZERS_PARALLELISM": "false",
    }
)


# Where the data is read from and where results are written
LABELED_PATH = "/content/Sample_500_stratified.xlsx"
FULL_DATA_PATH = "/content/Restaurants_Data.xlsx"
OUT_DIR = "/content/results_Chain-Indep_final"
os.makedirs(OUT_DIR, exist_ok=True)


# Sentence-transformer backbone and SetFit training hyperparameters
BASE_MODEL = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

NUM_ITERATIONS = 20
NUM_EPOCHS = 2
BATCH_SIZE = 32
LEARNING_RATE = 2e-5

# Cross-validation fold cap and seeds
CV_FOLDS_MAX = 3
SPLIT_SEED = 42
FINAL_TRAIN_SEED = 42

# Classes below this count are merged into RARE_CLASS_LABEL
MIN_SAMPLES_PER_CLASS = 2
RARE_CLASS_LABEL = "Other / very rare"

# Which label to learn and which columns form the input text
TASK_NAME = "Chain-Indep"
LABEL_COL = "Chain-Indep"
FEATURE_COLS = ["title"]


def set_all_seeds(seed: int) -> None:
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    When CUDA is available, all CUDA devices are seeded too.

    Args:
        seed: Integer seed handed to each generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def build_text(df: pd.DataFrame, feature_cols: List[str]) -> pd.Series:
    """Build the single input text per row from the selected columns.

    Each column is NaN-filled, cast to ``str`` and stripped; the fields are
    joined with ``" [SEP] "``, whitespace runs collapse to one space and the
    result is stripped.

    Args:
        df: Frame containing every column listed in ``feature_cols``.
        feature_cols: Columns to concatenate, in output order.

    Returns:
        A Series of strings aligned to ``df.index``.
    """
    cleaned = df[feature_cols].copy()
    for name in feature_cols:
        filled = cleaned[name].fillna("")
        cleaned[name] = filled.astype(str).str.strip()

    separator = " [SEP] "
    joined = cleaned.agg(separator.join, axis=1).astype(str)
    collapsed = joined.str.replace(r"\s+", " ", regex=True)
    return collapsed.str.strip()


def bundle_rare_classes(y: pd.Series, min_count: int, rare_label: str) -> pd.Series:
    """Merge every label seen fewer than ``min_count`` times into ``rare_label``.

    Args:
        y: Label series; missing values are counted as their own class.
        min_count: Minimum number of occurrences a label needs to be kept.
        rare_label: Replacement used for every label below the threshold.

    Returns:
        ``y`` itself when no label is rare, otherwise a new series with the
        rare labels replaced.
    """
    counts = y.value_counts(dropna=False)
    rare_values = counts.index[(counts < min_count).to_numpy()]
    if rare_values.empty:
        return y
    is_rare = y.isin(rare_values)
    return y.mask(is_rare, rare_label)


def drop_split_impossible_singletons(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    """Drop rows whose label appears only once.

    Stratified splits need at least two samples per class. The affected
    classes and the number of removed rows are printed.

    Args:
        df: Frame holding the labeled rows.
        label_col: Name of the label column in ``df``.

    Returns:
        ``df`` itself when every class has at least two rows, otherwise a
        filtered copy with a fresh 0..n-1 index.
    """
    counts = df[label_col].value_counts()
    singletons = counts[counts < 2]
    if singletons.empty:
        return df

    print("\nNote: There are classes with fewer than 2 samples. Removing these rows for CV/training:")
    print(singletons.to_string())

    is_singleton = df[label_col].isin(singletons.index)
    dropped = int(is_singleton.sum())
    print(f"Removed rows: {dropped}\n")
    return df.loc[~is_singleton].reset_index(drop=True)


def compute_macro_metrics(y_true: List[str], y_pred: List[str], labels: List[str]) -> Dict[str, float]:
    """Return accuracy and macro-averaged precision/recall/F1 as plain floats.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Label set over which the macro average is taken.

    Returns:
        Dict with the keys ``accuracy``, ``precision_macro``,
        ``recall_macro`` and ``f1_macro``.
    """
    precision, recall, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average="macro", zero_division=0
    )
    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision_macro": precision,
        "recall_macro": recall,
        "f1_macro": f_score,
    }
    return {name: float(value) for name, value in scores.items()}


def to_numpy_proba(x) -> Optional[np.ndarray]:
    """Turn a ``predict_proba`` result into a NumPy array.

    Torch tensors are detached and moved to the CPU first; ``None`` is passed
    through unchanged; anything else goes through ``np.asarray``.
    """
    if isinstance(x, torch.Tensor):
        return x.detach().cpu().numpy()
    return None if x is None else np.asarray(x)


def train_setfit_one_fold(
    X_train: List[str],
    y_train: List[str],
    X_eval: List[str],
    class_list: List[str],
    seed: int,
) -> Tuple[np.ndarray, Optional[np.ndarray], float]:
    """Fit a single SetFit model and predict the evaluation texts.

    Args:
        X_train: Training texts.
        y_train: Training labels, aligned with ``X_train``.
        X_eval: Texts to predict after training.
        class_list: Label names handed to the model.
        seed: Seed for the global RNGs and the trainer.

    Returns:
        ``(predictions, probabilities, train_seconds)``. ``probabilities`` is
        ``None`` when ``predict_proba`` raises.
    """
    set_all_seeds(seed)

    train_ds = Dataset.from_dict({"text": X_train, "label": y_train})
    model = SetFitModel.from_pretrained(
        BASE_MODEL,
        labels=class_list,
        head_params={"class_weight": "balanced", "max_iter": 2000},
    )
    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_ds,
        column_mapping={"text": "text", "label": "label"},
        batch_size=BATCH_SIZE,
        num_epochs=NUM_EPOCHS,
        num_iterations=NUM_ITERATIONS,
        learning_rate=LEARNING_RATE,
        seed=seed,
    )

    started = time.time()
    trainer.train()
    elapsed = time.time() - started

    predictions = np.asarray(model.predict(X_eval), dtype=object)

    # Probabilities are best-effort: if predict_proba fails, ROC/PR is skipped later.
    try:
        probabilities = to_numpy_proba(model.predict_proba(X_eval))
    except Exception:
        probabilities = None

    # Drop the references and release cached GPU memory before the next fit.
    del trainer, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return predictions, probabilities, float(elapsed)


def plot_and_save_confusion_matrix(cm_norm: np.ndarray, labels: List[str], title: str, out_png: str) -> None:
    """Draw a confusion matrix as an annotated heatmap, save it and show it.

    Args:
        cm_norm: 2-D matrix of values; each cell is printed with two decimals.
        labels: Tick labels used on both axes.
        title: Figure title.
        out_png: Path the figure is written to (200 dpi).
    """
    plt.figure(figsize=(10, 8))
    plt.imshow(cm_norm, aspect="auto")
    plt.colorbar()

    tick_positions = range(len(labels))
    plt.xticks(tick_positions, labels, rotation=45, ha="right")
    plt.yticks(tick_positions, labels)
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("True")

    # Annotate every cell; x is the column (predicted), y is the row (true).
    for row, col in np.ndindex(*cm_norm.shape[:2]):
        plt.text(col, row, f"{cm_norm[row, col]:.2f}", ha="center", va="center")

    plt.tight_layout()
    plt.savefig(out_png, dpi=200)
    plt.show()


def _save_curve_plot(x, y, xlabel: str, ylabel: str, title: str, out_png: str) -> None:
    # Draw one line plot, save it to out_png (dpi=200), display it and release the figure
    fig = plt.figure()
    plt.plot(x, y)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.tight_layout()
    plt.savefig(out_png, dpi=200)
    plt.show()
    plt.close(fig)


def plot_roc_pr_micro(task_name: str, y_true: List[str], y_prob: Optional[np.ndarray], class_list: List[str]) -> None:
    # Plot and save ROC and precision-recall curves, then print ROC-AUC and AP.
    #
    #   task_name  - used in plot titles and console messages
    #   y_true     - true labels, one per sample
    #   y_prob     - probabilities with one row per sample; None skips the plots
    #   class_list - class labels; for two classes class_list[1] is the positive class
    #
    # With exactly two classes the standard binary curves are drawn; otherwise the
    # labels are one-hot encoded via label_binarize and the curves are micro-averaged.
    # Files are written to OUT_DIR and named after the module-level TASK_NAME.
    # Inputs that do not line up (row count or column count) are reported and skipped.
    y_prob = to_numpy_proba(y_prob)
    if y_prob is None:
        print(f"{task_name}: no probabilities -> ROC/PR will be skipped.")
        return

    y_true_arr = np.asarray(y_true, dtype=object)
    n = len(y_true_arr)

    if y_prob.ndim == 1:
        y_prob = y_prob.reshape(-1, 1)

    if y_prob.shape[0] != n:
        print(f"{task_name}: shape mismatch (y_true={n}, y_prob={y_prob.shape[0]}) -> ROC/PR will be skipped.")
        return

    K = len(class_list)

    if K == 2:
        # Binary case: score is the probability of the positive class
        pos_label = class_list[1]
        y_true_flat = (y_true_arr == pos_label).astype(int)

        n_cols = y_prob.shape[1]
        if n_cols == 2:
            y_score_flat = y_prob[:, 1]
        elif n_cols == 1:
            y_score_flat = y_prob.ravel()
        else:
            # Any other column count cannot be mapped to a single positive-class score
            print(f"{task_name}: class mismatch (classes=2, y_prob={n_cols}) -> ROC/PR skipped.")
            return

        ap_args = (y_true_flat, y_score_flat)
        ap_kwargs = {}
        title_tag, file_tag, metric_tag = "", "", ""
    else:
        # Multiclass micro-average: flatten the one-hot labels and the probability matrix
        y_true_bin = label_binarize(y_true_arr, classes=class_list)

        if y_prob.shape[1] != y_true_bin.shape[1]:
            print(
                f"{task_name}: class mismatch (y_true_bin={y_true_bin.shape[1]}, y_prob={y_prob.shape[1]}) -> ROC/PR skipped."
            )
            return

        y_true_flat = y_true_bin.ravel()
        y_score_flat = y_prob.ravel()
        ap_args = (y_true_bin, y_prob)
        ap_kwargs = {"average": "micro"}
        title_tag, file_tag, metric_tag = " (micro)", "_micro", "(micro)"

    fpr, tpr, _ = roc_curve(y_true_flat, y_score_flat)
    roc_auc = auc(fpr, tpr)
    roc_png = os.path.join(OUT_DIR, f"{TASK_NAME}_roc{file_tag}.png")
    _save_curve_plot(
        fpr,
        tpr,
        "False Positive Rate",
        "True Positive Rate",
        f"ROC{title_tag} — {task_name} (AUC={roc_auc:.3f})",
        roc_png,
    )

    prec, rec, _ = precision_recall_curve(y_true_flat, y_score_flat)
    ap = average_precision_score(*ap_args, **ap_kwargs)
    pr_png = os.path.join(OUT_DIR, f"{TASK_NAME}_pr{file_tag}.png")
    _save_curve_plot(
        rec,
        prec,
        "Recall",
        "Precision",
        f"PR{title_tag} — {task_name} (AP={ap:.3f})",
        pr_png,
    )

    print(f"{task_name}: ROC-AUC{metric_tag}={roc_auc:.4f} | AP{metric_tag}={ap:.4f}")
    print(f"Plots saved:\n- {roc_png}\n- {pr_png}")


# Start of the end-to-end run; used for the total runtime message at the very end
t_all = time.time()

# Load labeled sample (for evaluation/training) and the full dataset (for final predictions)
df_labeled = pd.read_excel(LABELED_PATH)
df_labeled.columns = df_labeled.columns.astype(str).str.strip()

df_full = pd.read_excel(FULL_DATA_PATH)
df_full.columns = df_full.columns.astype(str).str.strip()

# Quick schema checks: fail early with the list of missing columns
missing_labeled = [c for c in ([LABEL_COL] + FEATURE_COLS) if c not in df_labeled.columns]
if missing_labeled:
    raise ValueError(f"Missing columns in labeled file: {missing_labeled}")

missing_full = [c for c in FEATURE_COLS if c not in df_full.columns]
if missing_full:
    raise ValueError(f"Missing feature columns in full file: {missing_full}")

# Build modeling frame: text + label, then clean obvious junk.
# Missing labels are dropped BEFORE the cast to str: astype(str) turns NaN into the
# literal "nan", after which dropna can no longer detect it.
d = df_labeled[FEATURE_COLS + [LABEL_COL]].dropna(subset=[LABEL_COL]).copy()
d[LABEL_COL] = d[LABEL_COL].astype(str).str.strip()

d["text"] = build_text(d, FEATURE_COLS)
d["label"] = d[LABEL_COL].astype(str).str.strip()

# Remove rows with empty text/label and labels that are only a textual "nan"/"none"
d = d[(d["text"].str.strip() != "") & (d["label"].str.strip() != "")]
d = d[~d["label"].str.lower().isin(["nan", "none"])].reset_index(drop=True)

if len(d) < 30 or d["label"].nunique() < 2:
    raise ValueError(f"Too few samples/classes after cleaning: n={len(d)}, k={d['label'].nunique()}")

# Rare-class bundling improves stability for splits and macro metrics
d["label"] = bundle_rare_classes(d["label"], MIN_SAMPLES_PER_CLASS, RARE_CLASS_LABEL)
d = drop_split_impossible_singletons(d, "label")

# Sorted class list fixes the label order used for reports and probability columns
class_list = sorted(d["label"].unique().tolist())
X_all = d["text"].tolist()
y_all = d["label"].tolist()

print(f"\nTask: {TASK_NAME}")
print(f"Samples (cleaned): {len(d)} | Classes: {len(class_list)}")
print(f"Features: {FEATURE_COLS} | Label: {LABEL_COL}")
print(f"Model: {BASE_MODEL}")
print(f"Config: iters={NUM_ITERATIONS}, epochs={NUM_EPOCHS}, bs={BATCH_SIZE}, lr={LEARNING_RATE}")
print(f"Max CV folds: {CV_FOLDS_MAX}")

# The smallest class bounds the number of usable stratified folds
min_count = pd.Series(y_all).value_counts().min()
folds_eff = min(CV_FOLDS_MAX, int(min_count))

# Out-of-fold buffers: predictions per sample and an N x K probability matrix (NaN = not filled)
N = len(y_all)
K = len(class_list)
oof_pred = np.empty(N, dtype=object)
oof_prob = np.full((N, K), np.nan, dtype=float)
fold_rows = []

X_arr = np.asarray(X_all, dtype=object)
y_arr = np.asarray(y_all, dtype=object)

# If CV isn't feasible, fall back to a simple holdout split
if folds_eff < 2:
    print(f"\nCV is not meaningful here (folds_eff={folds_eff}). Using a 20% holdout split instead.")
    try:
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED, stratify=y_arr
        )
    except ValueError as exc:
        # scikit-learn raises ValueError when a class is too small to stratify;
        # only that case falls back to a plain random split, other errors propagate.
        print(f"Stratified holdout not possible ({exc}) -> using an unstratified split.")
        X_tr, X_te, y_tr, y_te, idx_tr, idx_te = train_test_split(
            X_arr, y_arr, np.arange(N),
            test_size=0.2, random_state=SPLIT_SEED
        )

    pred_te, prob_te, tsec = train_setfit_one_fold(
        X_train=X_tr.tolist(),
        y_train=y_tr.tolist(),
        X_eval=X_te.tolist(),
        class_list=class_list,
        seed=SPLIT_SEED,
    )

    oof_pred[idx_te] = pred_te
    # Probabilities are stored only when they match the expected (n_eval, K) shape
    prob_te_np = to_numpy_proba(prob_te)
    if prob_te_np is not None and prob_te_np.shape == (len(idx_te), K):
        oof_prob[idx_te, :] = prob_te_np

    m = compute_macro_metrics(y_te.tolist(), pred_te.tolist(), class_list)
    fold_rows.append({"fold": 1, "train_time_sec": tsec, **m})

    # Only the holdout rows have predictions, so evaluation is restricted to them
    eval_mask = np.zeros(N, dtype=bool)
    eval_mask[idx_te] = True
    y_eval_true = y_arr[eval_mask].tolist()
    y_eval_pred = oof_pred[eval_mask].tolist()
    y_eval_prob = oof_prob[eval_mask, :] if not np.isnan(oof_prob[eval_mask, :]).any() else None

else:
    print(f"\nStarting StratifiedKFold CV with folds_eff={folds_eff}")
    skf = StratifiedKFold(n_splits=folds_eff, shuffle=True, random_state=SPLIT_SEED)

    for fold, (tr_idx, te_idx) in enumerate(skf.split(X_arr, y_arr), start=1):
        X_tr = X_arr[tr_idx].tolist()
        y_tr = y_arr[tr_idx].tolist()
        X_te = X_arr[te_idx].tolist()
        y_te = y_arr[te_idx].tolist()

        # A different seed per fold keeps the runs reproducible but not identical
        pred_te, prob_te, tsec = train_setfit_one_fold(
            X_train=X_tr,
            y_train=y_tr,
            X_eval=X_te,
            class_list=class_list,
            seed=SPLIT_SEED + fold,
        )

        oof_pred[te_idx] = pred_te

        # Probabilities are stored only when they match the expected (n_eval, K) shape
        prob_te_np = to_numpy_proba(prob_te)
        if prob_te_np is not None and prob_te_np.shape == (len(te_idx), K):
            oof_prob[te_idx, :] = prob_te_np

        m = compute_macro_metrics(y_te, pred_te.tolist(), class_list)
        fold_rows.append({"fold": fold, "train_time_sec": tsec, **m})

    # Every sample was in exactly one test fold, so the full arrays are evaluated;
    # any remaining NaN means at least one fold delivered no usable probabilities.
    y_eval_true = y_arr.tolist()
    y_eval_pred = oof_pred.tolist()
    y_eval_prob = None if np.isnan(oof_prob).any() else oof_prob

# Per-fold metric table
fold_df = pd.DataFrame(fold_rows)
fold_columns = ["fold", "accuracy", "precision_macro", "recall_macro", "f1_macro", "train_time_sec"]
print("\nResults per fold:")
print(fold_df[fold_columns].to_string(index=False))

# Mean and population standard deviation (ddof=0) of each metric across folds
print("\nMean ± Std:")
summary_lines = [
    f"{caption} {fold_df[metric].mean():.4f} ± {fold_df[metric].std(ddof=0):.4f}"
    for caption, metric in (
        ("Accuracy: ", "accuracy"),
        ("Precision:", "precision_macro"),
        ("Recall:   ", "recall_macro"),
        ("F1 (macro):", "f1_macro"),
    )
]
print("\n".join(summary_lines))

fold_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_folds.csv")
fold_df.to_csv(fold_csv, index=False)
print(f"\nFold table saved: {fold_csv}")

# Text report over all evaluated samples: printed and written to disk
report_options = {"labels": class_list, "zero_division": 0}
print("\nClassification Report (OOF):")
report_txt = classification_report(y_eval_true, y_eval_pred, **report_options)
print(report_txt)

report_path = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.txt")
with open(report_path, "w", encoding="utf-8") as report_file:
    report_file.write(report_txt)
print(f"Report saved: {report_path}")

# Same report as a table (one row per class/average) for further processing
report_dict = classification_report(y_eval_true, y_eval_pred, output_dict=True, **report_options)
report_df = pd.DataFrame(report_dict).T
report_csv = os.path.join(OUT_DIR, f"{TASK_NAME}_classification_report.csv")
report_df.to_csv(report_csv, index=True)
print(f"Report (CSV) saved: {report_csv}")

# Row-normalized confusion matrix (normalize="true")
cm_norm = confusion_matrix(y_eval_true, y_eval_pred, labels=class_list, normalize="true")
cm_png = os.path.join(OUT_DIR, f"{TASK_NAME}_confusion_matrix_normalized.png")
plot_and_save_confusion_matrix(
    cm_norm=cm_norm,
    labels=class_list,
    title=f"Confusion Matrix (normalized) — {TASK_NAME}",
    out_png=cm_png,
)
print(f"Confusion matrix saved: {cm_png}")

# ROC / PR curves; the function skips itself when y_eval_prob is None
plot_roc_pr_micro(
    task_name=f"{TASK_NAME} (OOF)",
    y_true=y_eval_true,
    y_prob=y_eval_prob,
    class_list=class_list,
)

print(f"\nFinal training on all labeled data for '{TASK_NAME}' ...")

# Seed first so dataset creation, model setup and training all run under FINAL_TRAIN_SEED
set_all_seeds(FINAL_TRAIN_SEED)
train_ds_all = Dataset.from_dict({"text": X_all, "label": y_all})

# Same head and trainer settings as in the per-fold training
final_head_params = {"class_weight": "balanced", "max_iter": 2000}
model_final = SetFitModel.from_pretrained(BASE_MODEL, labels=class_list, head_params=final_head_params)

final_trainer_settings = {
    "column_mapping": {"text": "text", "label": "label"},
    "batch_size": BATCH_SIZE,
    "num_epochs": NUM_EPOCHS,
    "num_iterations": NUM_ITERATIONS,
    "learning_rate": LEARNING_RATE,
    "seed": FINAL_TRAIN_SEED,
}
trainer_final = SetFitTrainer(model=model_final, train_dataset=train_ds_all, **final_trainer_settings)

t0 = time.time()
trainer_final.train()
print(f"Final training finished after {time.time() - t0:.1f} seconds")

# Run inference on the full dataset and save the enriched Excel file
pred_col = f"{LABEL_COL}_pred"
full_out = os.path.join(OUT_DIR, f"FULL_60000_with_{TASK_NAME}_predictions.xlsx")

X_full = build_text(df_full, FEATURE_COLS).tolist()
preds_full = model_final.predict(X_full)

# Predictions are stored as a new object-typed column next to the original data
df_full[pred_col] = np.asarray(preds_full, dtype=object)
df_full.to_excel(full_out, index=False)

print(f"\nFull predictions saved:\n- {full_out}")
print(f"Total runtime: {time.time() - t_all:.1f} seconds")