# Dummy model

In [None]:
# Loading Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_curve,
    precision_recall_curve,
    roc_auc_score,
    average_precision_score,
)
from sklearn.preprocessing import label_binarize


# basic config (paths + split settings)
# Excel file handed to pd.read_excel when the data is loaded
# NOTE(review): "/content/..." looks like a Colab working directory - confirm before running elsewhere
DATA_PATH = "/content/Sample_500_stratified.xlsx"
# seed passed to train_test_split and DummyClassifier
RANDOM_STATE = 42
# value passed as test_size to train_test_split
TEST_SIZE = 0.2
# labels occurring fewer than this many times are merged by squash_rare_classes
MIN_COUNT_PER_CLASS = 2
# replacement label assigned to those merged rare classes
RARE_LABEL = "__RARE__"


def make_text_X(df: pd.DataFrame, feature_cols):
    """Build one text string per row from the selected columns.

    Missing values are replaced by empty strings and every value is cast to
    ``str``; the columns are then joined with a single space, in the order
    given by ``feature_cols``. The input frame is not modified.

    Args:
        df: source table.
        feature_cols: names of the columns to concatenate.

    Returns:
        A Series of joined strings that keeps the index of ``df``.
    """
    # clean column by column so each one is filled and cast on its own
    as_text = df[feature_cols].apply(lambda column: column.fillna("").astype(str))
    return as_text.agg(" ".join, axis=1)


def squash_rare_classes(y: pd.Series, min_count=2, rare_label="__RARE__"):
    """Merge infrequent labels into one fallback label.

    Labels are first cast to ``str`` and stripped of surrounding whitespace.
    Every label that then occurs fewer than ``min_count`` times is replaced
    by ``rare_label``; all other labels are returned unchanged.

    Args:
        y: label column.
        min_count: minimum number of occurrences a label needs to be kept.
        rare_label: replacement value for labels below that threshold.

    Returns:
        The cleaned label Series with the same index as ``y``.
    """
    labels = y.astype(str).str.strip()
    counts = labels.value_counts(dropna=False)
    too_rare = counts.index[(counts < min_count).to_numpy()]

    # nothing below the threshold: hand back the cleaned labels as they are
    if too_rare.empty:
        return labels

    return labels.mask(labels.isin(too_rare), rare_label)


def plot_binary_roc_pr_from_scores(y_true_bin: np.ndarray, scores: np.ndarray, titel: str):
    """Show a ROC curve and a precision-recall curve for a binary target.

    The two plots are attempted independently: when computing or drawing one
    of them raises, the exception is printed and the other plot is still
    attempted.

    Args:
        y_true_bin: binary ground-truth labels passed to the sklearn metrics.
        scores: scores for the positive class, aligned with ``y_true_bin``.
        titel: prefix used in the plot titles and in the skip messages.
    """

    def _show_curve(xs, ys, x_label, y_label, heading, with_diagonal=False):
        # one figure per curve; the dashed diagonal is only drawn for ROC
        plt.figure()
        plt.plot(xs, ys)
        if with_diagonal:
            plt.plot([0, 1], [0, 1], linestyle="--")
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(heading)
        plt.show()

    # ROC curve with its AUC in the title
    try:
        false_pos_rate, true_pos_rate, _ = roc_curve(y_true_bin, scores)
        roc_auc = roc_auc_score(y_true_bin, scores)
        _show_curve(
            false_pos_rate,
            true_pos_rate,
            "False Positive Rate",
            "True Positive Rate",
            f"{titel} ROC (AUC={roc_auc:.3f})",
            with_diagonal=True,
        )
    except Exception as err:
        print(f"{titel}: ROC übersprungen ({err})")

    # precision-recall curve with average precision in the title
    try:
        precision_vals, recall_vals, _ = precision_recall_curve(y_true_bin, scores)
        avg_precision = average_precision_score(y_true_bin, scores)
        _show_curve(
            recall_vals,
            precision_vals,
            "Recall",
            "Precision",
            f"{titel} PR (AP={avg_precision:.3f})",
        )
    except Exception as err:
        print(f"{titel}: PR übersprungen ({err})")


def plot_multiclass_micro_roc_pr_from_scores(y_true: np.ndarray, score_mat: np.ndarray, class_order, titel: str):
    """Plot micro-averaged ROC and precision-recall curves for a multiclass task.

    Only classes that occur in ``y_true`` and are also listed in
    ``class_order`` are evaluated. The true labels are turned into a
    one-vs-rest indicator matrix; matrix and scores are flattened to draw the
    micro-averaged curves, and the titles report both micro and macro
    averages of the respective metric.

    Args:
        y_true: true labels, one per row of ``score_mat``.
        score_mat: score matrix whose columns follow ``class_order``.
        class_order: class label belonging to each column of ``score_mat``
            (array or plain sequence).
        titel: prefix used in the plot titles and in the skip messages.

    Returns:
        None. Any exception raised while computing or drawing is caught and
        reported with ``print`` instead of being propagated.
    """
    try:
        # Map each class label to its first column in score_mat. Built once,
        # and a dict lookup also works when class_order is a plain list.
        column_of = {}
        for col, label in enumerate(class_order):
            column_of.setdefault(label, col)

        present = np.array([c for c in np.unique(y_true) if c in column_of], dtype=object)

        if len(present) < 2:
            print(f"{titel}: Kurven übersprungen (<2 Klassen im Test)")
            return

        idx = [column_of[c] for c in present]
        scores_sub = score_mat[:, idx]
        y_bin = label_binarize(y_true, classes=present)

        # label_binarize collapses a two-class target into ONE column (the
        # indicator of the last class). Restore the column of the first class
        # so that y_bin lines up with the two columns of scores_sub.
        if y_bin.shape[1] == 1:
            y_bin = np.hstack([1 - y_bin, y_bin])

        # micro-average: treat every (sample, class) cell as one binary decision
        fpr, tpr, _ = roc_curve(y_bin.ravel(), scores_sub.ravel())
        auc_micro = roc_auc_score(y_bin, scores_sub, average="micro")
        auc_macro = roc_auc_score(y_bin, scores_sub, average="macro")

        plt.figure()
        plt.plot(fpr, tpr)
        plt.plot([0, 1], [0, 1], linestyle="--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"{titel} ROC (micro={auc_micro:.3f}, macro={auc_macro:.3f})")
        plt.show()

        prec, rec, _ = precision_recall_curve(y_bin.ravel(), scores_sub.ravel())
        ap_micro = average_precision_score(y_bin, scores_sub, average="micro")
        ap_macro = average_precision_score(y_bin, scores_sub, average="macro")

        plt.figure()
        plt.plot(rec, prec)
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(f"{titel} PR (micro={ap_micro:.3f}, macro={ap_macro:.3f})")
        plt.show()

    except Exception as e:
        # best-effort plotting: report the problem and carry on
        print(f"{titel}: Multiclass-Kurven übersprungen ({e})")


def run_dummy_task(df: pd.DataFrame, feature_cols, target_col: str, task_name: str, plot_curves=True):
    """Fit and evaluate a majority-class baseline for one target column.

    Steps: drop rows whose target is missing or blank, build the text input
    from ``feature_cols``, merge rare target classes, split into train and
    test (stratified when possible), fit ``DummyClassifier`` with the
    ``most_frequent`` strategy, print accuracy and macro precision / recall /
    F1, and optionally plot ROC and precision-recall curves.

    Args:
        df: source table; it is copied, not modified.
        feature_cols: columns combined into the text input.
        target_col: name of the label column.
        task_name: heading used in the printed report, plot titles and
            messages.
        plot_curves: when false, the function returns after printing the
            metrics.

    Returns:
        None. The task is skipped with a printed message when fewer than 10
        usable rows remain or fewer than two classes are left after cleaning.
    """
    # copy + minimal cleaning
    d = df.copy()
    d = d.dropna(subset=[target_col])
    d[target_col] = d[target_col].astype(str).str.strip()
    d = d[d[target_col] != ""]

    if len(d) < 10:
        print(f"{task_name}: zu wenige Zeilen (n={len(d)})")
        return

    # build text input and target
    X = make_text_X(d, feature_cols)
    y = d[target_col].astype(str)

    # reduce rare labels to stabilize evaluation
    y = squash_rare_classes(y, min_count=MIN_COUNT_PER_CLASS, rare_label=RARE_LABEL)

    if y.nunique() < 2:
        print(f"{task_name}: nur eine Klasse nach Bereinigung")
        return

    # stratified split if feasible
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
        )
    except ValueError as err:
        # Stratification fails e.g. when a class (possibly the merged rare
        # class itself) has a single member. Say so instead of switching
        # silently, because the reported metrics then come from a
        # non-stratified split.
        print(f"{task_name}: stratifizierter Split nicht möglich ({err}) - Fallback ohne Stratifizierung")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

    # majority baseline
    model = DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0: classes that are never predicted count as 0 instead of warning
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro", zero_division=0)

    print(task_name)
    print(f"n_train={len(y_train)}, n_test={len(y_test)}, n_classes={y.nunique()}")
    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f} (macro)")
    print(f"Recall:    {rec:.4f} (macro)")
    print(f"F1:        {f1:.4f} (macro)")

    if not plot_curves:
        return

    # probability-based curves
    try:
        proba = model.predict_proba(X_test)
        class_order = model.classes_

        if len(class_order) == 2:
            # binary case: the second entry of classes_ is used as the positive class
            pos_label = class_order[1]
            y_true_bin = (np.asarray(y_test) == pos_label).astype(int)
            plot_binary_roc_pr_from_scores(y_true_bin, proba[:, 1], task_name)
        else:
            # multiclass case
            plot_multiclass_micro_roc_pr_from_scores(np.asarray(y_test), proba, class_order, task_name)

    except Exception as e:
        # plotting is best-effort; the metrics above are already reported
        print(f"{task_name}: Kurven übersprungen ({e})")


# load data: read the Excel file at DATA_PATH into a DataFrame
df = pd.read_excel(DATA_PATH)
# report the source path and the table dimensions
print(f"Datei geladen: {DATA_PATH}")
print(f"Rows: {len(df)} | Cols: {len(df.columns)}")
# render the first three rows via IPython's display
display(df.head(3))


# baseline tasks for different label dimensions

# one entry per task: input text columns, target column and report heading
baseline_tasks = [
    {
        "feature_cols": ["desc_1", "desc_2"],
        "target_col": "cuisine_region",
        "task_name": "DummyClassifier: cuisine_region",
    },
    {
        "feature_cols": ["desc_1", "desc_2", "title"],
        "target_col": "concept_format",
        "task_name": "DummyClassifier: concept_format",
    },
    {
        "feature_cols": ["opening_hours"],
        "target_col": "opening_class_label",
        "task_name": "DummyClassifier: opening_class_label",
    },
    {
        "feature_cols": ["title"],
        "target_col": "Chain-Indep",
        "task_name": "DummyClassifier: Chain-Indep",
    },
    {
        "feature_cols": ["services"],
        "target_col": "Services_Label",
        "task_name": "DummyClassifier: Services_Label",
    },
]

# run the tasks in the listed order, always with curves enabled
for task_kwargs in baseline_tasks:
    run_dummy_task(df, plot_curves=True, **task_kwargs)