# 03 — Baseline models

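Ce notebook entraîne une première baseline, **Logistic Regression (L2)**,
puis exporte ses métriques et sa matrice de confusion dans `results/`.
Les sections suivantes ajoutent une régression logistique L1 (sélection de gènes),
une comparaison avec / sans PCA, une validation croisée stratifiée et des baselines non linéaires.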

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Resolve the repository root so that the `src` package can be imported.
# When the current working directory has no `src/` entry (for example if the
# kernel was started from a sub-folder), its parent directory is used instead.
project_root = Path.cwd().resolve()
has_src_here = (project_root / "src").exists()
project_root = project_root if has_src_here else project_root.parent

# Put the root at the front of sys.path only once, so re-running the cell
# does not add duplicate entries.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.data_loader import load_dataset
from src.preprocessing import encode_labels, make_splits, scale_train_test
from src.models import (
    make_gradient_boosting,
    make_linear_svm,
    make_logreg_l1,
    make_logreg_l2,
    make_random_forest,
)
from src.evaluation import compute_metrics
from src.visualization import save_figure

# Global plot style and the output folders used by the exports in this notebook.
sns.set_theme(style="whitegrid")

results_dir = project_root / "results"
fig_dir = results_dir / "figures"
tables_dir = results_dir / "tables"

# Create both folders up front; existing folders are left untouched.
for output_dir in (fig_dir, tables_dir):
    output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the raw feature table and its labels from data/raw/.
raw_dir = project_root / "data" / "raw"
ds = load_dataset(
    data_path=str(raw_dir / "data.csv"),
    labels_path=str(raw_dir / "labels.csv"),
)

# Feature matrix and encoded targets; the encoder is kept to recover class names.
X = ds.X
y_enc, label_encoder = encode_labels(ds.y)

# Hold out 20% of the samples, then scale both splits.
# NOTE(review): scale_train_test presumably fits the scaler on the training
# split only - confirm in src.preprocessing.
X_train, X_test, y_train, y_test = make_splits(X, y_enc, test_size=0.2, seed=42)
X_train_scaled, X_test_scaled, scaler = scale_train_test(X_train, X_test)

print(f"Train shape: {X_train_scaled.shape} | Test shape: {X_test_scaled.shape}")
print(f"Classes: {list(label_encoder.classes_)}")

In [None]:
# Fit the L2 logistic regression on the scaled training split and predict
# the held-out test split.
model = make_logreg_l2(seed=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Class names come from the label encoder; the encoded ids are 0..n_classes-1.
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

metrics = compute_metrics(
    y_true=y_test,
    y_pred=y_pred,
    labels=class_ids,
    target_names=class_names,
)

# Headline scores first, then the full text report.
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Macro F1: {metrics['macro_f1']:.4f}")
print(metrics["classification_report_text"])

In [None]:
# One-row run summary for the L2 baseline: scores plus split and feature sizes.
n_train, n_features = X_train_scaled.shape[:2]
summary_record = {
    "model": "logreg_l2",
    "accuracy": metrics["accuracy"],
    "macro_f1": metrics["macro_f1"],
    "n_train": n_train,
    "n_test": X_test_scaled.shape[0],
    "n_features": n_features,
    "seed": 42,
    "test_size": 0.2,
}
summary_df = pd.DataFrame([summary_record])
summary_path = tables_dir / "03_logreg_l2_metrics.csv"
summary_df.to_csv(summary_path, index=False)

# Per-class report, transposed before export; the index is written as the
# first CSV column.
per_class_df = pd.DataFrame(metrics["per_class_report"]).transpose()
per_class_path = tables_dir / "03_logreg_l2_per_class_report.csv"
per_class_df.to_csv(per_class_path)

print(f"Saved: {summary_path}")
print(f"Saved: {per_class_path}")

In [None]:
# Label the confusion matrix with the class names on both axes
# (rows = true label, columns = predicted label, as set on the axes below).
class_labels = label_encoder.classes_
cm_df = pd.DataFrame(
    metrics["confusion_matrix"],
    index=class_labels,
    columns=class_labels,
)

fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set(
    title="Logistic Regression (L2) - Confusion Matrix",
    xlabel="Predicted label",
    ylabel="True label",
)

# Save the figure before showing it.
cm_path = fig_dir / "03_logreg_l2_confusion_matrix.png"
save_figure(fig, str(cm_path))
plt.show()
print(f"Saved: {cm_path}")

## Logistic Regression (L1) + feature selection

On entraîne un modèle L1 (`solver="saga"`) pour induire de la sparsité,
puis on extrait les gènes sélectionnés (coefficients non nuls).

In [None]:
# Fit the L1 logistic regression (c=1.0) on the scaled training split and
# predict the held-out test split.
model_l1 = make_logreg_l1(seed=42, c=1.0)
model_l1.fit(X_train_scaled, y_train)
y_pred_l1 = model_l1.predict(X_test_scaled)

# Same label ids and class names as used for the other models in this notebook.
class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

metrics_l1 = compute_metrics(
    y_true=y_test,
    y_pred=y_pred_l1,
    labels=class_ids,
    target_names=class_names,
)

print(f"L1 Accuracy: {metrics_l1['accuracy']:.4f}")
print(f"L1 Macro F1: {metrics_l1['macro_f1']:.4f}")

In [None]:
# Coefficient matrix of the L1 model; expected shape (n_classes, n_features).
coef = model_l1.coef_
feature_names = np.array(X.columns)

# A feature counts as selected when at least one class gives it a coefficient
# whose magnitude exceeds the 1e-12 numerical-zero threshold.
abs_coef = np.abs(coef)
selected_mask = (abs_coef > 1e-12).any(axis=0)
selected_indices = np.where(selected_mask)[0]

# Rank every feature by its largest absolute coefficient across classes.
max_abs_coef = abs_coef.max(axis=0)
coef_df = pd.DataFrame(
    {
        "gene": feature_names,
        "selected": selected_mask,
        "max_abs_coef": max_abs_coef,
    }
).sort_values("max_abs_coef", ascending=False)

# Keep the 50 strongest selected genes (global ranking across classes).
top_genes_l1 = coef_df.loc[coef_df["selected"]].head(50).copy()
top_genes_path = tables_dir / "top_genes_l1.csv"
top_genes_l1.to_csv(top_genes_path, index=False)

print(f"Selected features: {int(selected_mask.sum())} / {coef.shape[1]}")
print(f"Saved: {top_genes_path}")

In [None]:
# Count the non-zero coefficients per row of `coef` (one row per class) and
# overall, using the same 1e-12 threshold as the selection mask.
# NOTE(review): for a two-class problem scikit-learn exposes a single
# coefficient row, which would not line up with the class labels below -
# confirm the dataset is multi-class.
nonzero_coef = np.abs(coef) > 1e-12
selected_per_class = nonzero_coef.sum(axis=1)
n_selected_total = int(selected_mask.sum())

bar_labels = ["Total", *label_encoder.classes_]
bar_counts = [n_selected_total, *selected_per_class.tolist()]
plot_df = pd.DataFrame({"label": bar_labels, "n_selected": bar_counts})

# One bar per label; hue is set to the label so each bar gets its own colour.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    data=plot_df,
    x="label",
    y="n_selected",
    hue="label",
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set(
    title="Number of features selected by L1 logistic regression",
    xlabel="Class (plus total)",
    ylabel="Number of selected features",
)
ax.tick_params(axis="x", rotation=30)

nb_features_plot_path = fig_dir / "nb_features_selected.png"
save_figure(fig, str(nb_features_plot_path))
plt.show()
print(f"Saved: {nb_features_plot_path}")

## Comparaison des baselines avec / sans PCA (95%)

Cette section compare les performances de **LogReg (L2)** et **Linear SVM**
sur les mêmes splits, avec features standardisées, puis après réduction PCA à 95% de variance.

In [None]:
# PCA keeping 95% of the variance, fitted on the scaled training split only
# and then applied to the test split.
pca_95 = PCA(n_components=0.95, random_state=42)
X_train_pca = pca_95.fit_transform(X_train_scaled)
X_test_pca = pca_95.transform(X_test_scaled)

# Factories (not instances) so every experiment trains a fresh estimator.
model_factories = {
    "logreg_l2": lambda: make_logreg_l2(seed=42),
    "linear_svm": lambda: make_linear_svm(seed=42, c=1.0),
}

# (setting name, train matrix, test matrix, number of input features)
experiments = [
    ("without_pca", X_train_scaled, X_test_scaled, X_train_scaled.shape[1]),
    ("with_pca_95", X_train_pca, X_test_pca, X_train_pca.shape[1]),
]

class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

# Train and score every model under every feature setting on the same split.
rows = []
for setting, Xtr, Xte, n_features_used in experiments:
    for model_name, factory in model_factories.items():
        model_i = factory()
        model_i.fit(Xtr, y_train)
        y_pred_i = model_i.predict(Xte)

        m = compute_metrics(
            y_true=y_test,
            y_pred=y_pred_i,
            labels=class_ids,
            target_names=class_names,
        )
        rows.append(
            dict(
                model=model_name,
                setting=setting,
                accuracy=m["accuracy"],
                macro_f1=m["macro_f1"],
                n_features_used=int(n_features_used),
                seed=42,
                test_size=0.2,
            )
        )

comparison_df = (
    pd.DataFrame(rows)
    .sort_values(["model", "setting"])
    .reset_index(drop=True)
)
comparison_path = tables_dir / "baseline_with_without_pca.csv"
comparison_df.to_csv(comparison_path, index=False)

print(comparison_df)
print(f"Saved: {comparison_path}")

## Validation croisée stratifiée (mean/std F1 macro)

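Pour une évaluation plus robuste qu'un unique découpage train/test, on utilise `StratifiedKFold`
(5 plis) sur l'ensemble du jeu de données (`X`, `y_enc`) et on reporte la moyenne et l'écart-type du F1 macro.
Le scaler et la PCA sont ré-ajustés sur la partie entraînement de chaque pli afin d'éviter toute fuite de données.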

In [None]:
# Stratified 5-fold cross-validation of the linear baselines, with and
# without PCA, reporting the mean / std of the macro F1 over the folds.
#
# The scaler (and PCA when requested) is refit on the training part of every
# fold, so nothing from the validation part leaks into the preprocessing.
# Preprocessing does not depend on the model, so it is computed once per fold
# and setting and shared by all models.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_model_factories = {
    "logreg_l2": lambda: make_logreg_l2(seed=42),
    "linear_svm": lambda: make_linear_svm(seed=42, c=1.0),
}

cv_settings = ["without_pca", "with_pca_95"]

# encode_labels is defined outside this notebook: coerce its output to an
# array so that y_cv[train_idx] is positional whatever container it returned.
y_cv = np.asarray(y_enc)

# Per-fold macro F1 for each (model, setting) pair, and per-fold input
# dimensionality for each setting (the PCA component count can vary by fold).
fold_scores = {
    (model_name, setting): []
    for model_name in cv_model_factories
    for setting in cv_settings
}
fold_n_features = {setting: [] for setting in cv_settings}

for train_idx, valid_idx in cv.split(X, y_cv):
    X_fold_train = X.iloc[train_idx]
    X_fold_valid = X.iloc[valid_idx]
    y_fold_train = y_cv[train_idx]
    y_fold_valid = y_cv[valid_idx]

    # Standardise using statistics from the training part of the fold only.
    scaler_cv = StandardScaler()
    X_fold_train_scaled = scaler_cv.fit_transform(X_fold_train)
    X_fold_valid_scaled = scaler_cv.transform(X_fold_valid)

    for setting in cv_settings:
        if setting == "with_pca_95":
            pca_cv = PCA(n_components=0.95, random_state=42)
            X_fold_train_final = pca_cv.fit_transform(X_fold_train_scaled)
            X_fold_valid_final = pca_cv.transform(X_fold_valid_scaled)
        else:
            X_fold_train_final = X_fold_train_scaled
            X_fold_valid_final = X_fold_valid_scaled
        fold_n_features[setting].append(int(X_fold_train_final.shape[1]))

        for model_name, factory in cv_model_factories.items():
            model_cv = factory()
            model_cv.fit(X_fold_train_final, y_fold_train)
            y_fold_pred = model_cv.predict(X_fold_valid_final)

            fold_macro_f1 = f1_score(y_fold_valid, y_fold_pred, average="macro")
            fold_scores[(model_name, setting)].append(float(fold_macro_f1))

cv_rows = []
for model_name in cv_model_factories:
    for setting in cv_settings:
        scores = fold_scores[(model_name, setting)]
        n_features_per_fold = fold_n_features[setting]
        cv_rows.append(
            {
                "model": model_name,
                "setting": setting,
                "cv_f1_macro_mean": float(np.mean(scores)),
                # Sample standard deviation (ddof=1) across the folds.
                "cv_f1_macro_std": float(np.std(scores, ddof=1)),
                # Read from the splitter so it cannot drift from its config.
                "n_splits": cv.get_n_splits(),
                "seed": 42,
                # Rounded mean over folds; min/max expose the per-fold spread.
                "n_features_used": int(round(float(np.mean(n_features_per_fold)))),
                "n_features_used_min": int(min(n_features_per_fold)),
                "n_features_used_max": int(max(n_features_per_fold)),
            }
        )

cv_results_df = pd.DataFrame(cv_rows).sort_values(["model", "setting"]).reset_index(drop=True)
cv_results_path = tables_dir / "cv_results.csv"
cv_results_df.to_csv(cv_results_path, index=False)

print(cv_results_df)
print(f"Saved: {cv_results_path}")

## Non-linear baselines (RF / Gradient Boosting)

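Cette section ajoute des baselines non linéaires en comparant
`Random Forest` et `Gradient Boosting` avec et sans standardisation.
Les modèles à base d'arbres étant en principe peu sensibles à la mise à l'échelle des features,
on s'attend à des scores très proches entre les deux variantes.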

In [None]:
# Random Forest and Gradient Boosting, each trained once on the raw features
# and once on the standardised features, using the same train/test split.
non_linear_factories = {
    "random_forest": lambda: make_random_forest(seed=42, n_estimators=60),
    "gradient_boosting": lambda: make_gradient_boosting(seed=42),
}

# (feature setting name, train matrix, test matrix)
non_linear_experiments = [
    ("raw_features", X_train, X_test),
    ("standardized_features", X_train_scaled, X_test_scaled),
]

class_names = list(label_encoder.classes_)
class_ids = list(range(len(class_names)))

non_linear_rows = []
for setting, Xtr, Xte in non_linear_experiments:
    for model_name, factory in non_linear_factories.items():
        # Fresh estimator for every (setting, model) combination.
        model_nl = factory()
        model_nl.fit(Xtr, y_train)
        y_pred_nl = model_nl.predict(Xte)

        m_nl = compute_metrics(
            y_true=y_test,
            y_pred=y_pred_nl,
            labels=class_ids,
            target_names=class_names,
        )
        non_linear_rows.append(
            dict(
                model=model_name,
                feature_setting=setting,
                accuracy=m_nl["accuracy"],
                macro_f1=m_nl["macro_f1"],
                n_features=Xtr.shape[1],
                seed=42,
            )
        )

non_linear_df = (
    pd.DataFrame(non_linear_rows)
    .sort_values(["model", "feature_setting"])
    .reset_index(drop=True)
)
non_linear_path = tables_dir / "non_linear_baseline_comparison.csv"
non_linear_df.to_csv(non_linear_path, index=False)

# Grouped bars: one group per model, one bar per feature setting.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=non_linear_df, x="model", y="macro_f1", hue="feature_setting", ax=ax)
ax.set(
    title="Non-linear baseline comparison (macro F1)",
    xlabel="Model",
    ylabel="Macro F1",
)
plot_path = fig_dir / "03_non_linear_baseline_comparison.png"
save_figure(fig, str(plot_path))
plt.show()

print(non_linear_df)
print(f"Saved: {non_linear_path}")
print(f"Saved: {plot_path}")