# 04 — Small-sample study

Étude de robustesse en faible effectif.

- Tailles évaluées (sous-échantillon total, avant le split train/test 80/20, plafonné à la taille du jeu de données) : `800`, `400`, `200`, `100`, `50`
- Répétitions: 5 seeds
- Modèles: `LogReg L1`, `Linear SVM`, `Random Forest`
- Métriques suivies: `accuracy`, `macro F1`

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Locate the project root: the working directory itself when it contains
# ``src``, otherwise its parent (e.g. when run from a sub-folder).
start_dir = Path.cwd().resolve()
project_root = start_dir if (start_dir / "src").exists() else start_dir.parent

# Put the root on the import path once so that ``src.*`` imports resolve.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.data_loader import load_dataset
from src.evaluation import compute_metrics
from src.models import make_linear_svm, make_logreg_l1, make_random_forest
from src.preprocessing import encode_labels
from src.visualization import save_figure

# Plot styling shared by every figure created below.
sns.set_theme(style="whitegrid")

# Output locations for figures and tables; created up front if missing.
results_dir = project_root / "results"
fig_dir = results_dir / "figures"
tables_dir = results_dir / "tables"
for out_dir in (fig_dir, tables_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Features and labels are read from two CSV files under data/raw.
raw_dir = project_root / "data" / "raw"
ds = load_dataset(
    data_path=str(raw_dir / "data.csv"),
    labels_path=str(raw_dir / "labels.csv"),
)

X = ds.X
# encode_labels returns the encoded targets together with the encoder, whose
# ``classes_`` attribute is read later to enumerate the classes.
y_enc, label_encoder = encode_labels(ds.y)

# Experiment grid: total subsample sizes and the seeds repeated for each size.
sample_sizes = [800, 400, 200, 100, 50]
seeds = list(range(5))

def run_one_experiment(X_sub, y_sub, seed):
    """Fit and score the three classifiers on one stratified 80/20 split.

    Args:
        X_sub: Feature matrix of the subsample to evaluate.
        y_sub: Encoded labels aligned with ``X_sub``.
        seed: Random state used for the split and passed to every model
            factory.

    Returns:
        A list with one dict per model, holding the keys ``model``,
        ``accuracy``, ``macro_f1``, ``n_train`` and ``n_test``.

    Note:
        Reads the module-level ``label_encoder`` to enumerate the classes
        passed to ``compute_metrics``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X_sub, y_sub, test_size=0.2, random_state=seed, stratify=y_sub
    )

    # Scaling statistics come from the training fold only, then are applied
    # to the test fold.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    class_names = list(label_encoder.classes_)
    class_ids = list(range(len(class_names)))

    # Each entry: (name, estimator, training features, test features).
    # The two linear models get standardised features; the random forest is
    # trained on the raw ones.
    candidates = [
        (
            "logreg_l1",
            make_logreg_l1(seed=seed, c=1.0),
            X_train_scaled,
            X_test_scaled,
        ),
        (
            "linear_svm",
            make_linear_svm(seed=seed, c=1.0),
            X_train_scaled,
            X_test_scaled,
        ),
        (
            "random_forest",
            make_random_forest(seed=seed, n_estimators=60),
            X_train,
            X_test,
        ),
    ]

    n_train, n_test = len(y_train), len(y_test)
    rows = []
    for name, estimator, fit_features, eval_features in candidates:
        estimator.fit(fit_features, y_train)
        scores = compute_metrics(
            y_true=y_test,
            y_pred=estimator.predict(eval_features),
            labels=class_ids,
            target_names=class_names,
        )
        rows.append(
            {
                "model": name,
                "accuracy": scores["accuracy"],
                "macro_f1": scores["macro_f1"],
                "n_train": n_train,
                "n_test": n_test,
            }
        )
    return rows

In [None]:
n_classes = len(label_encoder.classes_)
n_total = len(y_enc)
all_rows = []

for n in sample_sizes:
    # Never request more samples than the dataset holds.
    n_eff = min(n, n_total)
    # When fewer than n_classes samples would be left out, skip subsampling
    # and use the whole dataset (presumably because the stratified split needs
    # at least one left-out sample per class).
    # NOTE(review): in that case the rows are still tagged with n_eff, not
    # with the actual number of samples used.
    use_full_dataset = (n_total - n_eff) < n_classes

    for seed in seeds:
        if use_full_dataset:
            X_sub, y_sub = X, y_enc
        else:
            X_sub, _, y_sub, _ = train_test_split(
                X, y_enc, train_size=n_eff, random_state=seed, stratify=y_enc
            )

        # Tag every per-model row with the configuration that produced it.
        all_rows.extend(
            {**row, "sample_size": n_eff, "seed": seed}
            for row in run_one_experiment(X_sub, y_sub, seed)
        )

results_raw_df = pd.DataFrame(all_rows)

# Named aggregations: <metric>_mean and <metric>_std across seeds.
metric_stats = {
    f"{metric}_{stat}": (metric, stat)
    for metric in ("accuracy", "macro_f1")
    for stat in ("mean", "std")
}
grouped = results_raw_df.groupby(["model", "sample_size"], as_index=False)
summary_df = grouped.agg(**metric_stats).sort_values(["model", "sample_size"])

out_path = tables_dir / "small_sample_results.csv"
summary_df.to_csv(out_path, index=False)

print(summary_df)
print(f"Saved: {out_path}")

In [None]:
# Mean macro F1 across seeds as a function of sample size, one line per model.
perf_path = fig_dir / "04_performance_vs_n.png"

fig, ax = plt.subplots(figsize=(9, 5))
mean_plot_spec = {
    "data": summary_df,
    "x": "sample_size",
    "y": "macro_f1_mean",
    "hue": "model",
    "marker": "o",
}
sns.lineplot(ax=ax, **mean_plot_spec)
ax.set(
    title="Performance vs sample size (macro F1 mean)",
    xlabel="Sample size",
    ylabel="Macro F1 (mean across seeds)",
)
# Put a tick on each requested sample size.
ax.set_xticks(sample_sizes)

save_figure(fig, str(perf_path))
plt.show()
print(f"Saved: {perf_path}")

In [None]:
# Standard deviation of macro F1 across seeds as a function of sample size,
# one line per model.
var_path = fig_dir / "04_variance_vs_n.png"

fig, ax = plt.subplots(figsize=(9, 5))
std_plot_spec = {
    "data": summary_df,
    "x": "sample_size",
    "y": "macro_f1_std",
    "hue": "model",
    "marker": "o",
}
sns.lineplot(ax=ax, **std_plot_spec)
ax.set(
    title="Variance vs sample size (macro F1 std)",
    xlabel="Sample size",
    ylabel="Macro F1 (std across seeds)",
)
# Put a tick on each requested sample size.
ax.set_xticks(sample_sizes)

save_figure(fig, str(var_path))
plt.show()
print(f"Saved: {var_path}")

## Mini résumé

- L'expérience small-sample est répétée sur 5 seeds pour réduire l'effet d'un split unique.
- La courbe performance-vs-n met en évidence la dégradation quand la taille d'échantillon diminue.
- La courbe variance-vs-n quantifie l'instabilité en faible effectif, mesurée par l'écart-type du macro F1 entre les seeds.
- Les résultats agrégés sont sauvegardés dans `results/tables/small_sample_results.csv`.