# 02 — Dimensionality reduction

Ce notebook implémente un pipeline PCA pour :
- visualiser la variance expliquée (scree plot + variance cumulée),
- projeter les échantillons en 2D, colorés par classe,
- déterminer le nombre de composantes nécessaires pour conserver 95 % de la variance.

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Resolve the project root: the working directory when it holds `src/`,
# otherwise its parent (presumably the notebook is launched from a subfolder).
project_root = Path.cwd().resolve()
project_root = project_root if (project_root / "src").exists() else project_root.parent

# Make the `src` package importable without installing the project.
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# These imports must come after the sys.path tweak above.
from src.data_loader import load_dataset
from src.visualization import save_figure

sns.set_theme(style="whitegrid")

# Output locations for figures and tables; created up front so later cells can write freely.
results_dir = project_root / "results"
fig_dir = results_dir / "figures"
tables_dir = results_dir / "tables"
for out_dir in (fig_dir, tables_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Load the raw feature matrix and labels from data/raw.
raw_dir = project_root / "data" / "raw"
ds = load_dataset(
    data_path=str(raw_dir / "data.csv"),
    labels_path=str(raw_dir / "labels.csv"),
)
X, y = ds.X, ds.y

# Standardize the features before PCA; fit followed by transform is what
# StandardScaler.fit_transform does, spelled out in two steps.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print(
    f"Scaled matrix shape: {X_scaled.shape}",
    f"Number of classes: {y.nunique()}",
    sep="\n",
)

In [None]:
# Fit a PCA that keeps every component so the whole variance spectrum is available.
pca_full = PCA(random_state=42)
X_pca_full = pca_full.fit_transform(X_scaled)

explained = pca_full.explained_variance_ratio_
cumulative = np.cumsum(explained)

# Single source of truth for the variance target used by the search, the plot
# guide line and the labels below. Keep it in sync with the
# PCA(n_components=0.95) fit performed further down in the notebook.
variance_threshold = 0.95

# searchsorted returns the 0-based index of the first cumulative value that
# reaches the threshold; +1 turns it into a component count. The clamp keeps
# the count within the number of fitted components if rounding leaves the
# final cumulative value just below the threshold (e.g. a threshold of 1.0).
n_components_95 = int(
    min(np.searchsorted(cumulative, variance_threshold) + 1, len(cumulative))
)

print(f"Components needed for {variance_threshold:.0%} variance: {n_components_95}")

# 1-based component numbers shared by both panels.
component_numbers = np.arange(1, len(explained) + 1)

fig, axes = plt.subplots(1, 2, figsize=(13, 4))

# Left panel: per-component explained variance ratio (scree plot).
axes[0].plot(component_numbers, explained, linewidth=1)
axes[0].set_title("PCA scree plot")
axes[0].set_xlabel("Principal component")
axes[0].set_ylabel("Explained variance ratio")

# Right panel: cumulative explained variance with the threshold and the
# resulting component count marked by dashed guide lines.
axes[1].plot(component_numbers, cumulative, linewidth=1)
axes[1].axhline(
    variance_threshold,
    color="red",
    linestyle="--",
    label=f"{variance_threshold:.0%} variance",
)
axes[1].axvline(n_components_95, color="black", linestyle="--", label=f"n={n_components_95}")
axes[1].set_title("Cumulative explained variance")
axes[1].set_xlabel("Number of components")
axes[1].set_ylabel("Cumulative explained variance")
axes[1].legend()

save_figure(fig, str(fig_dir / "02_pca_scree_and_cumulative.png"))
plt.show()

In [None]:
# Project the standardized samples onto the first two principal components.
pca_2d = PCA(n_components=2, random_state=42)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Tidy frame for seaborn: one row per sample, its two coordinates and its class label.
plot_df = pd.DataFrame(X_pca_2d, columns=["PC1", "PC2"]).assign(Class=y.values)

fig, ax = plt.subplots(figsize=(8, 6))
scatter_style = {"alpha": 0.85, "s": 35}
sns.scatterplot(data=plot_df, x="PC1", y="PC2", hue="Class", ax=ax, **scatter_style)
ax.set(
    title="PCA 2D projection colored by class",
    xlabel="PC1",
    ylabel="PC2",
)
# Anchor the legend just outside the right edge of the axes.
ax.legend(title="Class", bbox_to_anchor=(1.02, 1), loc="upper left")

save_figure(fig, str(fig_dir / "02_pca_2d_projection.png"))
plt.show()

In [None]:
# A fractional n_components asks scikit-learn to keep just enough components
# to explain that share of the variance.
pca_95 = PCA(n_components=0.95, random_state=42)
X_pca_95 = pca_95.fit_transform(X_scaled)

# Persist the component matrix of the fitted PCA.
components_path = tables_dir / "pca_components_95.npy"
np.save(components_path, pca_95.components_)

# Record only the shape of the reduced matrix (not the reduced data itself).
reduced_path = tables_dir / "X_pca_95_shape.csv"
n_rows, n_kept = X_pca_95.shape
shape_summary = pd.DataFrame({"n_samples": [n_rows], "n_components_95": [n_kept]})
shape_summary.to_csv(reduced_path, index=False)

print(f"Reduced shape (95% variance): {X_pca_95.shape}")
for saved_path in (components_path, reduced_path):
    print(f"Saved: {saved_path}")

## Mini résumé

- La PCA confirme une forte dimensionnalité intrinsèque du jeu de données.
- Le `scree plot` et la variance cumulée permettent de choisir le compromis nombre de composantes / information conservée.
- La projection PCA 2D donne une première vue de la séparabilité entre classes.
- Les composantes PCA conservant 95 % de la variance (fichier `.npy`) et la forme de la matrice réduite (fichier `.csv`) sont sauvegardées pour réutilisation ultérieure.