# 01 — Data exploration (minimal)

Objectifs de cette étape :
- vérifier le nombre de classes et leur distribution,
- inspecter les statistiques globales (min / max / mean),
- détecter des valeurs aberrantes simples,
- visualiser le déséquilibre de classes et sauvegarder les figures.

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Make the project root importable from the notebook context.
# The root is the nearest directory containing a "src" folder, searched from
# the working directory upwards through all of its ancestors, so the notebook
# also works when the kernel is started from a nested folder. If no ancestor
# contains "src", fall back to the parent of the working directory (the
# behavior of the earlier one-level check).
start_dir = Path.cwd().resolve()
project_root = next(
    (
        candidate
        for candidate in (start_dir, *start_dir.parents)
        if (candidate / "src").exists()
    ),
    start_dir.parent,
)
# Prepend only once so re-running the cell does not duplicate the entry.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.data_loader import load_dataset
from src.visualization import save_figure

# Plot theme and the folder that receives every figure saved by this notebook.
sns.set_theme(style="whitegrid")
fig_dir = project_root / "results" / "figures"

# Load the dataset from the two CSV files under data/raw.
raw_dir = project_root / "data" / "raw"
ds = load_dataset(
    data_path=str(raw_dir / "data.csv"),
    labels_path=str(raw_dir / "labels.csv"),
)
X = ds.X
y = ds.y

# Quick size report: rows, columns and number of distinct labels.
n_samples, n_features = X.shape[0], X.shape[1]
print(f"Samples: {n_samples} | Features: {n_features} | Classes: {y.nunique()}")

In [None]:
# Sample count per class (largest first) and each class's share in percent.
class_counts = y.value_counts().sort_values(ascending=False)
class_pct = class_counts.div(class_counts.sum()).mul(100).round(2)

distribution_table = pd.DataFrame({"count": class_counts, "percent": class_pct})
print("Class distribution:")
print(distribution_table)

# One bar per class; hue mirrors x so each class gets its own color,
# and the redundant legend is suppressed.
fig, ax = plt.subplots(figsize=(11, 5))
sns.barplot(
    x=class_counts.index,
    y=class_counts.values,
    hue=class_counts.index,
    dodge=False,
    legend=False,
    ax=ax,
)
ax.set(
    title="Class imbalance - sample count per cancer class",
    xlabel="Class",
    ylabel="Number of samples",
)
ax.tick_params(axis="x", rotation=45)
save_figure(fig, str(fig_dir / "01_class_imbalance_barplot.png"))
plt.show()

In [None]:
# Dataset-wide min / max / mean, reduced per column first and then across columns.
column_min = X.min()
column_max = X.max()
column_mean = X.mean()
global_stats = {
    "global_min": float(column_min.min()),
    "global_max": float(column_max.max()),
    "global_mean": float(column_mean.mean()),
}
print("Global feature stats:", global_stats)

# Per-sample (row-wise) summaries; sample_mean is reused by the outlier check.
sample_mean = X.mean(axis=1)
sample_std = X.std(axis=1)

# Two side-by-side histograms sharing the same settings, one per summary.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    (
        sample_mean,
        "#4C72B0",
        "Distribution of sample-wise mean expression",
        "Mean expression per sample",
    ),
    (
        sample_std,
        "#55A868",
        "Distribution of sample-wise std expression",
        "Std expression per sample",
    ),
]
for panel_ax, (values, color, title, xlabel) in zip(axes, panels):
    sns.histplot(values, bins=40, kde=True, ax=panel_ax, color=color)
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel("Count")

save_figure(fig, str(fig_dir / "01_sample_expression_stats.png"))
plt.show()

In [None]:
# Simple outlier detection: flag samples whose mean expression lies more than
# 3 population standard deviations (ddof=0) away from the average sample mean.
z_scores = (sample_mean - sample_mean.mean()) / sample_mean.std(ddof=0)
outlier_mask = z_scores.abs() > 3
n_outliers = int(outlier_mask.sum())

print(f"Potential outlier samples (|z| > 3): {n_outliers} / {len(sample_mean)}")

# Integer positions (0..n-1) of the flagged samples. The base series below is
# drawn from y-values only, so its x-axis is positional; the overlay must use
# the same coordinates. Using the index labels instead would misplace the red
# markers whenever the index is not a default 0..n-1 range (e.g. sample IDs).
outlier_positions = outlier_mask.to_numpy().nonzero()[0]
mean_values = sample_mean.to_numpy()

fig = plt.figure(figsize=(12, 4))
plt.plot(mean_values, marker="o", linestyle="", markersize=3, alpha=0.65)
plt.scatter(
    outlier_positions,
    mean_values[outlier_positions],
    color="red",
    s=18,
    label="Potential outliers",
)
plt.title("Outlier check using sample-wise mean expression")
plt.xlabel("Sample index")
plt.ylabel("Mean expression")
plt.legend()
save_figure(fig, str(fig_dir / "01_outlier_check_sample_mean.png"))
plt.show()

## Mini résumé

- Le dataset contient **801 échantillons** et **20531 gènes** (features).
- La distribution des classes met en évidence le niveau de **class imbalance** (voir bar plot).
- Les statistiques globales (min/max/mean) sont calculées pour vérifier l’échelle des valeurs.
- Un contrôle simple des valeurs aberrantes a été réalisé via le z-score de la moyenne d’expression par échantillon (seuil : |z| > 3).
- Figures sauvegardées dans `results/figures/`.