# HGP clusterer sur SemanticKITTI (local)
Ce notebook prépare l'environnement, récupère **hgp_clusterer** depuis votre GitHub, lance le clustering **en local** sur le benchmark **SemanticKITTI**, enregistre les prédictions panoptiques et affiche un aperçu des résultats.

- Mêmes **pré-traitements** que dans vos notebooks (sélection par `PREPROC`: `none`, `bev_xy`, `bev_xyzi`, `polar`)
- **Aucun post-traitement** (pas de KNN, pas de merges, pas de box split)
- Sauvegarde des `.label` panoptiques dans la structure attendue par SemanticKITTI
- Évaluation THINGS-only optionnelle via `semantic-kitti-api`

In [None]:
%%bash
# @title 0) Dépendances système et Python
# The %%bash cell magic must be the very first line of the cell for IPython to
# hand the body to bash, so the Colab title directive is placed after it.
set -euo pipefail
apt-get update -qq
# CGAL, TBB, CMake and the usual build dependencies
apt-get install -y -qq build-essential cmake libcgal-dev libtbb-dev libtbbmalloc2   libgmp-dev libmpfr-dev libeigen3-dev

In [None]:
# A few useful Python modules: build tooling first, then the scientific stack
!pip -q install --upgrade pip setuptools wheel Cython cmake jedi
!pip -q install numpy scipy scikit-learn plotly tqdm joblib
# shapely is not required without post-processing, so it is deliberately left out here

In [None]:
# @title 1) Récupération du dépôt hgp_clusterer (GitHub) + cyminiball + CGALDelaunay
import os, subprocess, sys
from pathlib import Path

# === Key parameter: Git URL of the hgp_clusterer repository ===
# Replace with the URL of your own repository if needed
# (the HGP_REPO_URL environment variable takes precedence over the default).
REPO_URL = os.environ.get("HGP_REPO_URL", "https://github.com/Ludwig-H/HGP-clusterer.git")

# Working directory that receives every checkout and build artefact;
# HGP_WORKDIR overrides the default "/content".
WORKDIR = Path(os.environ.get("HGP_WORKDIR", "/content")).resolve()
WORKDIR.mkdir(parents=True, exist_ok=True)

# Checkout locations of the two Git repositories used below
repo_dir = WORKDIR / "HGP-clusterer"
cymini_dir = WORKDIR / "cyminiball"

def _run(cmd, **kw):
    print("+", cmd)
    subprocess.run(cmd, check=True, **kw)

# Clone hgp_clusterer, or fast-forward an already existing checkout
if repo_dir.exists():
    _run(["git", "-C", str(repo_dir), "pull", "--ff-only"])
else:
    _run(["git", "clone", REPO_URL, str(repo_dir)])

# cyminiball (lightweight library used by the project): same clone-or-update logic
if cymini_dir.exists():
    _run(["git", "-C", str(cymini_dir), "pull", "--ff-only"])
else:
    _run(["git", "clone", "https://github.com/Ludwig-H/cyminiball.git", str(cymini_dir)])

# Build cyminiball into a local wheel, then install it from that wheel directory
# only (--no-index), which is reliable on Colab
wheels = WORKDIR / "wheels"
wheels.mkdir(parents=True, exist_ok=True)
_run([sys.executable, "-m", "pip", "wheel", "--no-build-isolation", "--no-deps", "--wheel-dir", str(wheels), str(cymini_dir)])
_run([sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "--no-index", f"--find-links={wheels}", "cyminiball"])

# Build the CGALDelaunay binaries and install them into the repository
cgal_root = repo_dir / "CGALDelaunay"
projects = [
    "EdgesCGALDelaunay2D",
    "EdgesCGALDelaunay3D",
    # Spelled out directly: the former "EdgesCGALDelaUNayND".replace("UN", "N")
    # evaluated to "EdgesCGALDelaNayND", which does not follow the naming of
    # the sibling projects.
    "EdgesCGALDelaunayND",
    "EdgesCGALWeightedDelaunay2D",
    "EdgesCGALWeightedDelaunay3D",
    "EdgesCGALWeightedDelaunayND",
]
for p in projects:
    src = cgal_root / p
    # Fail before creating a stray build folder when the project is missing
    if not src.is_dir():
        raise FileNotFoundError(f"Projet CGAL introuvable: {src}")
    bld = src / "build"
    bld.mkdir(parents=True, exist_ok=True)
    # Configure, build, then install with the repository root as prefix
    _run(["cmake", "-S", str(src), "-B", str(bld), "-DCMAKE_BUILD_TYPE=Release"])
    _run(["cmake", "--build", str(bld), "--config", "Release", "-j"])
    _run(["cmake", "--install", str(bld), "--prefix", str(repo_dir)])

# Environment variable required by hgp_clusterer
os.environ["CGALDELAUNAY_ROOT"] = str(cgal_root)
print("CGALDELAUNAY_ROOT =", os.environ["CGALDELAUNAY_ROOT"])

print("Dépôt hgp_clusterer prêt: ", repo_dir)

In [None]:
# @title 2) Imports + configuration centrale
import os, json
import numpy as np
from pathlib import Path
from typing import Optional, Dict

# Import of the HGP core
# The checkout path is rebuilt from HGP_WORKDIR so this cell does not depend on
# variables created by the setup cell.
repo_dir = Path(os.environ.get("HGP_WORKDIR", "/content")) / "HGP-clusterer"
os.environ["CGALDELAUNAY_ROOT"] = str(repo_dir / "CGALDelaunay")
# NOTE(review): nothing visible in this notebook pip-installs hgp_clusterer or
# adds repo_dir to sys.path — confirm this import resolves in a fresh runtime.
from hgp_clusterer import HypergraphPercol

# === Folders ===
DATA_ROOT = "/content/drive/MyDrive/Datasets/semantic_kitti"  #@param {type:"string"}
# Semantics used for clustering: "oracle" (ground truth), "waffleiron" (predictions), "custom" (your own predictions)
SEMANTICS_SOURCE = "waffleiron"  #@param ["oracle", "waffleiron", "custom"]
# For "waffleiron" or "custom": root folder containing sequences/*/predictions/*.{label,labels,npy,npz}
PRED_ROOT = "/content/drive/MyDrive/Datasets/semantic_kitti/WaffleIron"  #@param {type:"string"}

# === Sequences to process ===
SEQUENCES = ["08"]  #@param {type:"raw"}

# === HGP parameters (identical to your local notebook) ===
K = 5  #@param {type:"integer"}
min_cluster_size = 50  #@param {type:"integer"}
min_samples = K + 1    #@param {type:"integer"}
method = "eom"         #@param ["eom","leaf"]
splitting = None       #@param ["None", "balanced", "tight"]
weight_face = "lambda" #@param ["lambda","uniform","unique"]
label_all_points = False  #@param {type:"boolean"}
return_multi_clusters = False  #@param {type:"boolean"}
complex_chosen = "orderk_delaunay"  #@param ["orderk_delaunay","delaunay","weighted_delaunay"]
expZ = 3  #@param {type:"integer"}
HGP_VERBOSE = True  #@param {type:"boolean"}

# === Pre-processing (same options) — no post-processing in this notebook ===
PREPROC = "bev_xy"  #@param ["none","bev_xy","bev_xyzi","polar"]

# === Output: same folder structure as the SemanticKITTI leaderboard ===
# The run name encodes K, min_cluster_size, expZ and the pre-processing mode.
RUN_NAME = f"HGP-K{K}_min{min_cluster_size}_expZ{expZ}_pre{PREPROC}_NOPP"
OUT_ROOT = f"/content/drive/MyDrive/Datasets/semantic_kitti/experiments_semkitti/{RUN_NAME}"
Path(OUT_ROOT).mkdir(parents=True, exist_ok=True)
print("Sortie:", OUT_ROOT)

# === THINGS / STUFF (raw SemanticKITTI labels) ===
THING_RAW_IDS = [10,11,15,18,20,30,31,32]  # car, bicycle, motorcycle, truck, other-vehicle, person, bicyclist, motorcyclist
STUFF_RAW_IDS = [40,44,48,49,50,51,70,71,72,80,81]

# Mapping from raw SemanticKITTI labels to the 20 training classes (used for evaluation)
MAPPER = {0: 0, 1: 0, 10: 1, 11: 2, 13: 5, 15: 3, 16: 5, 18: 4, 20: 5, 30: 6, 31: 7, 32: 8,
          40: 9, 44: 10, 48: 11, 49: 12, 50: 13, 51: 14, 52: 0, 60: 9,
          70: 15, 71: 16, 72: 17, 80: 18, 81: 19, 99: 0, 252: 1, 253: 7, 254: 6,
          255: 8, 256: 5, 257: 5, 258: 4, 259: 5}

In [None]:
# @title 3) Utilitaires I/O (KITTI) + encodage panoptique
import numpy as np
from pathlib import Path
from typing import Optional

def kitti_scan_paths(seq_dir: Path):
    """Locate the scan and label folders of one sequence.

    Returns ``(velodyne_dir, labels_dir, stems)`` where ``stems`` is the
    sorted list of ``*.bin`` file names (extension removed) found under
    ``velodyne``. Only the velodyne folder is required to exist; an
    ``AssertionError`` is raised otherwise.
    """
    scans = seq_dir / "velodyne"
    annotations = seq_dir / "labels"
    assert scans.is_dir(), f"Manque velodyne sous {seq_dir}"
    stems = [entry.stem for entry in scans.glob("*.bin")]
    stems.sort()
    return scans, annotations, stems

def read_points_bin(bin_path: Path) -> np.ndarray:
    """Read a raw float32 scan file and view it as rows of (x, y, z, remission)."""
    flat = np.fromfile(str(bin_path), dtype=np.float32)
    columns = 4  # x, y, z, remission
    return flat.reshape((-1, columns))

def read_label_file(label_path: Path) -> np.ndarray:
    """Read a ``.label`` file as a flat array of packed uint32 words.

    Each word carries the instance id in its upper 16 bits and the semantic
    id in its lower 16 bits.
    """
    packed = np.fromfile(str(label_path), dtype=np.uint32)
    return packed

def pack_panoptic(semantic: np.ndarray, instance: np.ndarray) -> np.ndarray:
    """Combine per-point semantic and instance ids into packed uint32 labels.

    The instance id goes into the upper 16 bits and the semantic id into the
    lower 16 bits. Both arrays must have the same shape.
    """
    assert semantic.shape == instance.shape
    upper = instance.astype(np.uint32) << 16
    lower = semantic.astype(np.uint32)
    return upper | lower

def unpack_semantic(label32: np.ndarray) -> np.ndarray:
    """Extract the semantic id (lower 16 bits) of packed labels as uint16."""
    low_bits = label32 & 0xFFFF
    return low_bits.astype(np.uint16)

def _find_pred_file(pred_dir: Path, stem: str) -> Optional[Path]:
    # Supporte .label, .labels, .npy, .npz
    for ext in (".label", ".labels", ".npy", ".npz"):
        p = pred_dir / f"{stem}{ext}"
        if p.is_file():
            return p
    return None

def _load_semantics_generic(pred_path: Path) -> np.ndarray:
    if pred_path.suffix in (".label", ".labels"):
        u32 = np.fromfile(str(pred_path), dtype=np.uint32)
        return (u32 & 0xFFFF).astype(np.uint16)
    if pred_path.suffix == ".npy":
        arr = np.load(str(pred_path))
        arr = np.array(arr).reshape(-1)
        return arr.astype(np.uint16)
    if pred_path.suffix == ".npz":
        data = np.load(str(pred_path))
        # prend la première clé par défaut
        key = list(data.keys())[0]
        arr = np.array(data[key]).reshape(-1)
        return arr.astype(np.uint16)
    raise ValueError(f"Extension non supportée: {pred_path}")

def load_semantics_for_scan(seq_root: Path, stem: str) -> np.ndarray:
    """Return the per-point semantic ids of one scan.

    With ``SEMANTICS_SOURCE == "oracle"`` the ids come from the ground-truth
    ``.label`` file of the sequence. For any other source they are read from
    ``PRED_ROOT/sequences/<sequence>/predictions/<stem>.*``; an
    ``AssertionError`` is raised when no such file exists.
    """
    if SEMANTICS_SOURCE != "oracle":
        predictions = Path(PRED_ROOT) / "sequences" / seq_root.name / "predictions"
        found = _find_pred_file(predictions, stem)
        assert found is not None, f"Prediction manquante: {predictions}/{stem}.*"
        return _load_semantics_generic(found)

    # Oracle mode: take the semantic half of the ground-truth labels
    label_dir = kitti_scan_paths(seq_root)[1]
    return unpack_semantic(read_label_file(label_dir / f"{stem}.label"))

# Visible confirmation that the I/O helper cell ran to completion
print("I/O utils chargés.")

In [None]:
# @title 4) Pré-traitements (features HGP) — aucun post-traitement ici
import numpy as np

def compute_features(points_xyzi: np.ndarray, mode: str = "none") -> np.ndarray:
    """
    Build the feature matrix handed to the HGP clustering.

    points_xyzi: (N,4) [x,y,z,intensity]
    mode:
      - "none":     x, y, z
      - "bev_xy":   x, y (top view)
      - "bev_xyzi": x, y, intensity
      - "polar":    r = sqrt(x^2 + y^2), theta = atan2(y, x), z
    Any other mode raises ValueError.
    """
    # Columns are split up front whatever the mode, so an input with fewer
    # than four columns fails here.
    x, y, z, i = (points_xyzi[:, col] for col in range(4))

    if mode == "none":
        return points_xyzi[:, :3]
    if mode in ("bev_xy", "bev_xyzi"):
        columns = [x, y] if mode == "bev_xy" else [x, y, i]
        return np.stack(columns, axis=1)
    if mode != "polar":
        raise ValueError(f"Mode PREPROC inconnu: {mode}")

    radius = np.sqrt(x**2 + y**2)
    angle = np.arctan2(y, x)
    return np.stack([radius, angle, z], axis=1)

def assign_instances_from_clusters(cluster_ids: np.ndarray, class_ids: np.ndarray, thing_ids: list) -> np.ndarray:
    """
    Turn per-class cluster labels into instance ids that are unique within a scan.

    - cluster_ids: clustering labels (>0 = instance; 0 or negative = noise)
    - class_ids:   raw semantic label of each clustered point, 0 for points
                   that were not clustered
    - thing_ids:   list of raw 'thing' classes; the position of a class in
                   this list selects its id range

    Returns an int32 array with one id per point. Clusters of the class at
    position i are renumbered 1..K (in increasing order of their original
    label) and offset by 1000*(i+1). Noise and non-clustered points get 0.
    Ranges are 1000 apart, so a class with more than 999 clusters would spill
    into the range of the next class.

    Raises KeyError when class_ids contains a non-zero class missing from thing_ids.
    """
    cluster_ids = np.asarray(cluster_ids).reshape(-1)
    class_ids = np.asarray(class_ids).reshape(-1)
    inst = np.zeros(cluster_ids.shape, dtype=np.int32)
    base = {cid: 1000 * (i + 1) for i, cid in enumerate(thing_ids)}
    for cid in np.unique(class_ids):
        if cid == 0:
            continue
        offset = base[int(cid)]
        # Only real clusters get an id. Noise stays at 0: adding the class
        # offset to it would merge all noise points of the class into one
        # spurious instance.
        selected = (class_ids == cid) & (cluster_ids > 0)
        if not selected.any():
            continue
        # np.unique sorts the labels, so the inverse index is the rank of
        # each cluster label within the class (0-based).
        _, rank = np.unique(cluster_ids[selected], return_inverse=True)
        inst[selected] = offset + rank.reshape(-1) + 1
    return inst

In [None]:
# @title 5) Pipeline HGP: par scan et par classe (sans post-processing)
from tqdm import tqdm

def run_hgp_on_sequence(seq_id: str) -> bool:
    """Cluster every scan of one sequence with HGP and write panoptic ``.label`` files.

    For each scan, the points of every class in ``THING_RAW_IDS`` are clustered
    independently with ``HypergraphPercol`` (classes with fewer than
    ``min_cluster_size`` points are skipped). The resulting instance ids are
    packed with the raw semantic labels and written to
    ``OUT_ROOT/sequences/<seq_id>/predictions/<stem>.label``.
    No post-processing is applied.

    Semantics are loaded scan by scan rather than for the whole sequence up
    front, so memory use does not grow with the sequence length.

    Returns:
        True once every scan of the sequence has been written.

    Raises:
        ValueError: when the semantic array and the point cloud of a scan do
            not have the same number of points.
    """
    seq_root = Path(DATA_ROOT) / "sequences" / seq_id
    out_pred_dir = Path(OUT_ROOT) / "sequences" / seq_id / "predictions"
    out_pred_dir.mkdir(parents=True, exist_ok=True)

    vel_dir, _, stems = kitti_scan_paths(seq_root)
    cgal_dir = str(repo_dir / "CGALDelaunay")  # loop-invariant, computed once

    # Each scan is clustered independently, one 'thing' class at a time
    for stem in tqdm(stems, desc=f"[{seq_id}] Clustering HGP"):
        pts = read_points_bin(vel_dir / f"{stem}.bin")  # (N,4)
        feats = compute_features(pts, PREPROC)          # (N,d)
        # Raw semantics, also reused for the final panoptic labels
        sem_raw = load_semantics_for_scan(seq_root, stem).astype(np.int32)

        N = feats.shape[0]
        if sem_raw.shape[0] != N:
            # Without this check the boolean mask below fails with an opaque IndexError
            raise ValueError(
                f"Nombre de points incohérent pour {seq_id}/{stem}: "
                f"sémantique={sem_raw.shape[0]} points={N}"
            )

        cluster_ids = np.zeros(N, dtype=np.int32)
        class_ids = np.zeros(N, dtype=np.int32)

        for cid in THING_RAW_IDS:
            mask = (sem_raw == cid)
            n = int(mask.sum())
            if n < min_cluster_size:
                continue

            # Direct (local) call to HypergraphPercol on the points of this class
            labels = HypergraphPercol(
                M=feats[mask],
                K=K,
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                method=method,
                splitting=splitting,
                weight_face=weight_face,
                label_all_points=label_all_points,
                return_multi_clusters=return_multi_clusters,
                complex_chosen=complex_chosen,
                expZ=expZ,
                cgal_root=cgal_dir,
                verbeux=HGP_VERBOSE,
            )
            labels = np.asarray(labels).reshape(-1).astype(np.int32)
            # hgp_clusterer often returns -1 for noise
            # NOTE(review): label 0 is treated as noise downstream; if
            # HypergraphPercol numbers its clusters from 0, the first cluster
            # is lost here — confirm the labelling convention.
            labels[labels < 0] = 0

            # Scatter the per-class result back into the per-scan arrays
            cluster_ids[mask] = labels
            class_ids[mask] = cid

        # Final panoptic instance ids
        inst = assign_instances_from_clusters(cluster_ids, class_ids, THING_RAW_IDS)

        # Write the panoptic .label (raw semantics + instance id for clustered THINGS points)
        panoptic = pack_panoptic(sem_raw.astype(np.uint16), inst.astype(np.uint16))
        (out_pred_dir / f"{stem}.label").write_bytes(panoptic.astype(np.uint32).tobytes())

    return True

# Loop over the requested sequences; a falsy result from run_hgp_on_sequence
# aborts the whole run
for s in tqdm(SEQUENCES, desc="Séquences"):
    ok = run_hgp_on_sequence(s)
    if not ok:
        raise RuntimeError(f"Échec sur séquence {s}")

print("Terminé. Prédictions écrites sous:", OUT_ROOT)

In [None]:
# @title 6) Évaluation THINGS-only (optionnelle, via semantic-kitti-api)
import os, sys, subprocess, json, re
from glob import glob
import numpy as np
from pathlib import Path

EVAL_DIR = Path("/tmp/semkitti_eval")
API_DIR  = EVAL_DIR / "semantic-kitti-api"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

# Clone the API once; later runs of this cell reuse the checkout
if not API_DIR.exists():
    subprocess.run(["git", "clone", "https://github.com/PRBonn/semantic-kitti-api.git", str(API_DIR)], check=True)

# Guarded so that re-running the cell does not keep growing sys.path
if str(API_DIR) not in sys.path:
    sys.path.insert(0, str(API_DIR))
try:
    from evaluation.eval_pq import PanopticEval  # type: ignore
except ImportError:
    # NOTE(review): in the upstream PRBonn checkout the panoptic evaluator
    # appears to live in auxiliary/eval_np.py — confirm against the clone.
    from auxiliary.eval_np import PanopticEval  # type: ignore

# Heuristic split choice from SEQUENCES: "valid" as soon as "08" is listed,
# otherwise "test" if any of sequences 11..21 is listed, otherwise "train"
split = "valid" if "08" in SEQUENCES else ("test" if any(s in ("11","12","13","14","15","16","17","18","19","20","21") for s in SEQUENCES) else "train")

# MAPPER and class lists (same raw-label -> training-class mapping as in the configuration cell)
MAPPER = {0: 0, 1: 0, 10: 1, 11: 2, 13: 5, 15: 3, 16: 5, 18: 4, 20: 5, 30: 6, 31: 7, 32: 8,
          40: 9, 44: 10, 48: 11, 49: 12, 50: 13, 51: 14, 52: 0, 60: 9,
          70: 15, 71: 16, 72: 17, 80: 18, 81: 19, 99: 0, 252: 1, 253: 7, 254: 6,
          255: 8, 256: 5, 257: 5, 258: 4, 259: 5}
# Training-class ids whose instance ids are zeroed in predictions and which
# are ignored by the evaluator below
STUFF_CLASS_IDS = [9,10,11,12,13,14,15,16,17,18,19]

# Sequence numbers belonging to each phase ("valid" and "val" are synonyms)
PHASE_SCENES = {
    "train": [0,1,2,3,4,5,6,7,9,10],
    "valid": [8],
    "val":   [8],
    "test":  [11,12,13,14,15,16,17,18,19,20,21],
}

def _frames(root, phase):
    """List the ``velodyne/*.bin`` paths of every sequence of a phase.

    ``"valid"`` is treated as ``"val"``. Sequences are visited in the order
    given by ``PHASE_SCENES`` and the files of each sequence are sorted.
    """
    key = "val" if phase == "valid" else phase
    per_sequence = (
        sorted(glob(os.path.join(root, "sequences", f"{seq:02d}", "velodyne", "*.bin")))
        for seq in PHASE_SCENES[key]
    )
    return [path for chunk in per_sequence for path in chunk]

def _load_gt(bin_path):
    """Load the ground truth of one scan as ``(training-class ids, instance ids)``.

    The label file is ``<sequence>/labels/<stem>.label``, next to the
    ``velodyne`` folder that contains ``bin_path``. Raw semantic ids are
    remapped through ``MAPPER``; raw ids missing from ``MAPPER`` map to 0.
    """
    # Resolve the sibling folder explicitly: a textual replace of "velodyne"
    # would also rewrite any other occurrence of that word in the path.
    scan = Path(bin_path)
    label_file = scan.parent.parent / "labels" / f"{scan.stem}.label"
    u32 = np.fromfile(str(label_file), dtype=np.uint32)

    # Lookup table over the whole 16-bit label range: one vectorised indexing
    # step instead of a Python call per point, and it also handles empty scans.
    lut = np.zeros(1 << 16, dtype=np.int32)
    lut[list(MAPPER.keys())] = list(MAPPER.values())

    gt_sem = lut[u32 & 0xFFFF]
    gt_inst = u32 >> 16
    return gt_sem, gt_inst

def _load_pred(pred_root, bin_path):
    """Load the prediction of one scan as ``(training-class ids, instance ids)``.

    The file is read from ``<pred_root>/sequences/<sequence>/predictions/<stem>.label``,
    the layout used when the predictions are written. Raw semantic ids are
    remapped through ``MAPPER`` (ids missing from it map to 0), and instance
    ids are reset to 0 on points whose class is 0 or in ``STUFF_CLASS_IDS``.
    """
    scan = Path(bin_path)
    seq_name = scan.parent.parent.name  # .../sequences/<seq>/velodyne/<stem>.bin
    pred_file = Path(pred_root) / "sequences" / seq_name / "predictions" / f"{scan.stem}.label"
    u32 = np.fromfile(str(pred_file), dtype=np.uint32)

    # Lookup table over the whole 16-bit label range: one vectorised indexing
    # step instead of a Python call per point, and it also handles empty scans.
    lut = np.zeros(1 << 16, dtype=np.int32)
    lut[list(MAPPER.keys())] = list(MAPPER.values())

    pred_sem = lut[u32 & 0xFFFF]
    pred_inst = u32 >> 16  # fresh array, safe to modify in place
    # Only THINGS keep an instance id
    pred_inst[np.isin(pred_sem, STUFF_CLASS_IDS + [0])] = 0
    return pred_sem, pred_inst

frames = _frames(DATA_ROOT, split)
# Keep only the sequences listed in SEQUENCES: other sequences of the split
# have no prediction files from this run, and reading them would fail.
wanted = {str(seq) for seq in SEQUENCES}
frames = [f for f in frames if Path(f).parent.parent.name in wanted]

ignore = [0] + STUFF_CLASS_IDS
evaluator = PanopticEval(20, ignore=ignore, min_points=int(min_cluster_size))

print(f"Evaluating scans [{split}] — THINGS only | frames={len(frames)}")
for bin_path in frames:
    gt_sem, gt_inst = _load_gt(bin_path)
    pred_sem, pred_inst = _load_pred(OUT_ROOT, bin_path)
    if pred_sem.shape[0] != gt_sem.shape[0]:
        raise RuntimeError(f"Point count mismatch for {bin_path}: pred={pred_sem.shape[0]} gt={gt_sem.shape[0]}")
    evaluator.addBatch(pred_sem, pred_inst, gt_sem, gt_inst)

res = evaluator.getPQ()
print("\nRésultats panoptiques (THINGS):")
# `default` converts NumPy scalars/arrays so the dump cannot fail on them
print(json.dumps(res, indent=2, default=lambda o: o.tolist() if hasattr(o, "tolist") else str(o)))

# Summary
if isinstance(res, dict):
    pq = res.get("All", {})
    if isinstance(pq, dict):
        pq_th = pq.get("pq", None)
        rq_th = pq.get("rq", None)
        sq_th = pq.get("sq", None)
        if pq_th is not None and rq_th is not None and sq_th is not None:
            print(f"\nPQ_th : {pq_th:.3f} | RQ_th : {rq_th:.3f} | SQ_th : {sq_th:.3f}")
elif isinstance(res, (tuple, list)) and len(res) >= 3:
    # NOTE(review): the upstream evaluator appears to return
    # (PQ, SQ, RQ, per-class PQ, per-class SQ, per-class RQ) — confirm the order.
    pq_th, sq_th, rq_th = (float(v) for v in res[:3])
    print(f"\nPQ_th : {pq_th:.3f} | RQ_th : {rq_th:.3f} | SQ_th : {sq_th:.3f}")

In [None]:
# @title 7) Visualisation 3D rapide (Plotly) — un scan
import numpy as np
import plotly.graph_objects as go
from pathlib import Path

def visualize_one(seq_id="08", index=0, decimate=10):
    """Show a 3D scatter plot of one predicted scan, coloured by instance id.

    seq_id:   sequence folder name under DATA_ROOT/sequences
    index:    scan position in the sorted scan list (wraps around its length)
    decimate: keep one point out of ``decimate`` (values below 1 keep every point)

    Points with instance id 0 are grey; every other id gets a palette colour
    chosen by a generator seeded with the id, so colours are stable across calls.
    """
    seq_root = Path(DATA_ROOT) / "sequences" / seq_id
    vel_dir, label_dir, stems = kitti_scan_paths(seq_root)

    stem = stems[int(index) % len(stems)]

    pts = read_points_bin(vel_dir / f"{stem}.bin")
    pred_path = Path(OUT_ROOT) / "sequences" / seq_id / "predictions" / f"{stem}.label"
    pred32 = read_label_file(pred_path)

    pred_sem = unpack_semantic(pred32)  # not used by the plot itself
    pred_inst = (pred32.astype(np.uint32) >> 16).astype(np.int32)

    # Regular subsampling of the scan
    step = max(1, int(decimate))
    keep = np.arange(0, pts.shape[0], step)
    x, y, z = (pts[keep, axis] for axis in range(3))
    inst = pred_inst[keep]

    palette = ["#e6194b","#3cb44b","#ffe119","#0082c8","#f58231","#911eb4","#46f0f0","#f032e6","#d2f53c",
               "#fabebe","#008080","#e6beff","#aa6e28","#fffac8","#800000","#aaffc3","#808000","#ffd8b1","#000080","#808080"]
    lut = {0: '#808080'}
    for inst_id in (int(u) for u in np.unique(inst)):
        if inst_id != 0:
            # stable colour per id
            pick = np.random.default_rng(inst_id + 12345).integers(0, len(palette))
            lut[inst_id] = palette[int(pick)]

    color = np.array([lut[int(v)] for v in inst])

    scatter = go.Scatter3d(
        x=x, y=y, z=z, mode="markers",
        marker=dict(size=2, color=color, opacity=0.9),
        showlegend=False
    )
    fig = go.Figure(data=[scatter])
    fig.update_layout(width=900, height=700, title=f"Prediction HGP — seq {seq_id} / frame {stem} (subsample 1/{step})")
    fig.show()

# Exemple d'appel:
# visualize_one("08", 0, 10)