# Installation de HypergraphPercol

In [None]:
%%bash
# Colab: system packages for CGAL/TBB/CMake.
# Fail fast like the other %%bash cells of this notebook: without this, a failing
# `apt-get update` is masked by the exit status of the install that follows it.
set -euo pipefail
apt-get update -qq
apt-get install -y -qq build-essential cmake libcgal-dev libtbb-dev libtbbmalloc2 \
    libgmp-dev libmpfr-dev libeigen3-dev

In [None]:
!pip install -q --upgrade pip setuptools wheel Cython cmake jedi

In [None]:
%%bash
set -euo pipefail
WORKDIR="${HGP_WORKDIR:-/content}"
mkdir -p "${WORKDIR}"
cd "${WORKDIR}"

# Fetch each repository: fast-forward an existing checkout, otherwise clone it.
for repo in HypergraphPercol cyminiball; do
    if [ -d "${repo}" ]; then
        git -C "${repo}" pull --ff-only
    else
        git clone "https://github.com/Ludwig-H/${repo}.git"
    fi
done


In [None]:
%%bash
set -euo pipefail
WORKDIR="${HGP_WORKDIR:-/content}"
WHEEL_DIR="${WORKDIR}/wheels"
mkdir -p "${WHEEL_DIR}"
cd "${WORKDIR}/cyminiball"

# Build the cyminiball wheel without build isolation, then install it using
# only the local wheel directory as package source.
# "--no-deps" is essential: otherwise numpy gets downloaded in version 2.3.4,
# which creates compatibility problems.
python3 -m pip wheel --no-build-isolation --no-deps --wheel-dir="${WHEEL_DIR}" .
python3 -m pip install --force-reinstall --no-deps --no-index --find-links="${WHEEL_DIR}" cyminiball


In [None]:
%%bash
set -euo pipefail
# Run the repository's CGAL setup script from the repository root.
REPO_DIR="${HGP_WORKDIR:-/content}/HypergraphPercol"
cd "${REPO_DIR}"
python3 scripts/setup_cgal.py


In [None]:
%%bash
set -euo pipefail
WORKDIR="${HGP_WORKDIR:-/content}"
INSTALL_PREFIX="${WORKDIR}/HypergraphPercol"
cd "${INSTALL_PREFIX}/CGALDelaunay"

# Brace expansion yields, in this order: EdgesCGALDelaunay{2D,3D,ND} followed by
# EdgesCGALWeightedDelaunay{2D,3D,ND}.
projects=( EdgesCGAL{,Weighted}Delaunay{2D,3D,ND} )

# Configure, build and install each project in turn (Release configuration).
for project in "${projects[@]}"; do
    build_dir="${project}/build"
    cmake -S "${project}" -B "${build_dir}" -DCMAKE_BUILD_TYPE=Release
    cmake --build "${build_dir}" --config Release
    cmake --install "${build_dir}" --prefix "${INSTALL_PREFIX}"
done


In [None]:
%%bash
set -euo pipefail
REPO_DIR="${HGP_WORKDIR:-/content}/HypergraphPercol"
cd "${REPO_DIR}"
# Reinstall the package from the local checkout, leaving its dependencies untouched.
python3 -m pip install --force-reinstall --no-deps .


In [None]:
import os  # required here: no earlier cell imports `os`, so a fresh kernel would raise NameError

# Point CGALDELAUNAY_ROOT at the CGALDelaunay directory of the cloned repository.
# NOTE(review): this is set before the `hypergraphpercol` import that follows,
# presumably because the package reads the variable — confirm.
workdir = os.environ.get("HGP_WORKDIR", "/content")
repo_root = os.path.join(workdir, "HypergraphPercol")
os.environ["CGALDELAUNAY_ROOT"] = os.path.join(repo_root, "CGALDelaunay")

from hypergraphpercol import HypergraphPercol

# Application de HypergraphPercol aux *reads* Nanopore

In [None]:
# @title Setup: dépendances et données
!pip -q install datasketch edlib numpy pandas scipy scikit-learn tqdm pyarrow

import os, sys, re, math, random, itertools, json
from pathlib import Path

# Local directory that receives the downloaded dataset files.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Raw URLs in the Microsoft repository (dataset only).
_DATASET_RAW_BASE = "https://raw.githubusercontent.com/microsoft/clustered-nanopore-reads-dataset/main"
CENTERS_URL = f"{_DATASET_RAW_BASE}/Centers.txt"
CLUSTERS_URL = f"{_DATASET_RAW_BASE}/Clusters.txt"

def download(url, dst):
    """Download `url` to `dst` unless `dst` already exists.

    The payload is first written to a sibling "<name>.part" file and only
    renamed to `dst` once the transfer has completed. An interrupted download
    therefore never leaves a truncated file at `dst`, which a later run would
    otherwise mistake for an already-downloaded one.

    Args:
        url: address of the resource to fetch.
        dst: destination path (str or Path).

    Raises:
        Whatever `urllib.request.urlretrieve` raises on failure; `dst` is not
        created in that case.
    """
    import urllib.request

    target = Path(dst)
    if target.exists():
        print(f"Déjà présent: {dst}")
        return

    print(f"Téléchargement {url} -> {dst}")
    partial = target.with_name(target.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(target)  # rename within the same directory
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)

# Fetch both dataset files into DATA_DIR.
for _url, _filename in ((CENTERS_URL, "Centers.txt"), (CLUSTERS_URL, "Clusters.txt")):
    download(_url, DATA_DIR / _filename)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/397.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.5/397.5 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hTéléchargement https://raw.githubusercontent.com/microsoft/clustered-nanopore-reads-dataset/main/Centers.txt -> data/Centers.txt
Téléchargement https://raw.githubusercontent.com/microsoft/clustered-nanopore-reads-dataset/main/Clusters.txt -> data/Clusters.txt


In [None]:
# @title Parsing des fichiers { vertical-output: true }
from typing import List, Tuple
import numpy as np
import pandas as pd

def parse_clusters(clusters_path: Path) -> Tuple[List[str], np.ndarray]:
    """Read a clusters file into parallel collections of reads and cluster labels.

    Blank lines are ignored. Every line starting with "=" is a separator that
    increments the current label; any other line is stored (stripped) as a read
    carrying the current label. The label counter starts at -1, so reads found
    before the first separator are labelled -1.

    Args:
        clusters_path: path of the text file to parse.

    Returns:
        (reads, labels): the list of read strings and an int32 array with one
        label per read, in file order.
    """
    sequences: List[str] = []
    cluster_ids: List[int] = []
    current = -1
    with open(clusters_path, "r") as handle:
        for line in map(str.strip, handle):
            if line == "":
                continue
            if line[0] == "=":  # separator line "===="
                current += 1
            else:
                sequences.append(line)
                cluster_ids.append(current)
    return sequences, np.asarray(cluster_ids, dtype=np.int32)

# Load every read together with its cluster label.
reads, labels = parse_clusters(DATA_DIR/"Clusters.txt")
n_reads = len(reads)
n_clusters_seen = labels.max() + 1
print(f"{n_reads} reads, {n_clusters_seen} clusters (certains peuvent être vides).")

# Optional: load the centers for later evaluation. The file is opened in a
# `with` block so that its handle is closed once the lines have been read
# (a bare `open(...)` inside the comprehension was never closed).
with open(DATA_DIR/'Centers.txt') as centers_file:
    centers = [line.strip() for line in centers_file if line.strip()]
print(f"{len(centers)} centers de longueur attendue 110.")


269709 reads, 10000 clusters (certains peuvent être vides).
10000 centers de longueur attendue 110.


In [None]:
# @title Construction des MinHash (k-mers) et LSH  { vertical-output: true }
from datasketch import MinHash, MinHashLSH
# from tqdm import tqdm

SEED = 1337           # seed given to Python's `random` module just below
NUM_PERM = 128        # MinHash sketch size
KMER = 15             # k-mer size for 110 nt
LSH_THRESHOLD = 0.35  # approximate Jaccard threshold -> candidates

random.seed(SEED)

def kmers(seq: str, k: int):
    """Return every length-`k` substring of `seq`, in order of start position.

    The result is an empty list when `seq` is shorter than `k`.
    """
    last_start = len(seq) - k
    # range() is empty when last_start is negative, i.e. when seq is shorter than k.
    return [seq[start:start + k] for start in range(last_start + 1)]

# Dépendances (une seule fois dans ton notebook)
!pip -q install joblib tqdm tqdm-joblib

from joblib import Parallel, delayed, cpu_count
from tqdm.auto import tqdm
try:
    from tqdm_joblib import tqdm_joblib
except ImportError:
    tqdm_joblib = None  # fallback séquentiel si non dispo

def build_minhash_for_reads(reads, k=KMER, num_perm=NUM_PERM, n_jobs=None, batch_size=64):
    """
    Build one MinHash sketch per read, in parallel with joblib when possible.

    Args:
        reads: sequence of read strings (its length sizes the progress bar).
        k: k-mer length used to cut each read into shingles.
        num_perm: number of permutations of each MinHash sketch.
        n_jobs: number of workers (default: the CPU count reported by joblib).
        batch_size: size of the task batches sent to each worker
            (reduces the pickling overhead).

    Returns:
        A list of MinHash objects, in the same order as `reads`.
    """

    if n_jobs is None:
        n_jobs = cpu_count()

    def _one_read_to_minhash(s: str):
        # Feed every k-mer of the read into a fresh sketch.
        m = MinHash(num_perm=num_perm, seed=SEED)
        for kmer in kmers(s, k):
            m.update(kmer.encode("utf8"))
        return m

    # Sequential fallback: single worker requested, or tqdm_joblib not importable.
    if n_jobs == 1 or tqdm_joblib is None:
        return [_one_read_to_minhash(s) for s in tqdm(reads, desc="MinHash", unit="read")]

    # tqdm_joblib creates its own tqdm bar from the arguments it receives. Handing
    # it an already-built bar makes it wrap that bar inside a second, unlabeled
    # one, and the labeled bar then never advances. Pass the bar options directly.
    with tqdm_joblib(total=len(reads), desc="MinHash (parallel)", unit="read"):
        sketches = Parallel(
            n_jobs=n_jobs,
            prefer="processes",  # hashing is CPU-bound -> process pool
            batch_size=batch_size
        )(delayed(_one_read_to_minhash)(s) for s in reads)
    return sketches


# One sketch per read, in the same order as `reads`.
sketches = build_minhash_for_reads(reads, k=KMER, num_perm=NUM_PERM)

# LSH index used to query candidate neighbours; each sketch is inserted under
# the integer position of its read.
lsh = MinHashLSH(threshold=LSH_THRESHOLD, num_perm=NUM_PERM)
for i, m in enumerate(sketches):
    lsh.insert(i, m)
print("Index LSH construit.")


MinHash (parallel):   0%|          | 0/269709 [00:00<?, ?read/s]

  0%|          | 0/269709 [00:00<?, ?it/s]

Index LSH construit.


In [None]:
# @title Graphe sparse: top-k voisins par nœud via distance d'édition  { vertical-output: true }
import edlib
from tqdm import tqdm

TOP_K = 20               # neighbours kept per node (after LSH)
MAX_EDGES = None         # hard cap on the number of edges (None = no limit)
NORMALIZE_BY = "maxlen"  # "maxlen" or "meanlen"

def norm_edit(a: str, b: str) -> float:
    """Return the edit distance between `a` and `b` divided by a length scale.

    The scale is the longer of the two lengths when NORMALIZE_BY is "maxlen",
    and the mean of the two lengths for any other value. It is clamped to at
    least 1 so that two empty strings do not cause a division by zero.
    """
    len_a, len_b = len(a), len(b)
    scale = max(len_a, len_b) if NORMALIZE_BY == "maxlen" else 0.5 * (len_a + len_b)
    edit_distance = edlib.align(a, b, task="distance")["editDistance"]
    return edit_distance / max(1, scale)

# Build the undirected edge list: for every read, keep its TOP_K LSH candidates
# ranked by estimated Jaccard similarity, and compute one normalized edit
# distance per unordered pair. Each edge is (lo, hi, distance, jaccard), lo < hi.
edges = []
edge_count = 0
n_sketches = len(sketches)
# Each unordered pair must be emitted exactly once. Skipping every j < i is not
# enough: top-k lists are not symmetric, so a pair where j is in i's top-k but
# i is not in j's top-k would be dropped entirely. Track emitted pairs instead.
seen_pairs = set()
for i, mi in tqdm(enumerate(sketches), total=n_sketches, desc="Voisins + distances"):
    candidates = lsh.query(mi)
    candidates = [j for j in candidates if j != i]
    if not candidates:
        continue
    # approximate Jaccard score, used to prioritise the candidates
    scored = [(j, mi.jaccard(sketches[j])) for j in candidates]
    scored.sort(key=lambda t: t[1], reverse=True)
    keep = scored[:TOP_K]
    for j, jac in keep:
        lo, hi = (i, j) if i < j else (j, i)
        pair_key = lo * n_sketches + hi  # a single int identifies the unordered pair
        if pair_key in seen_pairs:
            # edge already built from the other endpoint (i<->j is undirected)
            continue
        seen_pairs.add(pair_key)
        d = norm_edit(reads[lo], reads[hi])
        edges.append((lo, hi, float(d), float(jac)))
        edge_count += 1
        if MAX_EDGES and edge_count >= MAX_EDGES:
            break
    if MAX_EDGES and edge_count >= MAX_EDGES:
        break

print(f"{len(edges)} arêtes construites (non orientées).")


Voisins + distances: 100%|██████████| 269709/269709 [00:49<00:00, 5488.92it/s]


1439780 arêtes construites (non orientées).
