In [1]:
# Update working directory

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import os
import itertools
import pandas as pd
import torch   # ⬅️ nécessaire pour charger les embeddings AAE (.pt)

# Fonctions internes déjà existantes
from scripts.visualize_projection.pplots_new import (
    read_embeddings,
    plot_embedding_interactive
)

# Fonctions nécessaires à la pipeline
from scripts.prepare_data.load_embeddings import load_custom_embeddings  # ⬅️ si tu veux externaliser
from scripts.projection.build_cmd import build_command  # ⬅️ génère la ligne de commande
from scripts.projection.load_projection import load_projection  # ⬅️ récupère le CSV généré


ModuleNotFoundError: No module named 'scripts.prepare_data.load_embeddings'

In [None]:
data_type = "plm"       # "plm", "plm_aae", "pssm", "RFA_matrix"

# Annotation CSV
path_annotation_csv = "examples/globins/globin_colors_new.csv"

# Embeddings or fasta folder depending on type
embedding_paths = {
    "plm":       "embeddings/ankh_base_globins/",
    "plm_aae":   "embeddings/aae_embeddings/ankh_base_globins/",
    "pssm":      "examples/globins/glob.mfasta",
    "RFA_matrix": None
}

# Output bases
output_base = "grid_search_results/" + data_type
os.makedirs(output_base, exist_ok=True)

In [None]:
grid = {
    "knn":       [5],
    "gamma":     [0.5, 1, 2, 5],
    "sigma":     [1],
    "cospca":    [0],
    "epochs":    [500],
    "seed":      [2, 4],
    "distance":  ["cosine"],
}

In [None]:
def build_command(params, folder_output):
    if data_type in ["plm", "plm_aae"]:
        input_path = embedding_paths[data_type]
        plm_flag = "True"
        mid_output = folder_output + "/matrices/"
        os.makedirs(mid_output, exist_ok=True)

    elif data_type == "pssm":
        input_path = "globins_data/with_mfasta/"
        plm_flag = "False"
        mid_output = input_path

    elif data_type == "RFA_matrix":
        input_path = "None"
        plm_flag = "True"
        mid_output = "RFA_matrix/globins/with_plm_embeddings/"

    cmd = (
        f"python scripts/build_poincare_map/main.py "
        f"--input_path {input_path} "
        f"--output_path {folder_output} "
        f"--plm_embedding {plm_flag} "
        f"--matrices_output_path {mid_output} "
        f"--distlocal {params['distance']} "
        f"--gamma {params['gamma']} "
        f"--pca {params['cospca']} "
        f"--epochs {params['epochs']} "
        f"--seed {params['seed']} "
        f"--knn {params['knn']}"
    )
    return cmd


def load_projection(folder_output, params):
    proj_file = (
        f"{folder_output}/PM{params['knn']}sigma={params['sigma']:.2f}"
        f"gamma={params['gamma']:.2f}{params['distance']}pca={params['cospca']}"
        f"_seed{params['seed']}.csv"
    )
    return proj_file


In [None]:
print("=== Starting grid search ===")

df_annotation = pd.read_csv(path_annotation_csv)
all_scores = []   # stock global

# Generate all parameter combinations
keys = list(grid.keys())
values = list(grid.values())
combinations = list(itertools.product(*values))

for combo in combinations:
    params = dict(zip(keys, combo))

    # Create folder per combination
    folder_name = "_".join([f"{k}{v}" for k, v in params.items()])
    folder_output = os.path.join(output_base, folder_name)
    os.makedirs(folder_output, exist_ok=True)

    print(f"\n--- Running projection for {folder_name} ---")

    # -------------------------------------------------------
    # 1. RUN POINCARÉ MAP
    # -------------------------------------------------------
    cmd = build_command(params, folder_output)
    print("CMD:", cmd)
    os.system(cmd)

    # -------------------------------------------------------
    # 2. LOAD PROJECTION
    # -------------------------------------------------------
    projection_file = load_projection(folder_output, params)
    if not os.path.exists(projection_file):
        print(f"❌ Projection file not found: {projection_file}")
        continue

    df_emb = read_embeddings(projection_file, path_annotation_csv, withroot=False)
    poincare_coord = df_emb[["pm1", "pm2"]].values

    # Rebuild distance matrix
    model = PoincareMaps(poincare_coord)
    model.get_distances()

    # Cluster
    clusters = detect_cluster(model.distances, n_clusters=nc)
    df_emb["cluster"] = clusters

    # -------------------------------------------------------
    # 3. SCORING : FMS + ARI + QNX
    # -------------------------------------------------------

    # External scores
    ARI, FMS = get_scores(df_emb["Color_species"], df_emb["cluster"])

    # QNX scores
    Qlocal, Qglobal, Kmax, df_Q = get_quality_metrics(
        coord_high = features_embed,     # embeddings d’origine
        coord_low  = poincare_coord,     # projection Poincaré
        distance   = "poincare",
        setting    = "manifold",
        k_neighbours = 5
    )

    # score composite
    Score = 0.5 * Qglobal + 0.3 * Qlocal + 0.2 * FMS

    # -------------------------------------------------------
    # 4. SAVE SCORES PER PARAMETER FOLDER
    # -------------------------------------------------------
    df_folder = pd.DataFrame({
        "sigma": [params.get("sigma")],
        "gamma": [params.get("gamma")],
        "knn":   [params.get("knn")],
        "seed":  [params.get("seed")],
        "ARI": [ARI],
        "FMS": [FMS],
        "Qlocal": [Qlocal],
        "Qglobal": [Qglobal],
        "QNX_mean": [df_Q.Qnx.mean()],
        "Score": [Score]
    })

    df_folder.to_csv(os.path.join(folder_output, "scores.csv"), index=False)
    print(f"✔ Saved scores → {folder_output}/scores.csv")

    all_scores.append(df_folder)

    # -------------------------------------------------------
    # 5. PLOT PROJECTION (comme avant)
    # -------------------------------------------------------
    fig_title = f"Poincaré projection: {folder_name}"
    fig = plot_embedding_interactive(
        df_emb,
        labels_name="Color_species",
        color_palette=None,
        title=fig_title
    )

    fig.write_html(os.path.join(folder_output, "projection.html"))
    print(f"✔ Saved projection → {folder_output}/projection.html")

# ===========================================================
# 6. GLOBAL SCORES SUMMARY
# ===========================================================
df_all_scores = pd.concat(all_scores, ignore_index=True)
df_all_scores.sort_values("Score", ascending=False).to_csv(
    os.path.join(output_base, "all_scores_summary.csv"),
    index=False
)

print("\n=== Grid search complete ===")
print("Global score file saved → all_scores_summary.csv")


=== Starting grid search ===

--- Running projection for knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distanceminkowski ---
CMD: python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_globins/ --output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distanceminkowski --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distanceminkowski/matrices/ --distlocal minkowski --gamma 0.5 --pca 0 --epochs 500 --seed 2 --knn 5
CUDA: True
Random seed set as 2
252 proteins found in folder embeddings/ankh_base_globins/.
['230.pt', '53.pt', '6.pt', '150.pt', '236.pt', '188.pt', '235.pt', '21.pt', '117.pt', '71.pt', '193.pt', '52.pt', '95.pt', '3.pt', '103.pt', '239.pt', '240.pt', '104.pt', '175.pt', '159.pt', '158.pt', '141.pt', '155.pt', '197.pt', '163.pt', '8.pt', '245.pt', '67.pt', '37.pt', '26.pt', '142.pt', '14.pt', '94.pt', '78.pt', '125.pt', '56.pt', '152.pt', '160.pt', '238.pt', '136.pt

loss: 2.02307:   2%|▌                          | 11/500 [00:02<01:37,  5.01it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


Interrupting computations at epoch=11

loss = 2.273e+01
time = 0.045 min
❌ Projection file not found: grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distanceminkowski/PM5sigma=1.00gamma=0.50minkowskipca=0_seed2.csv

--- Running projection for knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine ---
CMD: python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_globins/ --output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine/matrices/ --distlocal cosine --gamma 0.5 --pca 0 --epochs 500 --seed 2 --knn 5
CUDA: True
Random seed set as 2
252 proteins found in folder embeddings/ankh_base_globins/.
['230.pt', '53.pt', '6.pt', '150.pt', '236.pt', '188.pt', '235.pt', '21.pt', '117.pt', '71.pt', '193.pt', '52.pt', '95.pt', '3.pt', '103.pt', '239.pt', '240.pt', '104.pt', '175.pt', '1

loss: 1.97403:  13%|███▍                       | 64/500 [00:07<00:25, 16.99it/s]