## Directory parameters

In [31]:
# Update working directory

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA


In [32]:
import sys

# Add the project root to Python path
project_root = "/home/hugo/Bureau/PoincareMSA"
if project_root not in sys.path:
    sys.path.append(project_root)

## Librairies

In [33]:
import os
import itertools
import pandas as pd
import torch

from scripts.visualize_projection.pplots_new import (
    read_embeddings,
    plot_embedding_interactive
)

from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding_interactive
from scripts.build_poincare_map.poincare_maps import PoincareMaps
from scripts.build_poincare_map.embedding_quality_score import get_quality_metrics
from sklearn.cluster import AgglomerativeClustering, SpectralClustering
from sklearn.metrics import adjusted_rand_score, fowlkes_mallows_score
from scipy import stats
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler 


## Parameters

In [34]:
data_type = "plm"       # "plm", "pssm"                       # Ne fonctionnent pas pour l'instant : "RFA_matrix", "plm_aae"

# Annotation CSV
path_annotation_csv = "examples/kinases/kinase_group_new.csv"

# Embeddings or fasta folder depending on type
embedding_paths = {
    "plm":       "embeddings/ankh_base_kinases/",
    "plm_aae":   "embeddings/aae_embeddings/ankh_base_kinases/",
    "pssm":      "examples/kinases/glob.mfasta",
    "RFA_matrix": None
}

# Output bases
output_base = "grid_search_results/" + data_type
os.makedirs(output_base, exist_ok=True)

In [35]:
grid = {
    "knn":       [5],
    "gamma":     [0.5, 1, 2, 5],
    "sigma":     [1],
    "cospca":    [0],
    "epochs":    [500],
    "seed":      [0, 1, 2],
    "distance":  ["cosine"],
}

## Usefull functions

In [36]:
def get_scores(true_labels, pred_labels):
    ARI = adjusted_rand_score(true_labels, pred_labels)
    FMS = fowlkes_mallows_score(true_labels, pred_labels)
    return ARI, FMS

def detect_cluster(distances, n_clusters=2, clustering_name='agglomerative'):
    if clustering_name == 'agglomerative':
        clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='average').fit(distances)
        labels = clustering.labels_
    return labels


In [37]:
def build_command(params, folder_output, project_root=project_root):

    if data_type in ["plm", "plm_aae"]:
        input_path = embedding_paths[data_type]
        plm_flag = "True"
        mid_output = folder_output + "/matrices/"
        os.makedirs(mid_output, exist_ok=True)
    elif data_type == "pssm":
        # Prepare data with pssm preprocessing
        pssm_prep_path = "kinases_data/fasta0.9"
        if not os.path.exists(pssm_prep_path):
            os.system(f"bash scripts/prepare_data/create_projection.sh scripts/prepare_data examples/kinases/glob.mfasta kinases_data 0.9")

        input_path = pssm_prep_path
        plm_flag = "False"
        mid_output = folder_output + "/matrices/"
        os.makedirs(mid_output, exist_ok=True)
    elif data_type == "RFA_matrix":
        input_path = "None"
        plm_flag = "True"
        mid_output = "RFA_matrix/kinases/with_plm_embeddings/"

    cmd = (
        f"PYTHONPATH={project_root}:$PYTHONPATH python scripts/build_poincare_map/main.py "
        f"--input_path {input_path} "
        f"--output_path {folder_output} "
        f"--plm_embedding {plm_flag} "
        f"--matrices_output_path {mid_output} "
        f"--distlocal {params['distance']} "
        f"--gamma {params['gamma']} "
        f"--pca {params['cospca']} "
        f"--epochs {params['epochs']} "
        f"--seed {params['seed']} "
        f"--knn {params['knn']} "
    )
    return cmd

In [38]:
def load_projection(folder_output, params):
    possible_files = [
        f"{folder_output}/PM{params['knn']}sigma={params['sigma']:.2f}gamma={params['gamma']:.2f}{params['distance']}pca={params['cospca']}_seed{params['seed']}.csv",
        f"{folder_output}/matrices/PM{params['knn']}sigma={params['sigma']:.2f}gamma={params['gamma']:.2f}{params['distance']}pca={params['cospca']}_seed{params['seed']}.csv"
    ]

    for file in possible_files:
        if os.path.exists(file):
            return file

    return possible_files[0]

In [39]:
def compare_projections(results):
    # Convert results to DataFrame
    df_results = pd.DataFrame(results)

    # Create a new column that excludes the seed from the parameter string
    df_results["params_base"] = df_results["params"].apply(
        lambda x: "_".join([f"{k}{v}" for k, v in x.items() if k != "seed"])
    )

    # Add seed as a separate column for visualization
    df_results["seed"] = df_results["params"].apply(lambda x: x["seed"])

    # Calculate mean values for each parameter combination
    df_means = df_results.groupby("params_base").agg({
        "ARI": "mean",
        "FMS": "mean",
        "Qlocal": "mean",
        "Qglobal": "mean"
    }).reset_index()

    # Print available metrics
    print("\n=== Comparison of Projections ===")
    available_cols = ["params_base", "seed", "ARI", "FMS"]
    if "Qlocal" in df_results.columns and df_results["Qlocal"].notna().any():
        available_cols.append("Qlocal")
    if "Qglobal" in df_results.columns and df_results["Qglobal"].notna().any():
        available_cols.append("Qglobal")
    print(df_results[available_cols])

    # Print mean values
    print("\n=== Mean Values by Parameter Combination ===")
    mean_cols = ["params_base", "ARI", "FMS"]
    if "Qlocal" in df_means.columns:
        mean_cols.append("Qlocal")
    if "Qglobal" in df_means.columns:
        mean_cols.append("Qglobal")
    print(df_means[mean_cols])

    # Get unique parameter combinations and their indices
    unique_params = df_results["params_base"].unique()
    param_to_index = {param: idx for idx, param in enumerate(unique_params)}

    # Function to add mean lines to a plot
    def add_mean_lines(fig, metric_name):
        for param_base, mean_val in zip(df_means["params_base"], df_means[metric_name]):
            if param_base in param_to_index:
                x_pos = param_to_index[param_base]
                # Add a horizontal dotted line for the mean
                fig.add_shape(
                    type="line",
                    xref="x", yref="y",
                    x0=x_pos - 0.4, y0=mean_val,
                    x1=x_pos + 0.4, y1=mean_val,
                    line=dict(color="black", width=2, dash="dot"),
                    name=f"Mean {param_base}"
                )
                # Add a small text label for the mean value
                fig.add_annotation(
                    x=x_pos + 0.45,
                    y=mean_val + 0.01,
                    text=f"{mean_val:.3f}",
                    showarrow=False,
                    font=dict(size=10)
                )

    # Plot ARI if available
    if "ARI" in df_results.columns and df_results["ARI"].notna().any():
        fig_ari = px.strip(
            df_results,
            x="params_base",
            y="ARI",
            color="seed",
            title="ARI by parameter combination (colored by seed)"
        )
        add_mean_lines(fig_ari, "ARI")
        fig_ari.update_layout(
            xaxis_title="Parameter combination (excluding seed)",
            yaxis_title="ARI",
            legend_title="Seed",
            showlegend=True
        )
        fig_ari.show()

    # Plot FMS if available
    if "FMS" in df_results.columns and df_results["FMS"].notna().any():
        fig_fms = px.strip(
            df_results,
            x="params_base",
            y="FMS",
            color="seed",
            title="FMS by parameter combination (colored by seed)"
        )
        add_mean_lines(fig_fms, "FMS")
        fig_fms.update_layout(
            xaxis_title="Parameter combination (excluding seed)",
            yaxis_title="FMS",
            legend_title="Seed",
            showlegend=True
        )
        fig_fms.show()

    # Plot Qlocal if available
    if "Qlocal" in df_results.columns and df_results["Qlocal"].notna().any():
        fig_qlocal = px.strip(
            df_results,
            x="params_base",
            y="Qlocal",
            color="seed",
            title="Qlocal by parameter combination (colored by seed)"
        )
        add_mean_lines(fig_qlocal, "Qlocal")
        fig_qlocal.update_layout(
            xaxis_title="Parameter combination (excluding seed)",
            yaxis_title="Qlocal",
            legend_title="Seed",
            showlegend=True
        )
        fig_qlocal.show()

    # Plot Qglobal if available
    if "Qglobal" in df_results.columns and df_results["Qglobal"].notna().any():
        fig_qglobal = px.strip(
            df_results,
            x="params_base",
            y="Qglobal",
            color="seed",
            title="Qglobal by parameter combination (colored by seed)"
        )
        add_mean_lines(fig_qglobal, "Qglobal")
        fig_qglobal.update_layout(
            xaxis_title="Parameter combination (excluding seed)",
            yaxis_title="Qglobal",
            legend_title="Seed",
            showlegend=True
        )
        fig_qglobal.show()

    # Plot Qnx if available
    if "Qnx" in df_results.columns and any(len(qnx) > 0 for qnx in df_results["Qnx"]):
        # Create a palette for parameter combinations
        param_palette = dict(zip(
            df_results["params_base"].unique(),
            sns.color_palette("husl", n_colors=len(df_results["params_base"].unique())).as_hex()
        ))
        plt.figure(figsize=(14, 7))

        # Group by parameter combination
        for param_base, param_group in df_results.groupby("params_base"):
            # For each seed in this parameter group
            for _, row in param_group.iterrows():
                if isinstance(row["Qnx"], (list, np.ndarray)) and len(row["Qnx"]) > 0:
                    seed = row["seed"]
                    # Use a different line style for each seed
                    linestyle = '-' if seed == param_group["seed"].min() else '--'
                    sns.lineplot(
                        x=range(1, len(row["Qnx"]) + 1),
                        y=row["Qnx"],
                        color=param_palette[param_base],
                        linestyle=linestyle,
                        label=f"{param_base} (seed={seed})",
                        linewidth=2 if seed == param_group["seed"].min() else 1
                    )

        # Add mean lines for Qnx (average across seeds for each k value)
        qnx_means = {}
        max_k = 0
        for param_base, param_group in df_results.groupby("params_base"):
            for _, row in param_group.iterrows():
                if isinstance(row["Qnx"], (list, np.ndarray)):
                    max_k = max(max_k, len(row["Qnx"]))

        for k in range(1, max_k + 1):
            for param_base, param_group in df_results.groupby("params_base"):
                # Get all Qnx values for this k position
                qnx_vals = []
                for _, row in param_group.iterrows():
                    if isinstance(row["Qnx"], (list, np.ndarray)) and len(row["Qnx"]) >= k:
                        qnx_vals.append(row["Qnx"][k-1])
                if qnx_vals:
                    qnx_means[(param_base, k)] = np.mean(qnx_vals)

        # Plot the mean lines with horizontal dotted lines
        for (param_base, k), mean_val in qnx_means.items():
            plt.axhline(
                y=mean_val,
                color=param_palette[param_base],
                linestyle=':',
                linewidth=2
            )
            # Add a text label
            plt.text(
                x=max_k + 0.5,
                y=mean_val,
                s=f"{mean_val:.3f}",
                color=param_palette[param_base],
                va='center',
                fontsize=9
            )

        plt.xlabel("K")
        plt.ylabel("Qnx")
        plt.ylim([0, 1.1])
        plt.title("Qnx by parameter combination (different seeds)")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()


## Grid search & plot

In [40]:
print("=== Starting grid search ===")
df_annotation = pd.read_csv(path_annotation_csv)
# List to store the results of projections
results = []
# Define a custom color palette for visualization
kinase_palette = {-1 : "#c7c7c7", "OTHER": "#c7c7c7", "None" :"#c7c7c7", "NA" : "#c7c7c7", "Uncharacterized" : "#c7c7c7", "root": "#000000",
                  "TYR": "#bd065f", "CMGC": "#d5c203", "TKL": "#997e73","STE": "#80b412", # kinase groups
                  "CK1": "#0dbae9", "AGC": "#00bba1", "CAMK":  "#1f6ed4", "NEK": "#8ce4fa", "RGC":"#f59a62"}
# Generate all parameter combinations
keys = list(grid.keys())
values = list(grid.values())
combinations = list(itertools.product(*values))
for combo in combinations:
    params = dict(zip(keys, combo))
    # Create a folder for each combination
    folder_name = "_".join([f"{k}{v}" for k, v in params.items()])
    folder_output = os.path.join(output_base, folder_name)
    os.makedirs(folder_output, exist_ok=True)
    print(f"\n--- Running projection for {folder_name} ---")
    # 1. Run Poincaré map
    cmd = build_command(params, folder_output + "/")
    print("CMD:", cmd)
    os.system(cmd)
    # 2. Load projection
    projection_file = load_projection(folder_output, params)
    if not os.path.exists(projection_file):
        print(f"Projection file not found: {projection_file}")
        continue
    df_emb = read_embeddings(projection_file, path_annotation_csv, withroot=False)
    # 3. Generate and display interactive plot
    if data_type == "plm":
        title = f"Poincaré projection with embeddings: {folder_name}"
    elif data_type == "plm_aae":
        title = f"Poincaré projection with AAE embeddings: {folder_name}"
    elif data_type == "pssm":
        title = f"Poincaré projection without embeddings: {folder_name}"
    else:
        title = f"Poincaré projection with RFA matrix: {folder_name}"
    fig = plot_embedding_interactive(
        df_emb,
        labels_name="1_Group",
        color_palette=kinase_palette,
        title=title,
        fontsize=11
    )
    fig.show()
    # Save the plot
    output_html = os.path.join(folder_output, f"projection_{folder_name}.html")
    fig.write_html(output_html)
    print(f"✔ Saved projection to → {output_html}")


    # 4. Calculate quality metrics 
    true_labels = df_emb["1_Group"].values 
    coord_low = df_emb[["pm1", "pm2"]].values
    try:
        # Load original high-dimensional data with proper handling for PSSM
        if data_type in ["plm", "plm_aae"]:
            # Load PLM/AAE embeddings
            embedding_files = [f for f in os.listdir(embedding_paths[data_type]) if f.endswith('.pt')]
            features_list = []
            for file in sorted(embedding_files):
                try:
                    embedding_data = torch.load(os.path.join(embedding_paths[data_type], file))
                    if 'aae_embedding' in embedding_data:
                        emb = embedding_data['aae_embedding']
                    elif 'embedding' in embedding_data:
                        emb = embedding_data['embedding']
                    else:
                        emb = next(v for v in embedding_data.values() if isinstance(v, torch.Tensor))
                    if isinstance(emb, torch.Tensor):
                        emb = emb.numpy()
                    if len(emb.shape) == 1:
                        emb = emb.reshape(1, -1)
                    features_list.append(emb)
                except Exception as e:
                    print(f"Error loading {file}: {e}")
                    features_list.append(np.zeros((1, 768)))  # Default embedding size
            features = np.concatenate(features_list, axis=0)
        elif data_type == "pssm":
            # Load PSSM files 
            pssm_files = [f for f in os.listdir("kinases_data/fasta0.9/") if f.endswith('.aamtx')]
            features_list = []
            # Check if we found any PSSM files
            if not pssm_files:
                print(f"No PSSM files found in kinases_data/fasta0.9/")
                result = {
                    "params": params,
                    "ARI": None,
                    "FMS": None,
                    "Qlocal": None,
                    "Qglobal": None,
                    "Qnx": np.zeros(5),
                    "projection_file": projection_file,
                }
                results.append(result)
                continue  # Skip to next parameter combination

            # Load first file to determine expected size
            first_file = os.path.join("kinases_data/fasta0.9/", pssm_files[0])
            first_pssm = np.loadtxt(first_file)
            expected_size = first_pssm.size  # Total elements in the matrix

            for file in sorted(pssm_files):
                try:
                    pssm = np.loadtxt(os.path.join("kinases_data/fasta0.9/", file))
                    # Flatten and ensure consistent size
                    flattened = pssm.flatten()
                    if len(flattened) != expected_size:
                        print(f"Warning: File {file} has unexpected size {len(flattened)}, expected {expected_size}")
                        # Pad or truncate to match expected size
                        if len(flattened) < expected_size:
                            padded = np.zeros(expected_size)
                            padded[:len(flattened)] = flattened
                            features_list.append(padded)
                        else:
                            features_list.append(flattened[:expected_size])
                    else:
                        features_list.append(flattened)
                except Exception as e:
                    print(f"Error loading {file}: {e}")
                    features_list.append(np.zeros(expected_size))

            features = np.array(features_list)

            # Verify we have valid features
            if features.shape[0] == 0:
                print("No valid features loaded from PSSM files")
                result = {
                    "params": params,
                    "ARI": None,
                    "FMS": None,
                    "Qlocal": None,
                    "Qglobal": None,
                    "Qnx": np.zeros(5),
                    "projection_file": projection_file,
                }
                results.append(result)
                continue  # Skip to next parameter combination

        # Debug information
        print(f"DEBUG - Features shape: {features.shape}")
        print(f"DEBUG - Coord_low shape: {coord_low.shape}")
        print(f"DEBUG - True labels length: {len(true_labels)}")

        # Check for empty arrays
        if features.shape[0] == 0:
            print("Warning: Empty features array - cannot calculate metrics")
            result = {
                "params": params,
                "ARI": None,
                "FMS": None,
                "Qlocal": None,
                "Qglobal": None,
                "Qnx": np.zeros(5),
                "projection_file": projection_file,
            }
            results.append(result)
            continue  # Skip to next parameter combination

        # Ensure all arrays have the same number of samples
        min_samples = min(features.shape[0], coord_low.shape[0], len(true_labels))
        features = features[:min_samples]
        coord_low = coord_low[:min_samples]
        true_labels = true_labels[:min_samples]

        # Handle NaN/inf values
        features = np.nan_to_num(features)
        coord_low = np.nan_to_num(coord_low)

        # Standardize features only if we have valid data
        if features.shape[0] > 0 and features.shape[1] > 0:
            scaler = StandardScaler()
            features = scaler.fit_transform(features)
        else:
            print("Warning: Cannot standardize empty or 1D features array")
            # Reshape to 2D if needed
            if len(features.shape) == 1:
                features = features.reshape(-1, 1)

        # Calculate Poincaré distances
        model = PoincareMaps(coord_low)
        model.get_distances()
        D_proj = model.distances

        # Handle any NaN in distance matrix
        D_proj = np.nan_to_num(D_proj)

        # Calculate clustering metrics
        clusters = detect_cluster(D_proj, n_clusters=min(len(np.unique(true_labels)), len(true_labels)-1))
        ARI, FMS = get_scores(true_labels, clusters)

        # Calculate Q metrics with safety checks
        print("Calculating Q metrics...")
        # Ensure we have enough samples for k-neighbors
        k_neighbors = min(5, features.shape[0]-1)
        if k_neighbors < 1:
            k_neighbors = 1

        try:
            Qlocal, Qglobal, Kmax, df_Q = get_quality_metrics(
                coord_high=features,
                coord_low=coord_low,
                distance="poincare",
                setting="manifold",
                k_neighbours=k_neighbors
            )
            print(f"Successfully calculated Q metrics: Qlocal={Qlocal}, Qglobal={Qglobal}")
            # Store all results
            result = {
                "params": params,
                "ARI": ARI,
                "FMS": FMS,
                "Qlocal": Qlocal,
                "Qglobal": Qglobal,
                "Qnx": df_Q.Qnx.to_numpy() if hasattr(df_Q, 'Qnx') else np.zeros(k_neighbors),
                "projection_file": projection_file,
            }
        except Exception as e:
            print(f"Error calculating Q metrics: {e}")
            result = {
                "params": params,
                "ARI": ARI,
                "FMS": FMS,
                "Qlocal": None,
                "Qglobal": None,
                "Qnx": np.zeros(k_neighbors),
                "projection_file": projection_file,
            }
        results.append(result)
    except Exception as e:
        print(f"Error calculating metrics: {e}")
        import traceback
        traceback.print_exc()
        # Store partial results if metrics calculation fails
        result = {
            "params": params,
            "ARI": None,
            "FMS": None,
            "Qlocal": None,
            "Qglobal": None,
            "Qnx": np.zeros(5),
            "projection_file": projection_file,
        }
        results.append(result)
        
print("\n=== Grid search complete ===")

=== Starting grid search ===

--- Running projection for knn5_gamma0.5_sigma1_cospca0_epochs500_seed0_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed0_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed0_distancecosine//matrices/ --distlocal cosine --gamma 0.5 --pca 0 --epochs 500 --seed 0 --knn 5 
CUDA: True
Random seed set as 0
497 proteins found in folder embeddings/ankh_base_kinases/.
['416.pt', '379.pt', '458.pt', '230.pt', '53.pt', '450.pt', '367.pt', '6.pt', '150.pt', '451.pt', '259.pt', '381.pt', '236.pt', '188.pt', '235.pt', '21.pt', '117.pt', '71.pt', '193.pt', '52.pt', '399.pt', '371.pt', '95.pt', '3.pt', '377.pt', '330.pt', '443.pt', '327.pt', '444.pt', '284.pt', '391.pt', '345.pt', '103.pt', '492.pt', '258

loss: 5.85599:  40%|██████████▍               | 200/500 [00:29<00:48,  6.18it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 3.80987:  80%|████████████████████▊     | 400/500 [00:56<00:11,  8.77it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 3.04060: 100%|██████████████████████████| 500/500 [01:08<00:00,  7.28it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 68.73 sec

loss = 3.041e+00
time = 1.152 min


✔ Saved projection to → grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed0_distancecosine/projection_knn5_gamma0.5_sigma1_cospca0_epochs500_seed0_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.06138322142104352, Qglobal=0.5304088865039633

--- Running projection for knn5_gamma0.5_sigma1_cospca0_epochs500_seed1_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed1_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed1_distancecosine//matrices/ --distlocal cosine --gamma 0.5 --pca 0 --epochs 500 --seed 1 --knn 5 
CUDA: True
Random seed set as 1
497 proteins found in 

loss: 5.33527:  40%|██████████▍               | 200/500 [00:29<00:38,  7.87it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 3.28535:  80%|████████████████████▊     | 400/500 [00:58<00:15,  6.65it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 2.55875: 100%|██████████████████████████| 500/500 [01:10<00:00,  7.08it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 70.62 sec

loss = 2.559e+00
time = 1.183 min


✔ Saved projection to → grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed1_distancecosine/projection_knn5_gamma0.5_sigma1_cospca0_epochs500_seed1_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.06636878111286848, Qglobal=0.5331627650530614

--- Running projection for knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine//matrices/ --distlocal cosine --gamma 0.5 --pca 0 --epochs 500 --seed 2 --knn 5 
CUDA: True
Random seed set as 2
497 proteins found in 

loss: 5.49273:  40%|██████████▍               | 200/500 [00:29<00:34,  8.71it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 3.58104:  80%|████████████████████▊     | 400/500 [00:58<00:17,  5.77it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 2.89784: 100%|██████████████████████████| 500/500 [01:10<00:00,  7.09it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 70.50 sec

loss = 2.898e+00
time = 1.181 min


Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


✔ Saved projection to → grid_search_results/plm/knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine/projection_knn5_gamma0.5_sigma1_cospca0_epochs500_seed2_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.06282976500005318, Qglobal=0.5333399923653673

--- Running projection for knn5_gamma1_sigma1_cospca0_epochs500_seed0_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed0_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed0_distancecosine//matrices/ --distlocal cosine --gamma 1 --pca 0 --epochs 500 --seed 0 --knn 5 
CUDA: True
Random seed set as 0
497 proteins found in folder e

loss: 2.11158:  40%|██████████▍               | 200/500 [00:25<00:36,  8.15it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.85628:  80%|████████████████████▊     | 400/500 [00:51<00:11,  8.72it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.67943: 100%|██████████████████████████| 500/500 [01:06<00:00,  7.48it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 66.82 sec

loss = 6.794e-01
time = 1.120 min


✔ Saved projection to → grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed0_distancecosine/projection_knn5_gamma1_sigma1_cospca0_epochs500_seed0_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07293363494319229, Qglobal=0.5309209049579393

--- Running projection for knn5_gamma1_sigma1_cospca0_epochs500_seed1_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed1_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed1_distancecosine//matrices/ --distlocal cosine --gamma 1 --pca 0 --epochs 500 --seed 1 --knn 5 
CUDA: True
Random seed set as 1
497 proteins found in folder embed

loss: 1.94123:  40%|██████████▍               | 200/500 [00:23<00:32,  9.17it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.66193:  80%|████████████████████▊     | 400/500 [00:49<00:11,  8.93it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.48927: 100%|██████████████████████████| 500/500 [01:04<00:00,  7.73it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 64.65 sec

loss = 4.893e-01
time = 1.084 min


Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


✔ Saved projection to → grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed1_distancecosine/projection_knn5_gamma1_sigma1_cospca0_epochs500_seed1_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07621923040898833, Qglobal=0.5319166808877739

--- Running projection for knn5_gamma1_sigma1_cospca0_epochs500_seed2_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed2_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed2_distancecosine//matrices/ --distlocal cosine --gamma 1 --pca 0 --epochs 500 --seed 2 --knn 5 
CUDA: True
Random seed set as 2
497 proteins found in folder embed

loss: 2.13122:  40%|██████████▍               | 200/500 [00:25<00:34,  8.77it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.80807:  80%|████████████████████▊     | 400/500 [00:53<00:11,  8.73it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.60109: 100%|██████████████████████████| 500/500 [01:07<00:00,  7.38it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 67.71 sec

loss = 6.011e-01
time = 1.135 min


✔ Saved projection to → grid_search_results/plm/knn5_gamma1_sigma1_cospca0_epochs500_seed2_distancecosine/projection_knn5_gamma1_sigma1_cospca0_epochs500_seed2_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07472969148184803, Qglobal=0.5335820137196443

--- Running projection for knn5_gamma2_sigma1_cospca0_epochs500_seed0_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed0_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed0_distancecosine//matrices/ --distlocal cosine --gamma 2 --pca 0 --epochs 500 --seed 0 --knn 5 
CUDA: True
Random seed set as 0
497 proteins found in folder embed

loss: 0.87935:  40%|██████████▍               | 200/500 [00:24<00:49,  6.10it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.67266:  80%|████████████████████▊     | 400/500 [00:53<00:14,  6.92it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.64078: 100%|██████████████████████████| 500/500 [01:06<00:00,  7.49it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 66.75 sec

loss = 6.408e-01
time = 1.119 min


✔ Saved projection to → grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed0_distancecosine/projection_knn5_gamma2_sigma1_cospca0_epochs500_seed0_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07074026385610174, Qglobal=0.525185836422304

--- Running projection for knn5_gamma2_sigma1_cospca0_epochs500_seed1_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed1_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed1_distancecosine//matrices/ --distlocal cosine --gamma 2 --pca 0 --epochs 500 --seed 1 --knn 5 
CUDA: True
Random seed set as 1
497 proteins found in folder embedd

loss: 0.59674:  40%|██████████▍               | 200/500 [00:24<00:33,  8.83it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.38017:  80%|████████████████████▊     | 400/500 [00:49<00:15,  6.35it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.35440: 100%|██████████████████████████| 500/500 [01:01<00:00,  8.13it/s]


PM computed in 61.48 sec

loss = 3.544e-01
time = 1.031 min


Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


✔ Saved projection to → grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed1_distancecosine/projection_knn5_gamma2_sigma1_cospca0_epochs500_seed1_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07119373273081235, Qglobal=0.529877212298107

--- Running projection for knn5_gamma2_sigma1_cospca0_epochs500_seed2_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed2_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed2_distancecosine//matrices/ --distlocal cosine --gamma 2 --pca 0 --epochs 500 --seed 2 --knn 5 
CUDA: True
Random seed set as 2
497 proteins found in folder embedd

loss: 0.54171:  40%|██████████▍               | 200/500 [00:23<00:34,  8.77it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.34625:  80%|████████████████████▊     | 400/500 [00:54<00:12,  8.32it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.31474: 100%|██████████████████████████| 500/500 [01:08<00:00,  7.34it/s]


PM computed in 68.16 sec

loss = 3.147e-01
time = 1.142 min


Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


✔ Saved projection to → grid_search_results/plm/knn5_gamma2_sigma1_cospca0_epochs500_seed2_distancecosine/projection_knn5_gamma2_sigma1_cospca0_epochs500_seed2_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07018584306542827, Qglobal=0.5312034555316013

--- Running projection for knn5_gamma5_sigma1_cospca0_epochs500_seed0_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed0_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed0_distancecosine//matrices/ --distlocal cosine --gamma 5 --pca 0 --epochs 500 --seed 0 --knn 5 
CUDA: True
Random seed set as 0
497 proteins found in folder embed

loss: 0.87654:  40%|██████████▍               | 200/500 [00:27<00:36,  8.33it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.84695:  80%|████████████████████▊     | 400/500 [00:56<00:11,  8.44it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.83617: 100%|██████████████████████████| 500/500 [01:08<00:00,  7.26it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 68.90 sec

loss = 8.362e-01
time = 1.154 min


✔ Saved projection to → grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed0_distancecosine/projection_knn5_gamma5_sigma1_cospca0_epochs500_seed0_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07307598943485694, Qglobal=0.526400135511986

--- Running projection for knn5_gamma5_sigma1_cospca0_epochs500_seed1_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed1_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed1_distancecosine//matrices/ --distlocal cosine --gamma 5 --pca 0 --epochs 500 --seed 1 --knn 5 
CUDA: True
Random seed set as 1
497 proteins found in folder embedd

loss: 0.56637:  40%|██████████▍               | 200/500 [00:24<00:33,  8.86it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.48300:  80%|████████████████████▊     | 400/500 [00:56<00:13,  7.15it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.26025: 100%|██████████████████████████| 500/500 [01:08<00:00,  7.30it/s]


PM computed in 68.50 sec

loss = 2.603e-01
time = 1.148 min


Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


✔ Saved projection to → grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed1_distancecosine/projection_knn5_gamma5_sigma1_cospca0_epochs500_seed1_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.05872556769186548, Qglobal=0.5241381229232911

--- Running projection for knn5_gamma5_sigma1_cospca0_epochs500_seed2_distancecosine ---
CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path embeddings/ankh_base_kinases/ --output_path grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed2_distancecosine/ --plm_embedding True --matrices_output_path grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed2_distancecosine//matrices/ --distlocal cosine --gamma 5 --pca 0 --epochs 500 --seed 2 --knn 5 
CUDA: True
Random seed set as 2
497 proteins found in folder embed

loss: 0.58958:  40%|██████████▍               | 200/500 [00:25<00:51,  5.82it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.54104:  80%|████████████████████▊     | 400/500 [00:52<00:14,  6.78it/s]Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.
loss: 0.52107: 100%|██████████████████████████| 500/500 [01:07<00:00,  7.42it/s]
Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


PM computed in 67.36 sec

loss = 5.211e-01
time = 1.129 min


Ignoring fixed x limits to fulfill fixed data aspect with adjustable data limits.


✔ Saved projection to → grid_search_results/plm/knn5_gamma5_sigma1_cospca0_epochs500_seed2_distancecosine/projection_knn5_gamma5_sigma1_cospca0_epochs500_seed2_distancecosine.html
DEBUG - Features shape: (136980, 768)
DEBUG - Coord_low shape: (497, 2)
DEBUG - True labels length: 497
Calculating Q metrics...
Successfully calculated Q metrics: Qlocal=0.07007226549120772, Qglobal=0.5291702140344852

=== Grid search complete ===


## Results

In [41]:
# Call comparison function
compare_projections(results)


=== Comparison of Projections ===
                                          params_base  seed       ARI  \
0   knn5_gamma0.5_sigma1_cospca0_epochs500_distanc...     0  0.692297   
1   knn5_gamma0.5_sigma1_cospca0_epochs500_distanc...     1  0.697516   
2   knn5_gamma0.5_sigma1_cospca0_epochs500_distanc...     2  0.700811   
3   knn5_gamma1_sigma1_cospca0_epochs500_distancec...     0  0.582911   
4   knn5_gamma1_sigma1_cospca0_epochs500_distancec...     1  0.531474   
5   knn5_gamma1_sigma1_cospca0_epochs500_distancec...     2  0.663168   
6   knn5_gamma2_sigma1_cospca0_epochs500_distancec...     0  0.554402   
7   knn5_gamma2_sigma1_cospca0_epochs500_distancec...     1  0.555161   
8   knn5_gamma2_sigma1_cospca0_epochs500_distancec...     2  0.663647   
9   knn5_gamma5_sigma1_cospca0_epochs500_distancec...     0  0.683618   
10  knn5_gamma5_sigma1_cospca0_epochs500_distancec...     1  0.695707   
11  knn5_gamma5_sigma1_cospca0_epochs500_distancec...     2  0.622473   

         FMS   