[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DSIMB/PoincareMSA/blob/master/PoincareMSA_colab.ipynb)

# Directory Parameters

In [43]:
# Move the kernel's working directory to the PoincareMSA checkout so that the
# relative paths used later in this notebook resolve from the project root.
# NOTE(review): this is a machine-specific absolute path — adjust it to your own clone.

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA



This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.



In [44]:
import sys

# Make the project checkout importable (the `scripts.*` imports below need it).
# The guard keeps re-running this cell from adding the same entry twice.
project_root = "/home/hugo/Bureau/PoincareMSA"
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

## Libraries

In [45]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import pairwise_distances
from scripts.build_poincare_map.model import PoincareEmbedding
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding_interactive
import os

# Parameters

In [46]:
# Run directory, relative to the project root (the working directory set at the
# top of the notebook). Both input files live in it, so both paths are derived
# from this single value instead of mixing a relative path with a
# machine-specific absolute one.
data_dir = "test_add_point_2"

# Existing Poincaré projection (CSV with the 'pm1' / 'pm2' coordinate columns).
path_embedding = os.path.join(data_dir, "PM5sigma=1.00gamma=1.00cosinepca=0_seed4.csv")

# Settings used below when rebuilding the model and the similarity target.
gamma = 1
sigma = 1
distance = "cosine"

# Feature table (one row per embedded point) used to compare the new point
# with the existing ones.
path_features = os.path.join(data_dir, "features.csv")

## Exemple : inférer et ajouter un nouveau point

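Ce bloc montre comment estimer la position d'un nouveau point dans une projection déjà calculée à l'aide de `PoincareEmbedding`. La méthode `infer_embedding_for_point` est mentionnée ici, mais les cellules ci-dessous utilisent `hyperbolic_barycenter` (barycentre hyperbolique des plus proches voisins) ; l'appel à `train_single_point` est conservé en commentaire.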

Guide rapide :
- Si tu disposes des features du nouveau point (`new_feat`) ou d'un vecteur de distances `new_to_existing`, le code calcule une cible (distribution) puis infère l'embedding.
- Le vecteur estimé est retourné et peut être ajouté au CSV d'embeddings existant.

Exécute la cellule suivante (code) après avoir défini `path_embedding` et, si disponible, `features` et `new_feat` ou `new_to_existing`.

**Exemple d'utilisation de `infer_embedding_for_point`** \
Ce code suppose que `path_embedding` est défini plus haut dans le notebook. \
Il utilise `features` + `new_feat` (ou `new_to_existing`) s'ils sont disponibles.

In [47]:
# 1) Load the existing projection.
# The CSV must provide the two embedding coordinates in columns 'pm1' and 'pm2'.
try:
    df = pd.read_csv(path_embedding)
except Exception as e:
    # Chain explicitly so the traceback reports the underlying pandas/OS error
    # as the direct cause of this failure.
    raise RuntimeError(f"Impossible de charger {path_embedding}: {e}") from e

if 'pm1' not in df.columns or 'pm2' not in df.columns:
    raise ValueError("Le fichier d'embeddings doit contenir les colonnes 'pm1' et 'pm2'.")

# Coordinates as an (N, dim) float array; N and dim are reused by later cells.
embs = df[['pm1', 'pm2']].to_numpy(dtype=float)
N, dim = embs.shape
print(f"Chargé {N} embeddings de dimension {dim}")

Chargé 497 embeddings de dimension 2


In [48]:
# 2) Load the feature table.
# The file has no header row; its first column is used as the row index.
try:
    features = pd.read_csv(path_features, index_col=0, header=None)
except Exception as e:
    # Chain explicitly so the traceback reports the underlying pandas/OS error
    # as the direct cause of this failure.
    raise RuntimeError(f"Impossible de charger {path_features}: {e}") from e

In [49]:
# 3) Build a model and inject the already-computed coordinates as its weights.
# Fall back to gamma=2 when no `gamma` has been defined in the notebook namespace.
model_gamma = gamma if 'gamma' in globals() else 2
model = PoincareEmbedding(
    size=N,
    dim=dim,
    gamma=model_gamma,
    lossfn='klSym',
    Qdist='laplace',
    cuda=False,
)
# Overwrite the weight table with the loaded embeddings (converted to float32);
# done under no_grad so the assignment is not tracked by autograd.
with torch.no_grad():
    model.lt.weight.data = torch.from_numpy(embs).to(torch.float32)

In [50]:
# Demo query point: take the feature row at position 70 (the one-row slice
# `iloc[70:71]`; the column-wise mean of a single row is that row itself)
# and scale it by 0.99.
query_rows = features.iloc[70:71]
new_feat = query_rows.mean().to_numpy() * 0.99

In [51]:
# Sanity check: display the shape of the query feature vector.
new_feat.shape

(4599,)

In [52]:
# Sanity check: display the shape of the feature table (rows x columns).
features.shape

(497, 4599)

In [53]:
# 4) Build the `target` vector (length N): a normalised similarity between the
#    new point and every already-embedded point.

# Safety check: the feature table must have been loaded by an earlier cell.
if 'features' not in globals():
    raise RuntimeError('`features` not loaded; run the features-loading cell before this one')

# Default: use the feature rows in their file order.
feats = np.asarray(features)

# If the embedding CSV carries an identifier column, reorder the feature rows so
# that row i of `feats` corresponds to row i of the embeddings.
if 'proteins_id' in df.columns:
    try:
        ids = df['proteins_id'].astype(str).tolist()
        feats_df = features.copy()
        # Test membership AND reindex on the same string-typed labels. Reindexing
        # on the untouched index (e.g. an integer one) with string ids would
        # match nothing and silently produce all-NaN rows.
        feats_df.index = feats_df.index.astype(str)
        if set(ids).issubset(feats_df.index):
            feats = feats_df.reindex(ids).to_numpy()
            print('Aligned features using df["proteins_id"] order')
        else:
            # Not able to align by id; warn the user but continue (sizes are checked below).
            print('Warning: could not align features by proteins_id index; proceeding with current ordering')
    except Exception as e:
        # Best effort: keep the file-order `feats` and report why alignment failed.
        print('Warning while attempting to align features by id:', e)

# The feature matrix must have exactly one row per embedding.
if feats.shape[0] != N:
    msg = (f'Number of feature rows ({feats.shape[0]}) does not match number of embeddings N ({N}).',
           'Common causes: mismatched ordering, missing/extra rows, index/header problems in features CSV.',
           'Check `features.shape`, `df.head()` and `features.index[:5]` to diagnose.')
    raise ValueError(' '.join(msg))

# Compute the target from the available inputs.
if 'new_feat' in globals():
    # Distances from the new point to every existing point, turned into weights
    # exp(-d / sigma) and normalised to sum to 1.
    new_feat_arr = np.asarray(new_feat).reshape(1, -1)
    d = pairwise_distances(new_feat_arr, feats, metric=distance if 'distance' in globals() else 'cosine').flatten()
    target = np.exp(-d / (sigma if 'sigma' in globals() else 1.0))
    total = target.sum()
    if total <= 0:
        # Every weight underflowed to zero: fall back to a uniform target.
        target = np.ones(N, dtype=float) / float(N)
    else:
        target = target / total
    print('Computed target from new_feat; shape=', target.shape)
else:
    # No query vector provided: uniform target (demo only).
    target = np.ones(N, dtype=float) / float(N)
    print('Aucun vecteur fourni : utilisation dune cible uniforme (démo)')

Computed target from new_feat; shape= (497,)


In [54]:
# Debug: report the shapes involved and make sure `target` ends up 1-D.
print("N embeddings (attendu) =", embs.shape[0])
print("features shape =", getattr(features, 'shape', None))

target_arr = np.asarray(target)
print("target raw shape =", target_arr.shape)

# Flatten defensively in case `target` arrived as a 2-D array.
target_arr = target_arr.ravel() if target_arr.ndim > 1 else target_arr
print("target final shape =", target_arr.shape)

# Keep the (possibly flattened) array as the target used by later cells.
target = target_arr

N embeddings (attendu) = 497
features shape = (497, 4599)
target raw shape = (497,)
target final shape = (497,)


In [55]:
# Disabled alternative for step 5 (kept for reference): infer the embedding of
# the new point with `model.train_single_point` instead of the hyperbolic
# barycenter used in the next cell.
# # 5) Infer the embedding for the new point
# new_emb = model.train_single_point(target, n_steps=10000, lr=0.02, init='random')

# print('Embedding inféré :', new_emb)
# new_emb = new_emb[0]

In [64]:
# 5) Infer the embedding of the new point as a weighted hyperbolic barycenter
#    of its most similar existing points.
k = 5  # number of neighbours (example value)

# Indices of the k largest target weights, in decreasing order of weight.
topk = np.argsort(-target)[:k]

# Neighbour coordinates (k x dim) and their weights (k,), renormalised to sum to 1.
neighbor_embs = torch.as_tensor(embs[topk], dtype=torch.float32)
neighbor_w = torch.as_tensor(target[topk], dtype=torch.float32)
neighbor_w /= neighbor_w.sum()

# The result is expected to be a torch tensor: it is detached and converted below.
barycenter = model.hyperbolic_barycenter(neighbor_embs, neighbor_w, n_steps=300, lr=0.05)

# Flat NumPy vector of coordinates for the new point.
new_emb = barycenter.detach().cpu().numpy().reshape(-1)

print("Embedding inféré (barycentre hyperbolique) :", new_emb)
print("Norme :", np.linalg.norm(new_emb))


Embedding inféré (barycentre hyperbolique) : [-0.41755393 -0.5065367 ]
Norme : 0.65645313


In [65]:
# Display the inferred coordinates of the new point.
new_emb

array([-0.41755393, -0.5065367 ], dtype=float32)

In [58]:
# 6) Append the new point to the projection and save the result next to the
#    original file.
new_row = {'pm1': float(new_emb[0]), 'pm2': float(new_emb[1]), 'proteins_id': 'NEW_POINT_1'}
# `pd.concat` returns a new frame, so `df` itself is left untouched.
df_new = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# Insert the suffix right before the file extension only. A plain
# `str.replace('.csv', ...)` would rewrite every '.csv' occurring in the path
# and, for a path without '.csv', return the input path unchanged — which would
# overwrite the original projection file.
embedding_root, embedding_ext = os.path.splitext(path_embedding)
out_path = f"{embedding_root}_with_new{embedding_ext}"

df_new.to_csv(out_path, index=False)
print(f'Projection augmentée sauvegardée dans {out_path}')

Projection augmentée sauvegardée dans test_add_point_2/PM5sigma=1.00gamma=1.00cosinepca=0_seed4_with_new.csv


In [59]:
# TODO: generate this automatically — for now the annotation file (with the
# extra row/label for the new point) is prepared and referenced by hand.
path_annotation_csv = '/home/hugo/Bureau/PoincareMSA/test_add_point_2/kinase_group_new.appended_custom_label.csv'

# Reuse the file written by the save step instead of a second hand-typed copy of
# its path, so the plot always shows the projection that was just saved.
# `out_path` is relative to the working directory; it is made absolute here.
path_embedding_new = os.path.abspath(out_path)

In [60]:
# Check the annotation file and collect the names of the available annotations.
# NOTE(review): `nb_seq` is not used in the visible part of this notebook; it is
# kept in case other cells rely on it.
nb_seq = 0

if path_annotation_csv:
    if os.path.isfile(path_annotation_csv):
        try:
            df_annotation = pd.read_csv(path_annotation_csv)
        except Exception as e:
            # Catch only real errors (a bare `except:` would also trap
            # KeyboardInterrupt/SystemExit) and keep the parser error as the cause.
            raise ValueError("Annotation file is not in .csv format.") from e

        # Add an id column when the file does not provide one.
        # NOTE(review): this only changes the in-memory frame; `path_annotation`
        # still points at the unmodified file on disk.
        if "proteins_id" not in df_annotation.columns:
            df_annotation.insert(0, "proteins_id", range(len(df_annotation)))
        path_annotation = path_annotation_csv

        print("\nAnnotation file correctly loaded.")
        annotation_names = list(df_annotation.columns)
        print(f"{len(annotation_names)} annotations found: {annotation_names}.")
    else:
        # Best effort: report the missing file without stopping the notebook.
        print(f"File {path_annotation_csv} not found.")


Annotation file correctly loaded.
20 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster', 'custom_label'].


In [61]:
# Prepare data for visualization: load the augmented projection file together
# with the annotation file into a single table used by the plotting cell.
# NOTE(review): `withroot=False` presumably means no root point is expected in
# the projection — confirm in `pplots_new.read_embeddings`.

df_embedding = read_embeddings(path_embedding_new, path_annotation, withroot=False)

# Reminder of the annotation columns found in the annotation file (if one was loaded):
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

20 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster', 'custom_label'].


In [62]:
# Custom colour palette: one colour per value of the label column used for colouring.
kinase_palette = {
    'NEW_POINT_1': 'red',
    "Other": "black",
    "Used": "blue",
}

In [63]:
# ---------------------------------------------------------------------------
# Plot options — edit the values in this section to change how the projection
# is coloured and annotated.
# ---------------------------------------------------------------------------
title = "Poincaré projection of the kinase 1 protein family without embeddings"

# Annotation column (from the annotation .csv file) used to colour the points.
labels_name = "custom_label"
# Optional second annotation column, and the classes (from either column) to
# annotate with text on the plot.
second_labels_name = ""
labels_text = []
show_text = False

# Custom colour palette; ignored unless `use_custom_palette` is True.
color_palette = kinase_palette
use_custom_palette = True

# ---------------------------------------------------------------------------
# Validate the options: an empty name means "not set" (None); any other name
# must be one of the available annotation columns.
# ---------------------------------------------------------------------------
if labels_name != "":
    if labels_name not in annotation_names:
        raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
else:
    labels_name = None

if second_labels_name != "":
    if second_labels_name not in annotation_names:
        raise NameError(f'"second_labels_name" {second_labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}')
else:
    second_labels_name = None

color_palette = color_palette if use_custom_palette else None

# ---------------------------------------------------------------------------
# Draw the interactive figure.
# ---------------------------------------------------------------------------
plot_options = dict(
    labels_name=labels_name,
    second_labels_name=second_labels_name,
    show_text=show_text,
    labels_text=labels_text,
    color_palette=color_palette,
    title=title,
    fontsize=11,
)
fig = plot_embedding_interactive(df_embedding, **plot_options)
fig.show()