[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DSIMB/PoincareMSA/blob/master/PoincareMSA_colab.ipynb)

# Directory Parameters

In [384]:
# Update working directory so that the relative paths used later resolve from the project root.
# NOTE(review): absolute, machine-specific path — edit it to match your own checkout.

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA



This is now an optional IPython functionality, setting dhist requires you to install the `pickleshare` library.



In [385]:
import sys

# Register the project root on the import path (once) so that the
# `scripts.*` packages imported further down can be resolved.
project_root = "/home/hugo/Bureau/PoincareMSA"
if sys.path.count(project_root) == 0:
    sys.path += [project_root]

## Libraries

In [386]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import pairwise_distances
from scripts.build_poincare_map.model import PoincareEmbedding
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding_interactive
import os

# Parameters

In [387]:
# CSV holding the existing projection; the loading cell requires 'pm1' and 'pm2' columns.
path_embedding = "test_add_point_2/PM5sigma=1.00gamma=1.00cosinepca=0_seed4.csv"
# Passed as `gamma` when the PoincareEmbedding model is built.
gamma = 1
# Scale used when turning feature distances into target weights: exp(-d / sigma).
sigma = 1
# Metric name handed to sklearn's `pairwise_distances`.
distance = "cosine"

# Feature table; it is read with the first column as index and without a header row.
# NOTE(review): absolute, machine-specific path (unlike `path_embedding`, which is relative).
path_features = "/home/hugo/Bureau/PoincareMSA/test_add_point_2/features.csv"

## Exemple : inférer et ajouter un nouveau point

Ce bloc montre comment utiliser la méthode `infer_embedding_for_point` de `PoincareEmbedding` pour estimer la position d'un nouveau point dans une projection déjà calculée.

Guide rapide :
- Si tu disposes des features du nouveau point (`new_feat`) ou d'un vecteur de distances `new_to_existing`, le code calcule une cible (distribution) puis infère l'embedding.
- Le vecteur estimé est retourné et peut être ajouté au CSV d'embeddings existant.

Exécute la cellule suivante (code) après avoir défini `path_embedding` et, si disponible, `features` et `new_feat` ou `new_to_existing`.

**Exemple d'utilisation de `infer_embedding_for_point`** \
Ce code suppose que `path_embedding` est défini plus haut dans le notebook. \
Il utilise `features` + `new_feat`, ou `new_to_existing`, s'ils sont disponibles.

In [388]:
# 1) Load the existing projection
try:
    df = pd.read_csv(path_embedding)
except Exception as err:
    raise RuntimeError(f"Impossible de charger {path_embedding}: {err}")

# Both coordinate columns are required to rebuild the embedding matrix.
if not {'pm1', 'pm2'}.issubset(df.columns):
    raise ValueError("Le fichier d'embeddings doit contenir les colonnes 'pm1' et 'pm2'.")

# One row per embedded point, one column per coordinate.
embs = df[['pm1', 'pm2']].to_numpy(dtype=float)
N, dim = embs.shape
print(f"Chargé {N} embeddings de dimension {dim}")

Chargé 497 embeddings de dimension 2


In [389]:
# 2) Load the feature table (first column is the row index, the file has no header row)
try:
    features = pd.read_csv(path_features, header=None, index_col=0)
except Exception as err:
    raise RuntimeError(f"Impossible de charger {path_features}: {err}")

In [390]:
# 3) Build a model and inject the existing weights
# NOTE(review): the fallback used when `gamma` is undefined is 2 here but 1 in the later
# cell that rebuilds the model (step 5a) — confirm which default is intended.
model = PoincareEmbedding(size=N, dim=dim, gamma=gamma if 'gamma' in globals() else 2, lossfn='klSym', Qdist='laplace', cuda=False)
# Overwrite the weights of `model.lt` with the coordinates loaded from the projection CSV.
with torch.no_grad():
    model.lt.weight.data = torch.from_numpy(embs).float()

In [391]:
# Build a synthetic feature vector for the new point: row 70 of the feature table
# (the mean over the single-row slice 70:71 is that row itself), scaled by 0.99.
new_feat = 0.99*features.iloc[70:71].mean().to_numpy()

In [392]:
# Sanity check: shape of the new point's feature vector.
new_feat.shape

(4599,)

In [393]:
# Sanity check: shape of the feature table (its row count is compared with N in the next step).
features.shape

(497, 4599)

In [394]:
# 4) Build the `target` vector (length N) for the new point
import numpy as np

# Safety check: the feature table must have been loaded by an earlier cell.
if 'features' not in globals():
    raise RuntimeError('`features` not loaded; run the features-loading cell before this one')

# Default: use the feature rows in the order they were read.
feats = np.asarray(features)

# If the embedding CSV contains an identifier column, try to reorder the feature rows to match it.
if 'proteins_id' in df.columns:
    try:
        ids = df['proteins_id'].astype(str).tolist()
        feats_df = features.copy()
        # Compare AND reindex on string labels. Checking membership on string-cast labels but
        # reindexing on the original (e.g. integer) index would silently produce all-NaN rows.
        feats_df.index = feats_df.index.astype(str)
        if set(ids) <= set(feats_df.index):
            feats = feats_df.reindex(ids).to_numpy()
            print("Aligned features using df['proteins_id'] index")
        else:
            # Not able to align by id; warn the user but continue (sizes are checked below).
            print('Warning: could not align features by proteins_id index; proceeding with current ordering')
    except Exception as e:
        print('Warning while attempting to align features by id:', e)

# The distance vector computed below must have exactly one entry per existing embedding.
if feats.shape[0] != N:
    raise ValueError(
        f'Number of feature rows ({feats.shape[0]}) does not match number of embeddings N ({N}). '
        'Common causes: mismatched ordering, missing/extra rows, index/header problems in features CSV. '
        'Check `features.shape`, `df.head()` and `features.index[:5]` to diagnose.'
    )

# Compute the target distribution from the available inputs.
if 'new_feat' in globals():
    new_feat_arr = np.asarray(new_feat).reshape(1, -1)
    # Distance from the new point to every existing point, in feature space.
    d = pairwise_distances(new_feat_arr, feats, metric=distance if 'distance' in globals() else 'cosine').flatten()
    # Turn distances into similarity weights, then normalise them to sum to 1.
    target = np.exp(-d / (sigma if 'sigma' in globals() else 1.0))
    if target.sum() <= 0:
        # Degenerate case (all weights underflowed to 0): fall back to a uniform target.
        target = np.ones(N, dtype=float) / float(N)
    else:
        target = target / target.sum()
    print('Computed target from new_feat; shape=', target.shape)
else:
    # No input vector available: use a uniform target (demo only).
    target = np.ones(N, dtype=float) / float(N)
    print('Aucun vecteur fourni : utilisation dune cible uniforme (démo)')

Computed target from new_feat; shape= (497,)


In [395]:
# Debug: report the shapes involved and make sure `target` is one-dimensional
print("N embeddings (attendu) =", embs.shape[0])
print("features shape =", getattr(features, 'shape', None))

t = np.asarray(target)
print("target raw shape =", t.shape)

# Flatten only when the array has more than one axis; otherwise keep it untouched.
t = t.ravel() if t.ndim > 1 else t
print("target final shape =", t.shape)

# Rebind `target` to the (possibly flattened) array.
target = t

N embeddings (attendu) = 497
features shape = (497, 4599)
target raw shape = (497,)
target final shape = (497,)


In [396]:
# 5a) Reload the edited model module and rebuild the model with the existing weights
import importlib
import scripts.build_poincare_map.model as model_mod
importlib.reload(model_mod)
# Re-import after the reload so the name is bound to the class from the reloaded module.
from scripts.build_poincare_map.model import PoincareEmbedding
import torch

# Rebuild model with the same weights (variable `embs` should already exist from earlier cells)
model = PoincareEmbedding(size=N, dim=dim, gamma=gamma if 'gamma' in globals() else 1, lossfn='klSym', Qdist='laplace', cuda=False)
with torch.no_grad():
    model.lt.weight.data = torch.from_numpy(embs).float()
# Report whether the rebuilt model exposes `hyperbolic_barycenter`, which the next step calls.
print('Module rechargé ; model.hyperbolic_barycenter exists =', hasattr(model, 'hyperbolic_barycenter'))

Module rechargé ; model.hyperbolic_barycenter exists = True


In [397]:
# 5b) Recompute the target like training, compute a barycenter warm start,
#     run the inference variants and compare each result with the original embedding.
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import torch

# Row of `features` (and of `embs`) whose position is re-inferred as a sanity check.
idx = 1

# Recompute `target` consistently with training: same metric, weights exp(-gamma * d).
feats = np.asarray(features)
new_feat_arr = np.asarray(features.iloc[idx:idx+1]).reshape(1, -1)
d = pairwise_distances(new_feat_arr, feats, metric=distance if 'distance' in globals() else 'cosine').flatten()
target = np.exp(- (model.gamma if hasattr(model, 'gamma') else gamma) * d)
target = target / target.sum()
print('target.shape =', target.shape, 'sum=', target.sum())

# Barycenter warm start computed from the k highest-weight neighbours.
# NOTE(review): k is currently 1, so the barycenter is built from a single neighbour;
# raise the first argument of `min` to average over several neighbours.
k = min(1, len(target))
topk = np.argsort(-target)[:k]
neighbor_embs = torch.tensor(embs[topk], dtype=torch.float32)
neighbor_w = torch.tensor(target[topk], dtype=torch.float32)
neighbor_w = neighbor_w / neighbor_w.sum()  # renormalise the weights of the selected neighbours

x0 = model.hyperbolic_barycenter(neighbor_embs, neighbor_w, n_steps=2000, tol=1e-7, alpha=1.0)
x0_np = x0.detach().cpu().numpy().reshape(-1)
print('Barycenter init (norm) =', np.linalg.norm(x0_np))

# 1) Infer with barycenter init
new_emb_bary = model.infer_embedding_for_point(target, n_steps=500, lr=0.05, init='barycenter')

# 2) Infer with explicit init_vec (the barycenter) — alternative call
new_emb_bary2 = model.infer_embedding_for_point(target, n_steps=500, lr=0.05, init='random', init_vec=x0_np)

# 3) Option: refine with train_single_point using local attraction (strong lambda_local)
new_emb_train, losses = model.train_single_point(target, n_steps=500, lr=0.02, init='barycenter', k=30, lambda_local=5.0)


def _as_point(vec):
    """Wrap a coordinate vector as a float32 tensor with two leading singleton axes."""
    return torch.tensor(vec, dtype=torch.float32).unsqueeze(0).unsqueeze(0)


# `model.dist` is an autograd Function class: call `.apply` on the class itself.
# Instantiating it first (`model.dist()`) is deprecated and emits a PyTorch warning per call site.
poincare_dist = model.dist.apply

# Compare with original embedding (assume original is embs[idx])
orig = _as_point(embs[idx])
b1 = _as_point(new_emb_bary)
b2 = _as_point(new_emb_bary2)
b3 = _as_point(new_emb_train)

dist_orig_b1 = poincare_dist(b1, orig).item()
dist_orig_b2 = poincare_dist(b2, orig).item()
dist_orig_b3 = poincare_dist(b3, orig).item()

print('Poincaré dist original <-> infer(bary)   :', dist_orig_b1)
print('Poincaré dist original <-> infer(init_vec):', dist_orig_b2)
print('Poincaré dist original <-> train_single  :', dist_orig_b3)

# Also compute distance between barycenter init and original (diagnostic)
bary = x0.detach().cpu().unsqueeze(0)
dist_bary_orig = poincare_dist(bary, orig).item()
print('Poincaré dist original <-> barycenter init :', dist_bary_orig)

print('If these distances are small (near 0) inference reproduces original placement.')
print('If distances remain large, tune: use the exact training gamma/Qdist, increase n_steps, lower lr, or increase lambda_local in train_single_point.')

target.shape = (497,) sum= 1.0
Barycenter init (norm) = 0.4843409
Poincaré dist original <-> infer(bary)   : 0.964045524597168
Poincaré dist original <-> infer(init_vec): 0.9640513062477112
Poincaré dist original <-> train_single  : 0.026352813467383385
Poincaré dist original <-> barycenter init : 0.0
If these distances are small (near 0) inference reproduces original placement.
If distances remain large, tune: use the exact training gamma/Qdist, increase n_steps, lower lr, or increase lambda_local in train_single_point.



<class 'scripts.build_poincare_map.model.PoincareDistance'> should not be instantiated. Methods on autograd functionsare all static, so you should invoke them on the class itself. Instantiating an autograd function will raise an error in a future version of PyTorch.


<class 'scripts.build_poincare_map.model.PoincareDistance'> should not be instantiated. Methods on autograd functionsare all static, so you should invoke them on the class itself. Instantiating an autograd function will raise an error in a future version of PyTorch.


<class 'scripts.build_poincare_map.model.PoincareDistance'> should not be instantiated. Methods on autograd functionsare all static, so you should invoke them on the class itself. Instantiating an autograd function will raise an error in a future version of PyTorch.


<class 'scripts.build_poincare_map.model.PoincareDistance'> should not be instantiated. Methods on autograd functionsare all static, so you should invoke them on the class itself. Instantiatin

In [398]:
# Inspect the barycenter coordinates computed in the previous step.
x0_np

array([-0.30219817,  0.37850016], dtype=float32)

In [399]:
# Inspect the first coordinate of the embedding inferred with init='barycenter'.
new_emb_bary[0]

np.float32(-0.15540653)

In [400]:
# 6) Append the new point to the projection and save the result

# Candidate coordinates produced by step 5b; choose one with `new_point_source`.
new_point_candidates = {
    'barycenter': x0_np,                  # 0) barycenter only
    'infer_barycenter': new_emb_bary,     # 1) inference with barycenter init
    'infer_init_vec': new_emb_bary2,      # 2) inference with explicit init_vec (the barycenter)
    'train_single_point': new_emb_train,  # 3) refinement with train_single_point (local attraction)
}
new_point_source = 'barycenter'

new_point = new_point_candidates[new_point_source]
# 999999 is a placeholder identifier for the added point.
new_row = {'pm1': float(new_point[0]), 'pm2': float(new_point[1]), 'proteins_id': 999999}

# `pd.concat` returns a new frame, so the loaded projection `df` is left untouched.
df_new = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

# Derive the output name from the extension only. A plain `str.replace('.csv', ...)` returns the
# input path unchanged when it does not contain '.csv', which would overwrite the source file.
out_root, out_ext = os.path.splitext(path_embedding)
out_path = f"{out_root}_with_new{out_ext or '.csv'}"
df_new.to_csv(out_path, index=False)
print(f'Projection augmentée sauvegardée dans {out_path}')


In [401]:
# TODO: generate these paths automatically — for now they are set by hand.
# NOTE(review): absolute, machine-specific paths.
path_annotation_csv = '/home/hugo/Bureau/PoincareMSA/test_add_point_2/kinase_group_new.appended_custom_label.csv'
# Should point at the '_with_new.csv' projection written by the save step (6).
path_embedding_new = '/home/hugo/Bureau/PoincareMSA/test_add_point_2/PM5sigma=1.00gamma=1.00cosinepca=0_seed4_with_new.csv'

In [402]:
# Check the annotation file
# mfasta
nb_seq = 0

if path_annotation_csv:
    if os.path.isfile(path_annotation_csv):
        try:
            df_annotation = pd.read_csv(path_annotation_csv)
        except Exception as err:
            # Catch `Exception` rather than using a bare `except`, so KeyboardInterrupt/SystemExit
            # still propagate, and chain the cause so the parsing error stays visible.
            raise ValueError("Annotation file is not in .csv format.") from err

        # Add an id column when the file does not provide one.
        # NOTE(review): the column is only added to the in-memory frame; `path_annotation`
        # still points at the unmodified file on disk — confirm this is what the reader expects.
        if "proteins_id" not in df_annotation.columns:
            print('proteins_id not in df_annotation.columns')
            df_annotation.insert(0, "proteins_id", range(len(df_annotation)))
        path_annotation = path_annotation_csv

        print("\nAnnotation file correctly loaded.")
        annotation_names = list(df_annotation.columns)
        print(f"{len(annotation_names)} annotations found: {annotation_names}.")
    else:
        print(f"File {path_annotation_csv} not found.")


Annotation file correctly loaded.
20 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster', 'custom_label'].


In [403]:
# Prepare data for visualization: read the augmented projection and the annotation file
# (presumably merged by `read_embeddings` into one frame — see its definition to confirm).

df_embedding = read_embeddings(path_embedding_new, path_annotation, withroot=False)

# List the annotation columns found in the annotation file (if one was loaded):
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

20 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster', 'custom_label'].


In [404]:
# Preview the first rows of the frame that will be plotted.
df_embedding.head()

Unnamed: 0_level_0,pm1,pm2,proteins_id,1_Group,2_Gene,3_HGNC,4_Uni_entry,5_Uni_acc,6_Domain_begin,7_Domain_end,...,10_PDB_validation,11_Conformational_state,12_Dihedral_state,13_Group_in_Uni,14_Group_in_Manning,15_Synonymn,evo_distance,decile_domain,small_cluster,custom_label
proteins_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.30271,0.578413,1,AGC,AKT1,HGNC:391,AKT1_HUMAN,P31749,150,408,...,6NPZB,DFGin,BLAminus,AGC,AGC,"PKB,RAC",1.208266,2,RAC,Other
2,0.302444,0.577548,2,AGC,AKT2,HGNC:392,AKT2_HUMAN,P31751,152,409,...,3E8DB,DFGin,BLAminus,AGC,AGC,,1.208312,2,RAC,Other
3,0.302553,0.579029,3,AGC,AKT3,HGNC:393,AKT3_HUMAN,Q9Y243,148,405,...,,,,AGC,AGC,PKBG,1.183463,2,RAC,Other
4,0.361276,0.691123,4,AGC,CDC42BPA,HGNC:1737,MRCKA_HUMAN,Q5VT25,77,343,...,,,,AGC,AGC,KIAA0451,1.399119,5,DMPK,Other
5,0.361732,0.693415,5,AGC,CDC42BPB,HGNC:1738,MRCKB_HUMAN,Q9Y5S2,76,342,...,5OTFA,DFGin,BLAminus,AGC,AGC,KIAA1124,1.406213,5,DMPK,Other


In [405]:
# Custom color palette: one plot color per label value
# (it is passed as `color_palette` to the plotting call).
kinase_palette = {
    'NEW_POINT_1': 'red',
    'Other': 'black',
    'Used': 'blue',
}

In [406]:
# # Construction of custom color palette 
# kinase_palette = {-1 : "#c7c7c7", "OTHER": "#c7c7c7", "None" :"#c7c7c7", "NA" : "#c7c7c7", "Uncharacterized" : "#c7c7c7", "root": "#000000",
#                   "TYR": "#bd065f", "CMGC": "#d5c203", "TKL": "#997e73","STE": "#80b412", # kinase groups 
#                   "CK1": "#0dbae9", "AGC": "#00bba1", "CAMK":  "#1f6ed4", "NEK": "#8ce4fa", "RGC":"#f59a62"}

In [407]:
# OPTIONS =================================================
# Parameters controlling how the resulting projection is colored and annotated.

title = "Poincaré projection of the kinase 1 protein family without embeddings"

#----------------------------------------------------------
# Annotation column (from the annotation .csv file) used for coloring:
labels_name = "custom_label"
# Optional second annotation column, and the classes to label with text:
second_labels_name = ""
labels_text = []
show_text = False
#----------------------------------------------------------
# Custom color palette (set `use_custom_palette` to False to use the default):
color_palette = kinase_palette
use_custom_palette = True
#==========================================================


# Validate the visualization parameters.
# A non-empty name must be one of the known annotation columns.
if labels_name != "" and labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
if second_labels_name != "" and second_labels_name not in annotation_names:
    raise NameError(f'"second_labels_name" {second_labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}')

# An empty string means "not set" and is passed on as None.
labels_name = None if labels_name == "" else labels_name
second_labels_name = None if second_labels_name == "" else second_labels_name

# Drop the palette entirely when the custom palette is disabled.
color_palette = color_palette if use_custom_palette else None

# Plot the projection.
plot_options = dict(
    labels_name=labels_name,
    second_labels_name=second_labels_name,
    show_text=show_text,
    labels_text=labels_text,
    color_palette=color_palette,
    title=title,
    fontsize=11,
)
fig = plot_embedding_interactive(df_embedding, **plot_options)
fig.show()