[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DSIMB/PoincareMSA/blob/master/PoincareMSA_colab.ipynb)

<img src="https://github.com/DSIMB/PoincareMSA/blob/master/.github/PoincareMSA_small_logo.png?raw=true" height="100" style="height:100px;margin-left: 0px;">

# Poincaré maps for visualization of large protein families

**Authors**: Anna Klimovskaia Susmelj, Yani Ren, Yann Vander Meersche, Jean-Christophe Gelly and Tatiana Galochkina

PoincaréMSA builds an interactive projection of an input protein multiple sequence alignment (MSA) using a method based on Poincaré maps described by Klimovskaia et al. [1]. It reproduces both the local proximities of protein sequences and the hierarchy contained in the given data. Thus, sequences located closer to the center of the projection correspond to the proteins sharing the most general functional properties and/or appearing at the earlier stages of evolution. Source code is available at https://github.com/DSIMB/PoincareMSA.

[1] Klimovskaia, A., Lopez-Paz, D., Bottou, L. et al. Poincaré maps for analyzing complex hierarchies in single-cell data. Nat Commun 11, 2966 (2020).

# Directory Parameters

In [1]:
# Update working directory

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import sys

# Put the repository root on the module search path (only once) so that the
# `scripts.*` imports used further down in this notebook can be resolved.
project_root = "/home/hugo/Bureau/PoincareMSA"
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Parameters

In [3]:
# GENERAL OPTIONS ==========================================
# Kind of input data; forwarded to main.py as --method by build_command.
data_type = "distance_matrix" # one of: "pssm", "plm", "RFA_matrix", "plm_aae", "distance_matrix"
#----------------------------------------------------------
# Annotation file (.csv) or UniProt ID list. Use empty strings for no annotations.
# Only one of the two paths may be non-empty (checked in the "Data import" cell).
path_annotation_csv = "examples/kinases/kinase_group_new.csv"   # Path or ""
# OR
path_uniprot_list = ""   # Path or ""
#==========================================================

# POINCARE PARAMETERS ====================================
# Here you control different parameters of Poincaré maps.
# In our computational experiments the best results were achieved for the following values provided by default.
# The impact of different parameters is analyzed in the original paper [1].
# NOTE(review): build_command forwards knn, gamma, cospca (as --pca), epochs,
# seed and distance (as --distlocal) to main.py; `sigma` and `batchs` are not
# forwarded by that function, so changing them here has no effect on the
# command it builds -- confirm whether that is intended.
knn = 5
gamma = 1
sigma = 1
cospca = 0
batchs = 4
epochs = 10
seed = 4
distance = "minkowski"  # "minkowski" or "cosine"
#==========================================================
#==========================================================

In [4]:
# OPTIONS WITH DISTANCE MATRIX ============================
# Input the distance matrix file here (forwarded as --distance_matrix by build_command)
distance_matrix = "kinases_data/test_add_point_mfasta/distance_matrix.csv"
# Folder for intermediate matrices (forwarded as --matrices_output_path); the
# "new point" cells also write their augmented CSV files here.
mid_output = 'kinases_data/test_add_point_mfasta/outputs/'
# Labels file (forwarded as --labels)
labels = 'kinases_data/test_add_point_mfasta/labels.csv'
feat_path = "kinases_data/test_add_point_mfasta/features.csv" # feature path, read by the "new point" cells
# NOTE(review): this value is reset to "" by the dependency-loading cell and then
# replaced by path_annotation_csv in the "Data import" cell, so it is not the
# path used by the later cells -- confirm which annotation file is intended.
path_annotation = "/home/hugo/Bureau/PoincareMSA/kinases_data/test_add_point_mfasta/kinase_group_new.csv"
# Input True if the matrix comes from plm and False for pssm (forwarded as --plm_embedding)
matrix_plm = True
# Folder for the results (forwarded as --output_path)
out_name_results = 'kinases_data/test_add_point_mfasta/results/' # Input desired name of output folder
#==========================================================
#==========================================================

# Libraries

In [5]:
#Load dependencies
import os
import numpy as np
import pandas as pd
import subprocess
import json
import copy
import warnings
warnings.filterwarnings('ignore')

#Import visualization functions
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
from scripts.prepare_data.mmseqs2_api import run_mmseqs2
from scripts.prepare_data.uniprot_idmapping_api import submit_id_mapping, check_id_mapping_results_ready, get_id_mapping_results_link, get_id_mapping_results_search

%matplotlib inline

# Create optional variables.
# NOTE(review): this overwrites the `path_annotation` assigned in the
# "OPTIONS WITH DISTANCE MATRIX" cell; the "Data import" cell then sets it to
# path_annotation_csv when that file is found.
path_annotation = ""

from scripts.build_poincare_map.data import append_point_to_feature_and_distance, append_centroid_to_embedding, append_annotation_for_new_point, find_k_neighbors_for_new_point
import scripts.build_poincare_map.data as data_mod

# import importlib
# from importlib import reload
# reload(data_mod)
# importlib.reload(data_mod)

# Data import

In [6]:
# Validate the annotation inputs and load the annotation table.
#
# Reads the globals `path_annotation_csv`, `path_uniprot_list` and `data_type`;
# on success defines `df_annotation`, `annotation_names` and sets
# `path_annotation` to the CSV path.  A missing file is only reported (the
# notebook keeps running without annotations).
#
# Raises:
#     ValueError: both annotation sources are given, the CSV cannot be parsed,
#         or its row count does not match `nb_seq` for data types that need it.

# Number of sequences of the .mfasta input; stays 0 here because no .mfasta is
# read in this notebook (the length check below is skipped for matrix inputs).
nb_seq = 0

# Check that only one path is selected
if path_annotation_csv and path_uniprot_list:
    raise ValueError("Use only one file path (path_annotation_csv OR path_uniprot_list).")

if path_annotation_csv:
    if os.path.isfile(path_annotation_csv):
        try:
            df_annotation = pd.read_csv(path_annotation_csv)
        except Exception as err:
            # Keep the original parser error attached as the cause, and let
            # KeyboardInterrupt / SystemExit propagate untouched.
            raise ValueError("Annotation file is not in .csv format.") from err

        # Matrix inputs carry no .mfasta, so the length comparison only
        # applies to the other data types.
        if len(df_annotation) != nb_seq and data_type not in ("RFA_matrix", "distance_matrix"):
            raise ValueError("Annotation file doesn't match the .mfasta file length.")

        # Add id column
        if "proteins_id" not in df_annotation.columns:
            df_annotation.insert(0, "proteins_id", range(len(df_annotation)))
        path_annotation = path_annotation_csv

        print("\nAnnotation file correctly loaded.")
        annotation_names = list(df_annotation.columns)
        print(f"{len(annotation_names)} annotations found: {annotation_names}.")
    else:
        print(f"File {path_annotation_csv} not found.")


Annotation file correctly loaded.
19 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster'].


# Projection

## Function for building the bash command

In [7]:
in_name = "None"


def build_command(data_type=data_type, project_root=project_root, in_name=in_name, out_name_results=out_name_results,
                  mid_output=mid_output, distance=distance, gamma=gamma, cospca=cospca, epochs=epochs, seed=seed, knn=knn,
                  distance_matrix=None, labels=None, plm_embedding=None):
    """Build the shell command that runs scripts/build_poincare_map/main.py.

    The first eleven parameters default to the notebook values that were in
    effect when this cell was executed; re-run the cell after changing them.

    Args:
        data_type: Value for --method.
        project_root: Directory prepended to PYTHONPATH for the command.
        in_name: Value for --input_path.
        out_name_results: Value for --output_path.
        mid_output: Value for --matrices_output_path.
        distance: Value for --distlocal.
        gamma: Value for --gamma.
        cospca: Value for --pca.
        epochs: Value for --epochs.
        seed: Value for --seed.
        knn: Value for --knn.
        distance_matrix: Value for --distance_matrix.  ``None`` (default)
            uses the notebook global ``distance_matrix`` at call time.
        labels: Value for --labels.  ``None`` (default) uses the notebook
            global ``labels`` at call time.
        plm_embedding: Value for --plm_embedding.  ``None`` (default) uses
            the notebook global ``matrix_plm`` at call time.

    Returns:
        str: The command line, meant to be run through a shell.

    NOTE(review): the notebook parameter ``sigma`` is not forwarded here;
    confirm whether main.py should receive it.
    """
    import shlex

    # Resolve the optional overrides against the notebook namespace at call
    # time, which is how the original function read these three values.
    notebook_globals = globals()
    if distance_matrix is None:
        distance_matrix = notebook_globals["distance_matrix"]
    if labels is None:
        labels = notebook_globals["labels"]
    if plm_embedding is None:
        plm_embedding = notebook_globals["matrix_plm"]

    def _arg(value):
        # Quote for the shell so that paths containing spaces or shell
        # metacharacters stay a single argument; plain values are unchanged.
        return shlex.quote(str(value))

    cmd = (
        f"PYTHONPATH={_arg(project_root)}:$PYTHONPATH python scripts/build_poincare_map/main.py "
        f"--input_path {_arg(in_name)} "
        f"--output_path {_arg(out_name_results)} "
        f"--plm_embedding {_arg(plm_embedding)} "
        f"--matrices_output_path {_arg(mid_output)} "
        f"--distlocal {_arg(distance)} "
        f"--gamma {_arg(gamma)} "
        f"--pca {_arg(cospca)} "
        f"--epochs {_arg(epochs)} "
        f"--seed {_arg(seed)} "
        f"--knn {_arg(knn)} "
        f"--method {_arg(data_type)} "
        f"--distance_matrix {_arg(distance_matrix)} "
        f"--labels {_arg(labels)} "
    )

    return cmd

## Original projection using bash command

In [8]:
# Build the projection command from the notebook parameters and run it in a
# shell.  The following cells read the files this command writes, so stop
# here if it did not exit cleanly instead of continuing with missing or stale
# outputs.
cmd = build_command()
print("CMD:", cmd)
exit_status = os.system(cmd)
if exit_status != 0:
    raise RuntimeError(f"Poincare map computation failed (exit status {exit_status}).")
# Keep the status as the cell's displayed value, as before.
exit_status

CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path None --output_path kinases_data/test_add_point_mfasta/results/ --plm_embedding True --matrices_output_path kinases_data/test_add_point_mfasta/outputs/ --distlocal minkowski --gamma 1 --pca 0 --epochs 10 --seed 4 --knn 5 --method distance_matrix --distance_matrix kinases_data/test_add_point_mfasta/distance_matrix.csv --labels kinases_data/test_add_point_mfasta/labels.csv 


INFO: CUDA available: True
INFO: Random seed set as 4
INFO: KNN matrix CSV file saved to kinases_data/test_add_point_mfasta/outputs/KNN_matrix.csv
INFO: Computing laplacian...
INFO: Laplacian computed in 0.15 sec
INFO: Computing RFA...
INFO: RFA computed in 0.01 sec
INFO: RFA matrix computed (tensor shape (497, 497))
INFO: RFA matrix CSV file saved to kinases_data/test_add_point_mfasta/outputs/RFA_matrix.csv
INFO: Starting training...
loss: 45.47999: 100%|███████████████████████████| 10/10 [00:01<00:00,  7.17it/s]
INFO: 
loss = 4.548e+01
time = 0.030 min


PM computed in 1.39 sec




0

## New point on the projection

In [9]:
# Example: append a new feature vector and update the distance matrix.
#
# Reads the features file at `feat_path`, builds a test vector, and asks
# append_point_to_feature_and_distance to write augmented copies of the
# features / distance matrix / labels into `mid_output`, leaving the original
# files untouched.

# Fail with a clear message instead of letting pd.read_csv crash right below.
if not os.path.exists(feat_path):
    raise FileNotFoundError(f"features file not found at {feat_path}")

df_feat = pd.read_csv(feat_path, header=None)

###### INPUT THE NEW VECTOR HERE ######  Here we test it by using the mean of the two first vectors of the matrix
new_vec = (df_feat.iloc[0].values.astype(float) + df_feat.iloc[1].values.astype(float)) / 2
#######################################

# Labels are optional: use them only when the variable exists and is non-empty.
labels_in = labels if ('labels' in globals() and labels) else None

# Prepare output filenames so originals are preserved.  os.path.join works
# whether or not `mid_output` ends with a path separator.
out_feat = os.path.join(mid_output, 'features_with_new_vec.csv')
out_dist = os.path.join(mid_output, 'distance_matrix_with_new_vec.csv')
out_labels = os.path.join(mid_output, 'labels_with_new_vec.csv') if labels_in else None

# Call helper (this will write the updated files)
res = append_point_to_feature_and_distance(
    features_path=feat_path,
    distance_path=distance_matrix,
    new_feature=new_vec,
    new_id='New_point',
    labels_path=labels_in,
    out_features_path=out_feat,
    out_distance_path=out_dist,
    out_labels_path=out_labels,
    metric='euclidean'
)
print('Append result:', res)

Append result: {'features_path': 'kinases_data/test_add_point_mfasta/outputs/features_with_new_vec.csv', 'distance_path': 'kinases_data/test_add_point_mfasta/outputs/distance_matrix_with_new_vec.csv', 'features_shape': (498, 4600), 'distance_shape': (498, 498), 'labels_path': 'kinases_data/test_add_point_mfasta/outputs/labels_with_new_vec.csv', 'labels_shape': (498, 1)}


In [10]:
# Display the feature vector of the new point built in the previous cell.
new_vec

array([0.11655   , 0.0263    , 0.06925   , ..., 0.0243    , 2.14199996,
       0.0734    ], shape=(4600,))

In [None]:
# Ask the helper for the 5 entries of the features file closest to the new
# vector (euclidean metric) and list them, one per line.  In the recorded run
# each entry is a dict with 'index', 'distance' and 'protein_id'.
neighbors = find_k_neighbors_for_new_point(
    feat_path,
    new_vec,
    k=5,
    labels_path=labels,
    metric='euclidean',
)
print("Top-5 neighbors of new point:")
for neighbor in neighbors:
    print(neighbor)

Top-5 neighbors of new point:
{'index': 0, 'distance': 16.368855078361438, 'protein_id': '74'}
{'index': 1, 'distance': 16.36885507836152, 'protein_id': '358'}
{'index': 131, 'distance': 17.96526532586899, 'protein_id': '359'}
{'index': 321, 'distance': 17.977529536690408, 'protein_id': '73'}
{'index': 324, 'distance': 18.045918084359208, 'protein_id': '360'}


In [13]:
# Append the new point to the embedding produced by the projection step.
#
# The embedding file name is rebuilt from the parameters that build_command
# forwards to main.py (knn, gamma, distance, pca, seed) so that it follows
# changes made in the parameter cell.
# NOTE(review): the pattern mirrors the file name observed for the default
# parameters; confirm the number formatting against main.py.  The "sigma=1.00"
# segment is kept literal because build_command does not forward `sigma`.
emb_name = f"PM{knn:.0f}sigma=1.00gamma={gamma:.2f}{distance}pca={cospca:.0f}_seed{seed:.0f}.csv"
emb_path = os.path.join(out_name_results, emb_name)
if not os.path.isfile(emb_path):
    raise FileNotFoundError(
        f"Embedding file {emb_path} not found; run the projection cell first "
        "and check that the parameters match."
    )

# Insert the suffix before the extension only (str.replace would rewrite every
# '.csv' occurring anywhere in the path).
emb_root, emb_ext = os.path.splitext(emb_path)
out_emb = f"{emb_root}_centroid_with_new_vec{emb_ext}"

df_app, neighbors_used = append_centroid_to_embedding(
    embedding_path=emb_path,
    features_path=feat_path,
    new_feature=new_vec,
    k=5,
    labels_path=labels,
    new_id='New_point',
    out_embedding_path=out_emb,
    metric='euclidean'
)

print('Appended embedding written to:', out_emb)
print('Last row (the centroid):')
print(df_app.tail(1).to_string(index=False))

print('Neighbors used to compute centroid:')
for neighbor in neighbors_used:
    print(neighbor)

Appended embedding written to: kinases_data/test_add_point_mfasta/results/PM5sigma=1.00gamma=1.00minkowskipca=0_seed4_centroid_with_new_vec.csv
Last row (the centroid):
     pm1       pm2 proteins_id
0.011436 -0.033091   New_point
Neighbors used to compute centroid:
{'index': 0, 'distance': 16.368855078361438, 'protein_id': '74'}
{'index': 1, 'distance': 16.36885507836152, 'protein_id': '358'}
{'index': 131, 'distance': 17.96526532586899, 'protein_id': '359'}
{'index': 321, 'distance': 17.977529536690408, 'protein_id': '73'}
{'index': 324, 'distance': 18.045918084359208, 'protein_id': '360'}


In [14]:
# Identifier of the appended point: the same string that was passed as
# `new_id` to the earlier append calls ('New_point').
new_id = "New_point"

# Add an annotation row for the new point to the annotation table at
# `path_annotation` and show that row.
# NOTE(review): where the augmented annotation file is written is decided
# inside append_annotation_for_new_point -- confirm it matches the path read
# by the plotting cells.
df_ann_app = append_annotation_for_new_point(path_annotation, new_id=new_id)
print("Wrote appended annotation; last row:")
print(df_ann_app.tail(1).to_string(index=False))

Wrote appended annotation; last row:
proteins_id   1_Group    2_Gene    3_HGNC 4_Uni_entry 5_Uni_acc  6_Domain_begin  7_Domain_end  8_Domain_length  9_Largest_insert_length 10_PDB_validation 11_Conformational_state 12_Dihedral_state 13_Group_in_Uni 14_Group_in_Manning 15_Synonymn  evo_distance  decile_domain small_cluster
  New_point New_point New_point New_point   New_point New_point               0             0                0                        0         New_point               New_point         New_point       New_point           New_point   New_point           0.0              0     New_point


### Create interactive plot

In [None]:
# Prepare data for visualization: load the original embedding and the
# embedding that contains the new point, each with its annotation file.

# Original embedding with the original annotations.
df_embedding = read_embeddings(emb_path, path_annotation, withroot=False)

# NOTE(review): hard-coded absolute path to the augmented annotation file; it
# only works on the author's machine -- confirm and derive it from the
# annotation path instead.
path_annotation_new = "/home/hugo/Bureau/PoincareMSA/kinases_data/test_add_point_mfasta/kinase_group_new_with_new_vec.csv"

# Embedding including the new point, with the augmented annotations.
df_embedding_new = read_embeddings(out_emb, path_annotation_new, withroot=False)

# Here are the different labels found in your annotation file (if one was uploaded).
# `annotation_names` is defined by the "Data import" cell only when the
# annotation file was found.
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

19 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster'].


# Projection visualization

## Visualization

In [18]:
# Highlight the neighbours of the new point (and the new point itself) on the
# two embeddings, then draw both maps.  Every step is best-effort: anything
# that has not been computed yet in this session is simply skipped.

# Row indices and protein ids of the neighbours used for the centroid; both
# stay empty when `neighbors_used` is missing or malformed.
neighbor_indices, neighbor_pids = [], []
if 'neighbors_used' in globals():
    try:
        neighbor_indices = [
            int(entry.get('index'))
            for entry in neighbors_used
            if entry.get('index') is not None
        ]
        neighbor_pids = [
            str(entry.get('protein_id'))
            for entry in neighbors_used
            if entry.get('protein_id') is not None
        ]
    except Exception:
        neighbor_indices, neighbor_pids = [], []

# Add a 'Legend' column to each embedding that exists (original and augmented).
# NOTE(review): neighbours are tagged both by row index and by proteins_id;
# confirm that the row order of the embedding matches the features file,
# otherwise the index-based tagging may mark unrelated points.
for frame_name in ('df_embedding', 'df_embedding_new'):
    if frame_name not in globals():
        continue
    frame = globals()[frame_name]
    has_ids = 'proteins_id' in frame.columns

    # Everything starts in the default category.
    frame['Legend'] = 'Other'

    # Tag neighbours by row index, keeping only indices that fit this frame.
    in_range = [
        idx for idx in neighbor_indices
        if idx is not None and 0 <= int(idx) < len(frame)
    ]
    if in_range:
        frame.loc[frame.index.isin(in_range), 'Legend'] = 'Neighbor'

    # Tag neighbours by protein id as well, when the column is available.
    if has_ids and neighbor_pids:
        is_neighbor = frame['proteins_id'].astype(str).isin(neighbor_pids)
        frame.loc[is_neighbor, 'Legend'] = 'Neighbor'

    # The new point only exists in the augmented embedding.
    if frame_name == 'df_embedding_new' and 'new_id' in globals() and has_ids:
        frame.loc[frame['proteins_id'].astype(str) == str(new_id), 'Legend'] = 'New_point'

    # Re-bind the name so downstream cells see the column.
    globals()[frame_name] = frame

# Colours for the three legend categories.
palette = {'Neighbor': 'orange', 'New_point': 'red', 'Other': 'black'}

# Options shared by both figures.
plot_options = dict(labels_name='Legend', color_palette=palette, fontsize=11)

# Map without the new point.
if 'df_embedding' in globals():
    fig = plot_embedding_interactive(df_embedding, title='Poincaré map (without new point)', **plot_options)
    fig.show()

# Map with the new point.
if 'df_embedding_new' in globals():
    fig2 = plot_embedding_interactive(df_embedding_new, title='Poincaré map (with new point)', **plot_options)
    fig2.show()

### Save plot to file

In [None]:
# # OPTIONS =================================================
# output_name = "kinases"
# output_format = "html" #Format availables: ["png", "html", "pdf", "svg"]
# #==========================================================


# if output_format != "html":
#     fig.write_image(f"{output_name}.{output_format}", engine="kaleido")
# else:
#     fig.write_html(f"{output_name}.{output_format}")