[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DSIMB/PoincareMSA/blob/master/PoincareMSA_colab.ipynb)

<img src="https://github.com/DSIMB/PoincareMSA/blob/master/.github/PoincareMSA_small_logo.png?raw=true" height="100" style="height:100px;margin-left: 0px;">

# Poincaré maps for visualization of large protein famillies

**Authors**: Anna Klimovskaia Susmelj, Yani Ren, Yann Vander Meersche, Jean-Christophe Gelly and Tatiana Galochkina

PoincaréMSA builds an interactive projection of an input protein multiple sequence alignemnt (MSA) using a method based on Poincaré maps described by Klimovskaia et al [1]. It reproduces both local proximities of protein sequences and hierarchy contained in give data. Thus, sequences located closer to the center of projection correspond to the proteins sharing the most general functional properites and/or appearing at the earlier stages of evolution. Source code is available at https://github.com/DSIMB/PoincareMSA.

[1] Klimovskaia, A., Lopez-Paz, D., Bottou, L. et al. Poincaré maps for analyzing complex hierarchies in single-cell data. Nat Commun 11, 2966 (2020).

# Directory Parameters

In [1]:
# Update working directory

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
import sys

# Add the project root to Python path
project_root = "/home/hugo/Bureau/PoincareMSA"
if project_root not in sys.path:
    sys.path.append(project_root)

# Parameters

In [3]:
# GENERAL OPTIONS ==========================================
# Embedding or without
data_type = "distance_matrix" #  "pssm"   "plm"   "RFA_matrix"   "plm_aae"   "distance_matrix"
#----------------------------------------------------------
#Annotation file (.csv) or UniProt ID list. (Emtpy strings for no annotations)
path_annotation_csv = "examples/kinases/kinase_group_new.csv"   # Path or ""
# OR
path_uniprot_list = ""   # Path or ""
#==========================================================

# POINCARE PARAMETERS ====================================
# Here you control different parameters of Poincaré maps.
# In our computational experiments the best results were achieved for the following values provided by default.
# The impact of different parameters is analyzed in the original paper [1].
knn = 5
gamma = 1
sigma = 1
cospca = 0
batchs = 4
epochs = 10
seed = 4
distance = "minkowski"  #"minkowski"  "cosine"
#==========================================================

In [4]:
# OPTIONS WITH EMBEDDING ===============================
if data_type == "plm" :
    # Input the embedding folder here
    embedding_path = 'embeddings/ankh_base_kinases/'
    #--------------------------------------------------
    in_name = 'embeddings/ankh_base_kinases/'  # Input here the name of folder with the embeddings
    mid_output = 'kinases_data/with_plm_embeddings/'  # Input name of desired folder for intermediary results
    out_name_results = 'results/kinases/with_plm_embeddings/' # Input desired name of output folder
#==========================================================

In [5]:
# OPTIONS WITH EMBEDDING AUTO ENCODER ==================
if data_type == "plm_aae" :
    # Input the embedding folder here
    embedding_path = 'embeddings/aae_embeddings/ankh_base_kinases/'
    #--------------------------------------------------
    in_name = 'embeddings/aae_embeddings/ankh_base_kinases/'  # Input here the name of folder with the embeddings
    mid_output = 'kinases_data/with_aae_plm_embeddings/'  # Input name of desired folder for intermediary results
    out_name_results = 'results/kinases/with_aae_plm_embeddings/' # Input desired name of output folder
#==========================================================

In [6]:
# OPTIONS WITHOUT EMBEDDING ===============================
if data_type == "pssm" :
    # Input the mfasta file here
    mfasta = 'examples/kinases/kinases.mfasta'
    # Name for the output folder
    out_name = "kinases_data"
    #----------------------------------------------------------
    # Threshold for filtering gapped positions
    # Positions with proportion of gaps above the given threshold are removed from the alignment.
    # If your alignment is very gapped, you may want to increase this value.
    gapth = 0.9 
    #----------------------------------------------------------
    in_name = 'examples/kinases/kinases_data/fasta0.9/'  # Input here the name of folder with the fasta files
    mid_output = 'kinases_data/with_mfasta/'  # Input name of desired folder for intermediary results
    out_name_results = 'results/kinases/with_mfasta/' # Input desired name of output folder
#==========================================================

In [7]:
# OPTIONS WITH DISTANCE MATRIX ============================
if data_type == "RFA_matrix" :
    # Input the RFA matrix file here
    mid_output = 'RFA_matrix/kinases/with_plm_embeddings/'
    # Input True if the matrix comes from plm and False for pssm
    matrix_plm = True
    # Folder for the results
    out_name_results = 'RFA_matrix/kinases/with_plm_embeddings/results/' # Input desired name of output folder
#==========================================================

In [8]:
# OPTIONS WITH DISTANCE MATRIX ============================
if data_type == "distance_matrix" :
    # Input the distance matrix file here
    distance_matrix = 'distance_matrix/kinases/with_mfasta/distance_matrix.csv'
    mid_output = 'distance_matrix/kinases/with_mfasta/outputs/'
    labels = 'distance_matrix/kinases/with_mfasta/labels.csv'
    # Input True if the matrix comes from plm and False for pssm
    matrix_plm = True
    # Folder for the results
    out_name_results = 'distance_matrix/kinases/with_mfasta/results/' # Input desired name of output folder
#==========================================================

# Librairies

In [9]:
#Load dependencies
import os
import numpy as np
import pandas as pd
import subprocess
import json
import warnings
warnings.filterwarnings('ignore')

#Import visualization functions
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
from scripts.prepare_data.mmseqs2_api import run_mmseqs2
from scripts.prepare_data.uniprot_idmapping_api import submit_id_mapping, check_id_mapping_results_ready, get_id_mapping_results_link, get_id_mapping_results_search

%matplotlib inline

#Create optional variables
path_annotation = ""

from scripts.build_poincare_map.data import append_point_to_feature_and_distance
import scripts.build_poincare_map.data as data_mod
from scripts.build_poincare_map.data import append_annotation_for_new_point
from scripts.build_poincare_map.data import find_k_neighbors_for_new_point

# import importlib
# from importlib import reload
# reload(data_mod)

# Data import

In [10]:
#Check files
#mfasta
nb_seq = 0

if data_type == "pssm" :
    if os.path.isfile(mfasta):
        with open(mfasta, "r") as f_in:
            for line in f_in:
                if line[0] == ">":
                    nb_seq += 1
        print(f"\nNumber of sequences found: {nb_seq}.")
    else:
        print(f"File {mfasta} not found.")

elif data_type == "plm" or data_type == "plm_aae":
    if os.path.exists(embedding_path):
        nb_seq = len([s for s in os.listdir(embedding_path) if '.pt' in s])
        print(f"\nNumber of sequences found: {nb_seq}.")
    else:
        print(f"Folder {embedding_path} not found.")

elif data_type == "RFA_matrix" :
    print(f"RFA matrix used")

elif data_type == "Distance_matrix" :
    print(f"Distance matrix used")
    
else:
    print('Neither a valid mfasta or embedding folder has been provided')

#Check that only one path is selected
if path_annotation_csv and path_uniprot_list:
    raise ValueError("Use only one file path (path_annotation_csv OR path_uniprot_list).")

#Check that only one path is selected
if path_annotation_csv and path_uniprot_list:
    raise ValueError("Use only one file path (path_annotation_csv OR path_uniprot_list).")

if path_annotation_csv :
    if os.path.isfile(path_annotation_csv):
        try:
            df_annotation = pd.read_csv(path_annotation_csv)
        except:
            raise ValueError("Annotation file is not in .csv format.")
        else:
            if len(df_annotation) != nb_seq and (data_type != "RFA_matrix" and data_type != "distance_matrix"):
                raise ValueError("Annotation file doesn't match the .mfasta file length.")

        #Add id column
        if "proteins_id" not in df_annotation.columns:
            df_annotation.insert(0, "proteins_id", range(len(df_annotation)))
        path_annotation = path_annotation_csv
            
        print("\nAnnotation file correctly loaded.")
        annotation_names = list(df_annotation.columns)
        print(f"{len(annotation_names)} annotations found: {annotation_names}.")
    else:
        print(f"File {path_annotation_csv} not found.")

elif path_uniprot_list and (data_type != "RFA_matrix" and data_type != "distance_matrix"):
    if os.path.isfile(path_uniprot_list):
        try:
            UnP_ids = np.genfromtxt(path_uniprot_list, dtype="str")
        except:
            raise ValueError("UniProt IDs file is not in a valid format.")
        else:
            if len(UnP_ids) != nb_seq:
                raise ValueError("UniProt IDs file doesn't match the .mfasta file length.")     

            #Split UniProtKB and UniParc IDs
            uniparc_ids = []
            uniprot_ids = []
            for unp in UnP_ids:
                if len(unp) == 13 and unp[:2] == "UP":
                    uniparc_ids.append(unp)
                else:
                    uniprot_ids.append(unp)

            #Fetch UniProtKB annotations
            job_id = submit_id_mapping(
                from_db="UniProtKB_AC-ID", to_db="UniParc", ids=uniprot_ids
            )

            if check_id_mapping_results_ready(job_id):
                link = get_id_mapping_results_link(job_id)
                results = get_id_mapping_results_search(link)

            #Fetch UniParc annotations
            job_id = submit_id_mapping(
                from_db="UniParc", to_db="UniParc", ids=uniparc_ids
            )

            if check_id_mapping_results_ready(job_id):
                link = get_id_mapping_results_link(job_id)
                results2 = get_id_mapping_results_search(link)

            #Create annotation dataframe
            df_annotation = pd.DataFrame(UnP_ids, columns=["UnP_ID"])
            df_annotation[["organism", "proteinName", "taxonId", "species", "genus", \
                           "family", "order", "class", "phylum", "clade", "superkingdom"]] = ""

            #Fill the annotation DataFrame
            for dict_res in results["results"] + results2["results"]:
                try:
                    unp = dict_res["from"]
                    prot_name = dict_res["to"]["uniParcCrossReferences"][0]["proteinName"]
                    df_annotation.loc[df_annotation["UnP_ID"] == unp, "proteinName"] = prot_name
                    scientific_name = dict_res["to"]["uniParcCrossReferences"][0]["organism"]["scientificName"]
                    taxid = dict_res["to"]["uniParcCrossReferences"][0]["organism"]["taxonId"]
                    df_annotation.loc[df_annotation["UnP_ID"] == unp, "organism"] = scientific_name
                    df_annotation.loc[df_annotation["UnP_ID"] == unp, "taxonId"] = taxid
                except KeyError:
                    continue

            #Add lineage from NCBI Taxonomist
            taxon_ids = df_annotation.loc[df_annotation["taxonId"].notnull(), 'taxonId'].to_numpy()
            taxon_ids = list(set(taxon_ids))
            taxon_ids = list(map(str, taxon_ids))
            bash_command = f"ncbi-taxonomist resolve -t {','.join(taxon_ids)}"
            list_taxon = subprocess.run(bash_command, shell=True, capture_output=True, text=True).stdout.strip().split("\n")

            if list_taxon != [""]:
                for taxon in list_taxon:
                    jsonString = taxon
                    taxon_dict = json.loads(jsonString)
                    query = taxon_dict["query"]
                    for lineage in taxon_dict["lineage"]:
                        rank = lineage["rank"]
                        if rank in ["species", "genus", "family", "order", "class", "phylum", "clade", "superkingdom"]:
                            name = lineage["name"]
                            df_annotation.loc[df_annotation["taxonId"] == int(query), rank] = name

            #Add id column
            if "proteins_id" not in df_annotation.columns:
                df_annotation.insert(0, "proteins_id", range(len(df_annotation)))

            #Save annotation to csv
            path_annotation = "auto_annot.csv"
            df_annotation.to_csv(path_annotation, index=False)

            print("\nAnnotation correctly fetched.")
            annotation_names = list(df_annotation.columns)
            print(f"{len(annotation_names)} annotations found: {annotation_names}.")
    else:
        print(f"File {path_uniprot_list} not found.")
elif data_type == "RFA_matrix":
    print(f"\nDistance matrix used")
    path_annotation = "auto_annot.csv"
else:
    print("No annotation file selected.")
    path_annotation = ""

Neither a valid mfasta or embedding folder has been provided

Annotation file correctly loaded.
19 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster'].


# Projection

## Function for bash building command

In [11]:
if data_type in ["RFA_matrix", "distance_matrix"]:
        in_name = "None"
        

def build_command(data_type=data_type, project_root=project_root, in_name=in_name, out_name_results=out_name_results, 
                  mid_output=mid_output, distance=distance, gamma=gamma, cospca=cospca, epochs=epochs, seed=seed, knn=knn):

    if data_type in ["plm", "plm_aae"]:
        plm_flag = "True"
    elif data_type == "pssm":
        plm_flag = "False"
    elif data_type in ["RFA_matrix", "distance_matrix"]:
        plm_flag = matrix_plm

    if data_type != "distance_matrix":
        cmd = (
            f"PYTHONPATH={project_root}:$PYTHONPATH python scripts/build_poincare_map/main.py "
            f"--input_path {in_name} "
            f"--output_path {out_name_results} "
            f"--plm_embedding {plm_flag} "
            f"--matrices_output_path {mid_output} "
            f"--distlocal {distance} "
            f"--gamma {gamma} "
            f"--pca {cospca} "
            f"--epochs {epochs} "
            f"--seed {seed} "
            f"--knn {knn} "
            f"--method {data_type} "
        )
        
    elif data_type == "distance_matrix":
        cmd = (
            f"PYTHONPATH={project_root}:$PYTHONPATH python scripts/build_poincare_map/main.py "
            f"--input_path {in_name} "
            f"--output_path {out_name_results} "
            f"--plm_embedding {plm_flag} "
            f"--matrices_output_path {mid_output} "
            f"--distlocal {distance} "
            f"--gamma {gamma} "
            f"--pca {cospca} "
            f"--epochs {epochs} "
            f"--seed {seed} "
            f"--knn {knn} "
            f"--method {data_type} "
            f"--distance_matrix {distance_matrix} "
            f"--labels {labels} "
        )

    return cmd

## Data preparation for pssm
Here we clean the input .mfasta alignment and translate each sequence to a vector ready for projection.

In [12]:
if data_type == "pssm" : 
    # Run data preparation
    #Data preparation consists in `.mfasta` cleaning according to a gap threshold and
    #translation of each sequence to the PSSM profile.
    prep_parameters = "scripts/prepare_data" + " " + mfasta + " " + out_name + " " + out_name + " " + str(gapth)
    bash_projection = "bash scripts/prepare_data/create_projection.sh " + prep_parameters
    !{bash_projection}

## Projection using bash command

In [13]:
cmd = build_command()
print("CMD:", cmd)
os.system(cmd)

CMD: PYTHONPATH=/home/hugo/Bureau/PoincareMSA:$PYTHONPATH python scripts/build_poincare_map/main.py --input_path None --output_path distance_matrix/kinases/with_mfasta/results/ --plm_embedding True --matrices_output_path distance_matrix/kinases/with_mfasta/outputs/ --distlocal minkowski --gamma 1 --pca 0 --epochs 10 --seed 4 --knn 5 --method distance_matrix --distance_matrix distance_matrix/kinases/with_mfasta/distance_matrix.csv --labels distance_matrix/kinases/with_mfasta/labels.csv 


INFO: CUDA available: True
INFO: Random seed set as 4
INFO: KNN matrix CSV file saved to distance_matrix/kinases/with_mfasta/outputs/KNN_matrix.csv
INFO: Computing laplacian...
INFO: Laplacian computed in 0.16 sec
INFO: Computing RFA...
INFO: RFA computed in 0.01 sec
INFO: RFA matrix computed (tensor shape (497, 497))
INFO: RFA matrix CSV file saved to distance_matrix/kinases/with_mfasta/outputs/RFA_matrix.csv
INFO: KNN matrix CSV file saved to distance_matrix/kinases/with_mfasta/outputs/KNN_matrix.csv
INFO: Computing laplacian...
INFO: Laplacian computed in 0.16 sec
INFO: Computing RFA...
INFO: RFA computed in 0.01 sec
INFO: RFA matrix computed (tensor shape (497, 497))
INFO: RFA matrix CSV file saved to distance_matrix/kinases/with_mfasta/outputs/RFA_matrix.csv
INFO: Starting training...
  0%|                                                    | 0/10 [00:00<?, ?it/s]INFO: Starting training...
loss: 45.47999: 100%|███████████████████████████| 10/10 [00:02<00:00,  4.68it/s]
loss: 45.47

PM computed in 2.14 sec


INFO: 
loss = 4.548e+01
time = 0.042 min


0

## Test add point

In [14]:
# Example: append a new feature vector and update the distance matrix

# Paths
feat_path = "kinases_data/test_add_point_mfasta/features.csv"
dist_path = "kinases_data/test_add_point_mfasta/distance_matrix.csv"
# If you don't have a features.csv, read features from the mid_output used earlier
if not os.path.exists(feat_path):
    print('features file not found at', feat_path)

# Here we try to add the mean point between the first and the second point of our data
df_feat = pd.read_csv(feat_path, header=None)
new_vec = (df_feat.iloc[0].values.astype(float) + df_feat.iloc[1].values.astype(float))/2

# Prepare output filenames so originals are preserved
out_feat = feat_path.replace('.csv', '.appended.csv')
out_dist = dist_path.replace('.csv', '.appended.csv')
out_labels = None
if 'labels' in globals() and labels:
    out_labels = labels.replace('.csv', '.appended.csv')

# Call helper (this will write the updated files)
res = append_point_to_feature_and_distance(
    features_path=feat_path,
    distance_path=dist_path,
    new_feature=new_vec,
    new_id='NEW_SEQ_EXAMPLE',
    labels_path=labels if ('labels' in globals() and labels) else None,
    out_features_path=out_feat,
    out_distance_path=out_dist,
    out_labels_path=out_labels,
    metric='euclidean'
)
print('Append result:', res)

Append result: {'features_path': 'kinases_data/test_add_point_mfasta/features.appended.csv', 'distance_path': 'kinases_data/test_add_point_mfasta/distance_matrix.appended.csv', 'features_shape': (498, 4600), 'distance_shape': (498, 498), 'labels_path': 'distance_matrix/kinases/with_mfasta/labels.appended.csv', 'labels_shape': (498, 1)}


In [15]:
new_vec

array([0.11655   , 0.0263    , 0.06925   , ..., 0.0243    , 2.14199996,
       0.0734    ], shape=(4600,))

In [16]:
feat_path = "kinases_data/test_add_point_mfasta/features.csv"
labels_path = "kinases_data/test_add_point_mfasta/labels.csv"  

neighbors = find_k_neighbors_for_new_point(feat_path, new_vec, k=5, labels_path=labels_path, metric='euclidean')
print("Top-5 neighbors of new point:")
for n in neighbors:
    print(n)

Top-5 neighbors of new point:
{'index': 0, 'distance': 16.368855078361438, 'protein_id': '74'}
{'index': 1, 'distance': 16.36885507836152, 'protein_id': '358'}
{'index': 131, 'distance': 17.96526532586899, 'protein_id': '359'}
{'index': 321, 'distance': 17.977529536690408, 'protein_id': '73'}
{'index': 324, 'distance': 18.045918084359208, 'protein_id': '360'}


In [17]:
# Example: compute centroid from top-k neighbors and append to embedding CSV
import importlib
import os
import scripts.build_poincare_map.data as data_mod
importlib.reload(data_mod)
from scripts.build_poincare_map.data import append_centroid_to_embedding

emb_path = "kinases_data/test_add_point_mfasta/PM5sigma=1.00gamma=1.00minkowskipca=0_seed4.csv"
feat_path = "kinases_data/test_add_point_mfasta/features.csv"
labels_path = "kinases_data/test_add_point_mfasta/labels.csv" if os.path.exists("kinases_data/test_add_point_mfasta/labels.csv") else None

# new_vec should already be defined by previous example cells (the vector you appended)
# If not, recompute a small example as the mean of first two feature rows:
if 'new_vec' not in globals():
    import pandas as pd
    df_feat_tmp = pd.read_csv(feat_path, header=None)
    new_vec = (df_feat_tmp.iloc[0].values.astype(float) + df_feat_tmp.iloc[1].values.astype(float)) / 2

out_emb = emb_path.replace('.csv', '.centroid_appended.csv')
df_app, neighbors_used = append_centroid_to_embedding(
    embedding_path=emb_path,
    features_path=feat_path,
    new_feature=new_vec,
    k=5,
    labels_path=labels_path,
    new_id='NEW_BY_CENTROID',
    out_embedding_path=out_emb,
    metric='euclidean'
)

print('Appended embedding written to:', out_emb)
print('Last row (the centroid):')
print(df_app.tail(1).to_string(index=False))

print('Neighbors used to compute centroid:')
for n in neighbors_used:
    print(n)

Appended embedding written to: kinases_data/test_add_point_mfasta/PM5sigma=1.00gamma=1.00minkowskipca=0_seed4.centroid_appended.csv
Last row (the centroid):
     pm1      pm2     proteins_id
-0.10964 0.194583 NEW_BY_CENTROID
Neighbors used to compute centroid:
{'index': 0, 'distance': 16.368855078361438, 'protein_id': '74'}
{'index': 1, 'distance': 16.36885507836152, 'protein_id': '358'}
{'index': 131, 'distance': 17.96526532586899, 'protein_id': '359'}
{'index': 321, 'distance': 17.977529536690408, 'protein_id': '73'}
{'index': 324, 'distance': 18.045918084359208, 'protein_id': '360'}


In [19]:
ann_path = "kinases_data/test_add_point_mfasta/kinase_group_new.csv"
# Suppose new protein id assigned earlier is 'NEW_BY_CENTROID' (string) or an int
new_id = "NEW_BY_CENTROID"

df_ann_app = append_annotation_for_new_point(ann_path, new_id=new_id)
print("Wrote appended annotation; last row:")
print(df_ann_app.tail(1).to_string(index=False))

Wrote appended annotation; last row:
    proteins_id   1_Group    2_Gene    3_HGNC 4_Uni_entry 5_Uni_acc  6_Domain_begin  7_Domain_end  8_Domain_length  9_Largest_insert_length 10_PDB_validation 11_Conformational_state 12_Dihedral_state 13_Group_in_Uni 14_Group_in_Manning 15_Synonymn  evo_distance  decile_domain small_cluster
NEW_BY_CENTROID New_point New_point New_point   New_point New_point               0             0                0                        0         New_point               New_point         New_point       New_point           New_point   New_point           0.0              0     New_point


### Create interactive plot

In [20]:
# Prepare data for visualization
#Check that an annotation file was provided. Create a dummy one instead

path_annotation = "/home/hugo/Bureau/PoincareMSA/kinases_data/test_add_point_mfasta/kinase_group_new.csv"
path_embedding = "/home/hugo/Bureau/PoincareMSA/kinases_data/test_add_point_mfasta/PM5sigma=1.00gamma=1.00minkowskipca=0_seed4.csv"

df_embedding = read_embeddings(path_embedding, path_annotation, withroot=False)

path_annotation_new = "/home/hugo/Bureau/PoincareMSA/kinases_data/test_add_point_mfasta/kinase_group_new.appended.csv"
path_embedding_new = "/home/hugo/Bureau/PoincareMSA/kinases_data/test_add_point_mfasta/PM5sigma=1.00gamma=1.00minkowskipca=0_seed4.centroid_appended.csv"

df_embedding_new = read_embeddings(path_embedding_new, path_annotation_new, withroot=False)

#Here are different labels found in your annotation file (if one uploaded):
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

19 annotations found: ['proteins_id', '1_Group', '2_Gene', '3_HGNC', '4_Uni_entry', '5_Uni_acc', '6_Domain_begin', '7_Domain_end', '8_Domain_length', '9_Largest_insert_length', '10_PDB_validation', '11_Conformational_state', '12_Dihedral_state', '13_Group_in_Uni', '14_Group_in_Manning', '15_Synonymn', 'evo_distance', 'decile_domain', 'small_cluster'].


In [21]:
# Construction of custom color palette 
kinase_palette = {-1 : "#000000", "OTHER": "#000000", "None" :"#000000", "NA" : "#000000", "Uncharacterized" : "#000000", "root": "#000000",
                  "TYR": "#000000", "CMGC": "#000000", "TKL": "#000000","STE": "#000000", # kinase groups 
                  "CK1": "#000000", "AGC": "#000000", "CAMK":  "#000000", "NEK": "#000000", "RGC":"#000000", "New_point":"#f59a62"}

# Projection visualization

## Visualization

In [22]:
# OPTIONS =================================================
#Here you can set different parameters to color & annotate the resulting projection:
title = "Poincaré projection of the kinase 1 protein family pssm without a new point"  
#----------------------------------------------------------
# Select the coloring from annotation .csv file:
labels_name = "1_Group"
# Select classes to label among the "labels_name" or "second_labels_name" column (comma separated list):
second_labels_name = ""
labels_text = []
show_text = False
#----------------------------------------------------------
# Use a custom color palette:
color_palette = kinase_palette #Default: None
use_custom_palette = True
#==========================================================


#Check projection visualization parameters
#Labels name
if labels_name == "":
    labels_name = None
elif labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
#Second labels name
if second_labels_name == "":
    second_labels_name = None
elif second_labels_name not in annotation_names:
    raise NameError(f'"second_labels_name" {second_labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}')

if not use_custom_palette:
    color_palette = None

#Plot graph
fig = plot_embedding_interactive(df_embedding, 
                                 labels_name = labels_name,
                                 second_labels_name = second_labels_name, 
                                 show_text = show_text,
                                 labels_text = labels_text,
                                 color_palette = color_palette, 
                                 title = title, 
                                 fontsize = 11)
fig.show()

In [23]:
# OPTIONS =================================================
#Here you can set different parameters to color & annotate the resulting projection:
title = "Poincaré projection of the kinase 1 protein family pssm with a new point"  
#----------------------------------------------------------
# Select the coloring from annotation .csv file:
labels_name = "1_Group"
# Select classes to label among the "labels_name" or "second_labels_name" column (comma separated list):
second_labels_name = ""
labels_text = []
show_text = False
#----------------------------------------------------------
# Use a custom color palette:
color_palette = kinase_palette #Default: None
use_custom_palette = True
#==========================================================


#Check projection visualization parameters
#Labels name
if labels_name == "":
    labels_name = None
elif labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
#Second labels name
if second_labels_name == "":
    second_labels_name = None
elif second_labels_name not in annotation_names:
    raise NameError(f'"second_labels_name" {second_labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}')

if not use_custom_palette:
    color_palette = None

#Plot graph
fig = plot_embedding_interactive(df_embedding_new, 
                                 labels_name = labels_name,
                                 second_labels_name = second_labels_name, 
                                 show_text = show_text,
                                 labels_text = labels_text,
                                 color_palette = color_palette, 
                                 title = title, 
                                 fontsize = 11)
fig.show()

In [29]:
# Highlighting: mark only the nearest neighbors (green) and the appended new point.
# Do NOT attempt to mark the original source points (to avoid fragile index/id matching).

import copy

# neighbors_used should be available from the centroid append step
(
neighbor_indices, neighbor_pids) = ([], [])
try:
    neighbor_indices = [int(n.get('index')) for n in neighbors_used if n.get('index') is not None] if 'neighbors_used' in globals() else []
    neighbor_pids = [str(n.get('protein_id')) for n in neighbors_used if n.get('protein_id') is not None] if 'neighbors_used' in globals() else []
except Exception:
    neighbor_indices = []
    neighbor_pids = []

# Prepare highlight column on both embeddings (original and new)
for df_name in ('df_embedding', 'df_embedding_new'):
    if df_name not in globals():
        continue
    df = globals()[df_name]
    # default category
    df['highlight'] = 'Other'
    # mark neighbors by index when available and valid
    if len(neighbor_indices) > 0:
        valid_idx = [i for i in neighbor_indices if i is not None and 0 <= int(i) < len(df)]
        if len(valid_idx) > 0:
            df.loc[df.index.isin(valid_idx), 'highlight'] = 'Neighbor'
    # fallback: mark neighbors by proteins_id if embedding has that column
    if 'proteins_id' in df.columns and len(neighbor_pids) > 0:
        mask2 = df['proteins_id'].astype(str).isin(neighbor_pids)
        df.loc[mask2, 'highlight'] = 'Neighbor'
    # mark new point in the augmented embedding (if present and proteins_id available)
    if df_name == 'df_embedding_new' and 'new_id' in globals() and 'proteins_id' in df.columns:
        df.loc[df['proteins_id'].astype(str) == str(new_id), 'highlight'] = 'New_point'
    # assign back to globals so downstream code sees the column
    globals()[df_name] = df

# Build a palette that includes our categories (neighbors + new point)
palette = copy.deepcopy(kinase_palette) if 'kinase_palette' in globals() else {}
palette.update({'Neighbor': 'red', 'New_point': '#f59a62'})

# Plot without the new point (df_embedding)
if 'df_embedding' in globals():
    fig = plot_embedding_interactive(df_embedding, labels_name='highlight', color_palette=palette, title='Poincaré map (without new point)', fontsize=11)
    fig.show()

# Plot with the new point (df_embedding_new)
if 'df_embedding_new' in globals():
    fig2 = plot_embedding_interactive(df_embedding_new, labels_name='highlight', color_palette=palette, title='Poincaré map (with new point)', fontsize=11)
    fig2.show()

### Save plot to file

In [24]:
# # OPTIONS =================================================
# output_name = "kinases"
# output_format = "html" #Format availables: ["png", "html", "pdf", "svg"]
# #==========================================================


# if output_format != "html":
#     fig.write_image(f"{output_name}.{output_format}", engine="kaleido")
# else:
#     fig.write_html(f"{output_name}.{output_format}")