In [38]:
# Update working directory
# NOTE(review): this is an absolute, machine-specific path. The relative paths
# used further down (e.g. "examples/globins/...", "results/globins/...") are
# resolved against it, so it has to point at your local PoincareMSA checkout.

%cd /home/hugo/Bureau/PoincareMSA/

/home/hugo/Bureau/PoincareMSA


In [39]:
#Load dependencies
import os
import numpy as np
import pandas as pd
import subprocess
import json
import warnings
warnings.filterwarnings('ignore')

#Import visualization functions
from scripts.visualize_projection.pplots_new import read_embeddings, plot_embedding, plot_embedding_interactive, rotate, get_colors
from scripts.prepare_data.mmseqs2_api import run_mmseqs2
from scripts.prepare_data.uniprot_idmapping_api import submit_id_mapping, check_id_mapping_results_ready, get_id_mapping_results_link, get_id_mapping_results_search
%matplotlib inline
from matplotlib import pyplot as plt

#Create optional variables
# Placeholder value only: path_annotation is reassigned in a later cell before
# it is passed to read_embeddings.
path_annotation = ""

In [40]:
# POINCARE PARAMETERS ====================================
# Here you control different parameters of Poincaré maps.
# In our computational experiments the best results were achieved for the following values provided by default.
# The impact of different parameters is analyzed in the original paper [1].
# NOTE: knn, sigma, gamma, distance, cospca and seed are interpolated into the
# embedding CSV file name built further down, so they select which result
# file gets read.
knn = 5
gamma = 2
sigma = 1
cospca = 0
batchs = 4
epochs = 1000
seed = 4
distance = "minkowski"  #"minkowski"  "cosine"
#==========================================================

In [41]:
# Annotation CSV to load (relative path).
path_annotation_csv = "examples/globins/globin_colors_new.csv" 
# Initial annotation path; the loading cell that follows replaces it with
# path_annotation_csv when that file exists.
path_annotation = "auto_annot.csv"

In [42]:

# Load the annotation table (when a CSV path is configured) and list its columns.
#
# Both names start from a known-empty value so that an unset path or a missing
# file cannot leave them undefined (or holding values from a previous run) for
# the cells that read them afterwards.
df_annotation = None
annotation_names = []

if path_annotation_csv:
    if os.path.isfile(path_annotation_csv):
        try:
            df_annotation = pd.read_csv(path_annotation_csv)
        except Exception as err:
            # Still surfaces as ValueError with the same message, but the
            # underlying pandas error is kept as the cause, and
            # KeyboardInterrupt/SystemExit are no longer intercepted.
            raise ValueError("Annotation file is not in .csv format.") from err

        #Add id column
        # NOTE(review): the column is only added to the in-memory DataFrame;
        # path_annotation below still points at the unmodified file on disk.
        if "proteins_id" not in df_annotation.columns:
            df_annotation.insert(0, "proteins_id", range(len(df_annotation)))
        path_annotation = path_annotation_csv

        print("\nAnnotation file correctly loaded.")
        annotation_names = list(df_annotation.columns)
        print(f"{len(annotation_names)} annotations found: {annotation_names}.")
    else:
        print(f"File {path_annotation_csv} not found.")


Annotation file correctly loaded.
17 annotations found: ['proteins_id', 'tree1', 'tree2', 'tree3', 'tree4', 'full_name', 'short_name', 'full_species', 'short_species', 'evo_distance', 'Color_species', 'Domain', 'Kingdom', 'Phylum', 'Subphylum', 'Class', 'Genus'].


In [43]:
# Result directories, one per run; each is used further down as the directory
# part of the embedding CSV path that gets loaded.
# NOTE(review): the "RFA_matrix/..." and "results/..." trees presumably hold two
# variants of the same three runs (plm, aae_plm, mfasta) — confirm.
out_name_results_plm_matrix = 'RFA_matrix/globins/with_plm_embeddings/results/'
out_name_results_plm = 'results/globins/with_plm_embeddings/'
out_name_results_plm_aae_matrix = 'RFA_matrix/globins/with_aae_plm_embeddings/results/'
out_name_results_plm_aae = 'results/globins/with_aae_plm_embeddings/'
out_name_results_pssm_matrix = 'RFA_matrix/globins/with_mfasta/results/'
out_name_results_pssm = 'results/globins/with_mfasta/'

In [44]:
# Column names found in the annotation file (if one was loaded); any of them
# can be used as labels_name / second_labels_name in the plotting cell.
print(f"{len(annotation_names)} annotations found: {annotation_names}.")

17 annotations found: ['proteins_id', 'tree1', 'tree2', 'tree3', 'tree4', 'full_name', 'short_name', 'full_species', 'short_species', 'evo_distance', 'Color_species', 'Domain', 'Kingdom', 'Phylum', 'Subphylum', 'Class', 'Genus'].


In [45]:
# Every run writes its embedding under the same file name, which encodes the
# Poincaré parameters chosen above. Build that name once instead of repeating
# the template for each run.
embedding_filename = (
    f"PM{knn:1.0f}sigma={sigma:2.2f}gamma={gamma:2.2f}{distance}"
    f"pca={cospca:1.0f}_seed{seed:1.0f}.csv"
)

# os.path.join avoids the doubled "/" that plain concatenation produced, since
# the result directories already end with a separator.
path_embedding_plm_matrix = os.path.join(out_name_results_plm_matrix, embedding_filename)
path_embedding_plm = os.path.join(out_name_results_plm, embedding_filename)
path_embedding_plm_aae_matrix = os.path.join(out_name_results_plm_aae_matrix, embedding_filename)
path_embedding_plm_aae = os.path.join(out_name_results_plm_aae, embedding_filename)
path_embedding_pssm_matrix = os.path.join(out_name_results_pssm_matrix, embedding_filename)
path_embedding_pssm = os.path.join(out_name_results_pssm, embedding_filename)

In [46]:
def _load_embedding(path_embedding):
    """Read one embedding CSV together with the current annotation file, without root."""
    return read_embeddings(path_embedding, path_annotation, withroot=False)


# One DataFrame per run, loaded in the same order as the paths were built.
df_embedding_plm_matrix = _load_embedding(path_embedding_plm_matrix)
df_embedding_plm = _load_embedding(path_embedding_plm)
df_embedding_plm_aae_matrix = _load_embedding(path_embedding_plm_aae_matrix)
df_embedding_plm_aae = _load_embedding(path_embedding_plm_aae)
df_embedding_pssm_matrix = _load_embedding(path_embedding_pssm_matrix)
df_embedding_pssm = _load_embedding(path_embedding_pssm)

In [47]:
# Custom color palette: one hex color per label.
globin_palette = {
    'Echinodermata': '#086b75',
    'Arthropoda': '#0b237c',
    'Mollusca': '#512ff8',
    'Annelida': '#a191f3',
    'Chordata': '#26c9d9',
    'Cnidaria': '#ad288b',
    'Porifera': '#fdb7fd',
    'Placozoa': '#e9bd6b',
    'Bacteria': '#f10000',
    'Nematoda': '#5d78e3',
    'Hemichordata': '#b0ffe8',
    'Fungi': '#a0e361',
    'Viridiplantae': '#4d9b03',
}

In [48]:
# The six embedding DataFrames to display, and one title per DataFrame
# (both lists are in the same order).
dfs = [
    df_embedding_plm_matrix,
    df_embedding_plm,
    df_embedding_plm_aae_matrix,
    df_embedding_plm_aae,
    df_embedding_pssm_matrix,
    df_embedding_pssm,
]
titles = [
    "df_embedding_plm_matrix",
    "df_embedding_plm",
    "df_embedding_plm_aae_matrix",
    "df_embedding_plm_aae",
    "df_embedding_pssm_matrix",
    "df_embedding_pssm"
]

# Select the coloring from annotation .csv file:
labels_name = "Color_species"
# Optional second annotation column; leave "" to disable it:
second_labels_name = ""
# Classes to label, taken from the "labels_name" or "second_labels_name"
# column, given as a Python list of strings:
labels_text = []
show_text = False
#----------------------------------------------------------
# Use a custom color palette (None when use_custom_palette is False):
use_custom_palette = True
color_palette = globin_palette if use_custom_palette else None


#Check projection visualization parameters
#Labels name: "" means "no labels"; anything else must be an annotation column
if labels_name == "":
    labels_name = None
elif labels_name not in annotation_names:
    raise NameError(f"labels_name {labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}")
#Second labels name: same rule as labels_name
if second_labels_name == "":
    second_labels_name = None
elif second_labels_name not in annotation_names:
    raise NameError(f'"second_labels_name" {second_labels_name} is not in the availables annotations.\nAvailables annotations: {annotation_names}')

# zip() stops silently at the shorter list, so check the pairing up front and
# fail loudly if a DataFrame or a title was added without its counterpart.
if len(dfs) != len(titles):
    raise ValueError(f"dfs and titles must have the same length ({len(dfs)} != {len(titles)}).")

# Sequential display: each DataFrame gets its own figure, shown as soon as it is built.
for df, title in zip(dfs, titles):
    fig_i = plot_embedding_interactive(df,
                                 labels_name = labels_name,
                                 second_labels_name = second_labels_name,
                                 show_text = show_text,
                                 labels_text = labels_text,
                                 color_palette = color_palette,
                                 title = title,
                                 fontsize = 11)
    fig_i.show()  # Display the figure immediately
