In [3]:
import torch

In [4]:
import os

In [5]:
import pandas as pd
import numpy as np
import importlib

In [6]:
import utils_clusters

In [1]:
! wget https://zenodo.org/records/7253910/files/data.zip?download=1
! unzip data.zip

/oak/stanford/groups/rbaltman/alptartici/pooling_work/Pool_PaRTI/protein_tasks/EC_class_retrieval


In [7]:
# Directory holding the SwissProt 2021_04 split files (presumably extracted
# from data.zip above — confirm).
base = "data/datasets/SwissProt_2021_04"

# Read the three JSON splits, in the order train / test / valid.
sp_train, sp_test, sp_valid = (
    pd.read_json(split_path)
    for split_path in (
        f"{base}/SwissProt_2021_04_train.json",
        f"{base}/SwissProt_2021_04_test.json",
        f"{base}/SwissProt_2021_04_valid.json",
    )
)

In [8]:
# Pool the three splits into a single frame. ignore_index is not set, so every
# row keeps the index label it had in its source split.
sp_all = pd.concat([sp_train, sp_test, sp_valid])

In [9]:
importlib.reload(utils_clusters)

<module 'utils_clusters' from '/oak/stanford/groups/rbaltman/alptartici/pooling_work/pooling_performances/dimensionality_reduction_analysis/utils_clusters.py'>

## ESM

In [10]:
all_embedded_esm = np.load('pairwise_distances/Pool_PaRTI_cosine_accessions_esm.npy')

In [11]:
# Keep only the proteins whose accession appears in the ESM accession list and
# leave out the raw 'sequence' column.
df_to_embed_esm = sp_all.loc[sp_all['id_uniprot'].isin(all_embedded_esm)].drop(columns=["sequence"])

In [12]:
# NOTE(review): this assignment is overwritten two cells further down by a
# filter on 'layer1', so the value produced here is never the one the analysis
# cells receive. Despite the "_nonenzyme" suffix, the filter KEEPS the rows
# whose EC number is not '0.0.0.0'.
df_to_embed_esm_nonenzyme = df_to_embed_esm[df_to_embed_esm['ec_number'] != '0.0.0.0']

In [13]:
# Derive cumulative EC-hierarchy labels from the dotted 'ec_number' string,
# e.g. '2.7.1.68' -> layer1 '2', layer2 '2.7', layer3 '2.7.1'.
ec_parts = df_to_embed_esm['ec_number'].str.split('.', expand=True)

# The labels below assume four dot-separated fields; fail loudly otherwise.
assert ec_parts.shape[1] == 4, "expected EC numbers with exactly four dot-separated fields"

# assign() builds a new frame instead of mutating in place, and the fourth EC
# field is simply never stored. reset_index(drop=True) renumbers the rows
# 0..n-1 without materialising the old index as a column.
df_to_embed_esm = df_to_embed_esm.assign(
    layer1=ec_parts[0],
    layer2=ec_parts[0] + '.' + ec_parts[1],
    layer3=ec_parts[0] + '.' + ec_parts[1] + '.' + ec_parts[2],
).reset_index(drop=True)

In [14]:
# Rows whose top-level EC class is anything other than '0'. Note the name:
# despite the "_nonenzyme" suffix, the class-'0' rows are the ones removed
# (the notebook's later "keeping only the enzymes" section uses the same filter).
df_to_embed_esm_nonenzyme = df_to_embed_esm.loc[df_to_embed_esm['layer1'].ne('0')]

### Load precomputed pairwise distances and accession lists

#### cosine dist

In [19]:
dists_PP_esm_cosine = np.load('pairwise_distances/Pool_PaRTI_cosine_distances_esm.npy')
accs_PP_esm_cosine = np.load('pairwise_distances/Pool_PaRTI_cosine_accessions_esm.npy')

In [20]:
dists_meanpooled_esm_cosine = np.load('pairwise_distances/mean_pooled_cosine_distances_esm.npy')
accs_meanpooled_esm_cosine = np.load('pairwise_distances/mean_pooled_cosine_accessions_esm.npy')

In [21]:
dists_maxpooled_esm_cosine = np.load('pairwise_distances/max_pooled_cosine_distances_esm.npy')
accs_maxpooled_esm_cosine = np.load('pairwise_distances/max_pooled_cosine_accessions_esm.npy')

In [22]:
dists_clspooled_esm_cosine = np.load('pairwise_distances/cls_pooled_cosine_distances_esm.npy')
accs_clspooled_esm_cosine = np.load('pairwise_distances/cls_pooled_cosine_accessions_esm.npy')

In [23]:
dists_sumpooled_esm_cosine = np.load('pairwise_distances/sum_pooled_cosine_distances_esm.npy')
accs_sumpooled_esm_cosine = np.load('pairwise_distances/sum_pooled_cosine_accessions_esm.npy')

#### euclidean

In [104]:
dists_PP_esm_euc = np.load('pairwise_distances/Pool_PaRTI_euclidean_distances_esm.npy')
accs_PP_esm_euc = np.load('pairwise_distances/Pool_PaRTI_euclidean_accessions_esm.npy')
utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_PP_esm_euc,
    accessions = accs_PP_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.00796543473673672, 'p_value': 0.00475642177110628},
 'mann_whitney_u_test': {'statistic': 3988218644.0,
  'p_value': 6.2405986580233e-24}}

In [105]:
dists_meanpooled_esm_euc = np.load('pairwise_distances/mean_pooled_euclidean_distances_esm.npy')
accs_meanpooled_esm_euc = np.load('pairwise_distances/mean_pooled_euclidean_accessions_esm.npy')

utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_meanpooled_esm_euc,
    accessions = accs_meanpooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.01279888676872809,
  'p_value': 1.0141933344912468e-06},
 'mann_whitney_u_test': {'statistic': 3996092399.5,
  'p_value': 4.909309734704071e-21}}

In [106]:
dists_maxpooled_esm_euc = np.load('pairwise_distances/max_pooled_euclidean_distances_esm.npy')
accs_maxpooled_esm_euc = np.load('pairwise_distances/max_pooled_euclidean_accessions_esm.npy')

utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_maxpooled_esm_euc,
    accessions = accs_maxpooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.004047391980872339, 'p_value': 0.2509283666810662},
 'mann_whitney_u_test': {'statistic': 3917047875.0,
  'p_value': 3.3629633381340017e-59}}

In [107]:
dists_clspooled_esm_euc = np.load('pairwise_distances/cls_pooled_euclidean_distances_esm.npy')
accs_clspooled_esm_euc = np.load('pairwise_distances/cls_pooled_euclidean_accessions_esm.npy')

utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_clspooled_esm_euc,
    accessions = accs_clspooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.018887096167417683,
  'p_value': 8.931261950865888e-14},
 'mann_whitney_u_test': {'statistic': 4064826437.5,
  'p_value': 0.00035401352623130915}}

In [108]:
dists_sumpooled_esm_euc = np.load('pairwise_distances/sum_pooled_euclidean_distances_esm.npy')
accs_sumpooled_esm_euc = np.load('pairwise_distances/sum_pooled_euclidean_accessions_esm.npy')

utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_sumpooled_esm_euc,
    accessions = accs_sumpooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.01013369272427811,
  'p_value': 0.000174459992203357},
 'mann_whitney_u_test': {'statistic': 4128100521.5,
  'p_value': 0.9817947934144263}}

#### cosine

In [114]:
utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_PP_esm_cosine,
    accessions = accs_PP_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.014809082732884393,
  'p_value': 9.469999988848272e-09},
 'mann_whitney_u_test': {'statistic': 4035844854.5,
  'p_value': 1.863173165636062e-09}}

In [115]:
utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_meanpooled_esm_cosine,
    accessions = accs_meanpooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.012067818797151908,
  'p_value': 4.688553915122048e-06},
 'mann_whitney_u_test': {'statistic': 3922638994.0,
  'p_value': 7.785243056013067e-56}}

In [116]:
utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_maxpooled_esm_cosine,
    accessions = accs_maxpooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.0041754787594322496,
  'p_value': 0.22961175968698408},
 'mann_whitney_u_test': {'statistic': 3839247941.5,
  'p_value': 1.5086981320160626e-116}}

In [117]:
utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_clspooled_esm_cosine,
    accessions = accs_clspooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.01890336067363607,
  'p_value': 8.48076574968867e-14},
 'mann_whitney_u_test': {'statistic': 4064242730.5,
  'p_value': 0.00029410458107124234}}

In [118]:
utils_clusters.compute_group_distance_stats(
    dist_matrix = dists_sumpooled_esm_cosine,
    accessions = accs_sumpooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1'
)

{'ks_test': {'statistic': 0.012067818797151908,
  'p_value': 4.688553915122048e-06},
 'mann_whitney_u_test': {'statistic': 3922638940.5,
  'p_value': 7.784674647586789e-56}}

### Retrieval-based evaluation

In [16]:
importlib.reload(utils_clusters)

<module 'utils_clusters' from '/oak/stanford/groups/rbaltman/alptartici/pooling_work/pooling_performances/dimensionality_reduction_analysis/utils_clusters.py'>

In [129]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_PP_esm_euc,
    accessions = accs_PP_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.7029441099543066,
  'std': 0.019982939022180957},
 'precision_at_k': {'mean': 0.4204571428571428, 'std': 0.018280713601933545}}

In [130]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_meanpooled_esm_euc,
    accessions = accs_meanpooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.6893161095349737,
  'std': 0.01903464330755696},
 'precision_at_k': {'mean': 0.40957714285714275, 'std': 0.01811223422362437}}

In [139]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_maxpooled_esm_euc,
    accessions = accs_maxpooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.7156657785538175,
  'std': 0.018330112127201707},
 'precision_at_k': {'mean': 0.42825142857142867, 'std': 0.018428783166004003}}

In [136]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_sumpooled_esm_euc,
    accessions = accs_sumpooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.6496641477676363,
  'std': 0.017843027471030497},
 'precision_at_k': {'mean': 0.3644228571428572, 'std': 0.014733467663376182}}

In [137]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_clspooled_esm_euc,
    accessions = accs_clspooled_esm_euc, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.660883861807517,
  'std': 0.020780751813198217},
 'precision_at_k': {'mean': 0.37394285714285735, 'std': 0.015871730741626483}}

In [24]:
# Retrieval metrics for the Pool_PaRTI ESM cosine distances, labelled by EC
# layer 1.
# NOTE(review): this call uses subsample_time=200, while the other four ESM
# cosine retrieval cells that follow use subsample_time=50 — confirm whether
# the methods are meant to be compared with different numbers of subsamples.
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_PP_esm_cosine,
    accessions = accs_PP_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=200,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.7103917947175593,
  'ste': 0.00042324130165487717},
 'precision_at_k': {'mean': 0.42902857142857137, 'ste': 0.0003982459500543536}}

In [25]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_meanpooled_esm_cosine,
    accessions = accs_meanpooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.701105465624079,
  'ste': 0.00036738779253131027},
 'precision_at_k': {'mean': 0.41994285714285695,
  'ste': 0.00029477961328450646}}

In [26]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_maxpooled_esm_cosine,
    accessions = accs_maxpooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.697639670456865,
  'ste': 0.0003781639807680985},
 'precision_at_k': {'mean': 0.4206285714285714, 'ste': 0.00034584790806104935}}

In [27]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_clspooled_esm_cosine,
    accessions = accs_clspooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.6625415645830992,
  'ste': 0.00038029213637186677},
 'precision_at_k': {'mean': 0.3729828571428574, 'ste': 0.0003436869197038345}}

In [29]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_sumpooled_esm_cosine,
    accessions = accs_sumpooled_esm_cosine, 
    df = df_to_embed_esm_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=50,
    subsample_time=50,
    K=5
)   

{'mean_reciprocal_rank': {'mean': 0.6977423717685973,
  'ste': 0.00038020050547081177},
 'precision_at_k': {'mean': 0.42186285714285693, 'ste': 0.0003426651653001857}}

## protbert

In [38]:
all_embedded_protbert = np.load('pairwise_distances/Pool_PaRTI_cosine_accessions_protbert.npy')

In [39]:
# Keep only the proteins whose accession appears in the ProtBERT accession
# list and leave out the raw 'sequence' column.
df_to_embed_protbert = sp_all.loc[sp_all['id_uniprot'].isin(all_embedded_protbert)].drop(columns=["sequence"])

In [40]:
# Derive cumulative EC-hierarchy labels from the dotted 'ec_number' string,
# e.g. '2.7.1.68' -> layer1 '2', layer2 '2.7', layer3 '2.7.1'.
ec_parts = df_to_embed_protbert['ec_number'].str.split('.', expand=True)

# The labels below assume four dot-separated fields; fail loudly otherwise.
assert ec_parts.shape[1] == 4, "expected EC numbers with exactly four dot-separated fields"

# assign() builds a new frame instead of mutating in place, and the fourth EC
# field is simply never stored. reset_index(drop=True) renumbers the rows
# 0..n-1 without materialising the old index as a column.
df_to_embed_protbert = df_to_embed_protbert.assign(
    layer1=ec_parts[0],
    layer2=ec_parts[0] + '.' + ec_parts[1],
    layer3=ec_parts[0] + '.' + ec_parts[1] + '.' + ec_parts[2],
).reset_index(drop=True)

In [41]:
# Rows whose top-level EC class is anything other than '0'. Note the name:
# despite the "_nonenzyme" suffix, the class-'0' rows are the ones removed.
df_to_embed_protbert_nonenzyme = df_to_embed_protbert.loc[df_to_embed_protbert['layer1'].ne('0')]

In [32]:
dists_PP_protbert_cosine = np.load('pairwise_distances/Pool_PaRTI_cosine_distances_protbert.npy')
accs_PP_protbert_cosine = np.load('pairwise_distances/Pool_PaRTI_cosine_accessions_protbert.npy')

In [33]:
dists_meanpooled_protbert_cosine = np.load('pairwise_distances/mean_pooled_cosine_distances_protbert.npy')
accs_meanpooled_protbert_cosine = np.load('pairwise_distances/mean_pooled_cosine_accessions_protbert.npy')

In [34]:
dists_maxpooled_protbert_cosine = np.load('pairwise_distances/max_pooled_cosine_distances_protbert.npy')
accs_maxpooled_protbert_cosine = np.load('pairwise_distances/max_pooled_cosine_accessions_protbert.npy')

In [35]:
dists_sumpooled_protbert_cosine = np.load('pairwise_distances/sum_pooled_cosine_distances_protbert.npy')
accs_sumpooled_protbert_cosine = np.load('pairwise_distances/sum_pooled_cosine_accessions_protbert.npy')

In [36]:
dists_clspooled_protbert_cosine = np.load('pairwise_distances/cls_pooled_cosine_distances_protbert.npy')
accs_clspooled_protbert_cosine = np.load('pairwise_distances/cls_pooled_cosine_accessions_protbert.npy')

### retrieval

#### cosine

In [65]:
# Settings shared by every ProtBERT retrieval call in this section; they are
# passed straight through to utils_clusters.compute_subsample_metrics.
# (Presumably: items per subsample, number of subsamples, and the cutoff used
# for precision@K — confirm against utils_clusters.)
subsample_size, subsample_time, K = 50, 200, 10

In [79]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_PP_protbert_cosine,
    accessions = accs_PP_protbert_cosine, 
    df = df_to_embed_protbert_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=subsample_size,
    subsample_time=subsample_time,
    K=K
)   

{'mean_reciprocal_rank': {'mean': 0.6252062487665223,
  'ste': 0.00010019664462029776},
 'precision_at_k': {'mean': 0.27629538904899126,
  'ste': 5.4113294734687524e-05}}

In [59]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_meanpooled_protbert_cosine,
    accessions = accs_meanpooled_protbert_cosine, 
    df = df_to_embed_protbert_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=subsample_size,
    subsample_time=subsample_time,
    K=K
)   

{'mean_reciprocal_rank': {'mean': 0.6153853352897548,
  'ste': 0.00040329521026987136},
 'precision_at_k': {'mean': 0.28347550432276647,
  'ste': 0.00019885848859426116}}

In [69]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_maxpooled_protbert_cosine,
    accessions = accs_maxpooled_protbert_cosine, 
    df = df_to_embed_protbert_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=subsample_size,
    subsample_time=subsample_time,
    K=K
)   

{'mean_reciprocal_rank': {'mean': 0.6039279029547712,
  'ste': 0.00011207994938601233},
 'precision_at_k': {'mean': 0.26834726224783856, 'ste': 4.830997117093212e-05}}

In [73]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_sumpooled_protbert_cosine,
    accessions = accs_sumpooled_protbert_cosine, 
    df = df_to_embed_protbert_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=subsample_size,
    subsample_time=subsample_time,
    K=K
)   

{'mean_reciprocal_rank': {'mean': 0.6210894169292724,
  'ste': 9.144541243243598e-05},
 'precision_at_k': {'mean': 0.2849322766570605, 'ste': 5.829868213116433e-05}}

In [83]:
# NOTE(review): this cell repeats the sum-pooled ProtBERT call of the cell
# directly above with identical arguments. The two recorded outputs differ
# slightly, so the metric is evidently not deterministic from run to run —
# consider fixing a random seed (if utils_clusters allows it) and keeping only
# one of the two cells.
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_sumpooled_protbert_cosine,
    accessions = accs_sumpooled_protbert_cosine, 
    df = df_to_embed_protbert_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=subsample_size,
    subsample_time=subsample_time,
    K=K
)   

{'mean_reciprocal_rank': {'mean': 0.6194674124375864,
  'ste': 9.790745706374098e-05},
 'precision_at_k': {'mean': 0.28323775216138325,
  'ste': 5.1058235045332425e-05}}

In [70]:
utils_clusters.compute_subsample_metrics(
    dist_matrix = dists_clspooled_protbert_cosine,
    accessions = accs_clspooled_protbert_cosine, 
    df = df_to_embed_protbert_nonenzyme, 
    accession_col = 'id_uniprot', 
    label_col = 'layer1',
    subsample_size=subsample_size,
    subsample_time=subsample_time,
    K=K
)   

{'mean_reciprocal_rank': {'mean': 0.5685067046282655,
  'ste': 0.000102750991454458},
 'precision_at_k': {'mean': 0.2383242074927954, 'ste': 5.1103291745027234e-05}}

## list to embed

In [13]:
## protbert

In [7]:
PLM = "protbert"
pooling = "Pool_PaRTI"
# Directory with one post-pooling vector file per protein for this PLM/pooling pair.
path_post_pool = f"/oak/stanford/groups/rbaltman/{PLM}_embeddings/post_pooling_seq_vectors/{pooling}"
# Strip the last three characters of every entry name to recover the accession
# (assumes a three-character suffix such as '.pt' — TODO confirm).
all_embedded_protbert = [entry[:-3] for entry in os.listdir(path_post_pool)]

In [8]:
# Restrict the pooled SwissProt table to accessions that have a ProtBERT
# embedding on disk, leaving out the raw 'sequence' column, then display it.
df_to_embed_protbert = sp_all.loc[sp_all['id_uniprot'].isin(all_embedded_protbert)].drop(columns=["sequence"])
df_to_embed_protbert

Unnamed: 0,id_uniprot,ec_number
109,P14060,0.0.0.0
114,P26439,0.0.0.0
144,A2CKF6,0.0.0.0
158,P82935,0.0.0.0
162,P60302,0.0.0.0
...,...,...
474919,P56556,0.0.0.0
57794,Q5W041,0.0.0.0
301902,Q96C03,0.0.0.0
478896,O70161,2.7.1.68


In [9]:
# Derive cumulative EC-hierarchy labels from the dotted 'ec_number' string,
# e.g. '2.7.1.68' -> layer1 '2', layer2 '2.7', layer3 '2.7.1'.
ec_parts = df_to_embed_protbert['ec_number'].str.split('.', expand=True)

# The labels below assume four dot-separated fields; fail loudly otherwise.
assert ec_parts.shape[1] == 4, "expected EC numbers with exactly four dot-separated fields"

# assign() builds a new frame instead of mutating in place, and the fourth EC
# field is simply never stored. reset_index(drop=True) renumbers the rows
# 0..n-1 without materialising the old index as a column.
df_to_embed_protbert = df_to_embed_protbert.assign(
    layer1=ec_parts[0],
    layer2=ec_parts[0] + '.' + ec_parts[1],
    layer3=ec_parts[0] + '.' + ec_parts[1] + '.' + ec_parts[2],
).reset_index(drop=True)

## ESM

In [36]:
PLM = "esm"
pooling = "Pool_PaRTI"
# NOTE(review): PLM is not interpolated into this path; the ESM model
# directory is spelled out in full, so changing PLM has no effect here.
path_post_pool = f"/oak/stanford/groups/rbaltman/esm_embeddings/esm2_t33_650M_uniprot/post_pooling_seq_vectors/{pooling}"
# Strip the last three characters of every entry name to recover the accession
# (assumes a three-character suffix such as '.pt' — TODO confirm).
all_embedded_esm = [entry[:-3] for entry in os.listdir(path_post_pool)]

In [37]:
# Restrict the pooled SwissProt table to accessions that have an ESM embedding
# on disk, leaving out the raw 'sequence' column.
df_to_embed_esm = sp_all.loc[sp_all['id_uniprot'].isin(all_embedded_esm)].drop(columns=["sequence"])

In [13]:
# Derive cumulative EC-hierarchy labels from the dotted 'ec_number' string,
# e.g. '2.7.1.68' -> layer1 '2', layer2 '2.7', layer3 '2.7.1'.
ec_parts = df_to_embed_esm['ec_number'].str.split('.', expand=True)

# The labels below assume four dot-separated fields; fail loudly otherwise.
assert ec_parts.shape[1] == 4, "expected EC numbers with exactly four dot-separated fields"

# assign() builds a new frame instead of mutating in place, and the fourth EC
# field is simply never stored. reset_index(drop=True) renumbers the rows
# 0..n-1 without materialising the old index as a column.
df_to_embed_esm = df_to_embed_esm.assign(
    layer1=ec_parts[0],
    layer2=ec_parts[0] + '.' + ec_parts[1],
    layer3=ec_parts[0] + '.' + ec_parts[1] + '.' + ec_parts[2],
).reset_index(drop=True)

In [28]:
df_to_embed_esm['layer1'].value_counts()

layer1
0    15605
2     1697
3     1197
1      448
5      137
4      120
6      108
7       59
Name: count, dtype: int64

## performance analysis

In [16]:
from utils_clusters import evaluate_embeddings_for_dist_ratio

## Trying with ProtBERT

In [20]:
# Evaluate the Pool_PaRTI ProtBERT embeddings against the EC layer-1 labels in
# sampling mode.
# NOTE(review): "euclidian" is not the usual spelling ("euclidean") — confirm
# which string evaluate_embeddings_for_dist_ratio actually matches on, since a
# mismatch could silently select a different metric.
metrics_l1_prmaxmax_euc = evaluate_embeddings_for_dist_ratio(df_to_embed_protbert, 'id_uniprot', 'layer1', 'protbert', 'Pool_PaRTI', sampling_mode=True, distance_metric= "euclidian")

                                                                                

In [98]:
# NOTE(review): neither `evaluate_embeddings` nor `df_to_embed` is imported or
# assigned in the visible cells (only `evaluate_embeddings_for_dist_ratio` and
# the `df_to_embed_<PLM>` frames are), so this call — and the similar
# `evaluate_embeddings(df_to_embed, ...)` calls after it — would raise
# NameError on a fresh kernel unless those names are restored. The recorded
# output presumably comes from an earlier kernel session.
metrics_l1_mean_euc = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer1', 'protbert', 'mean_pooled')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [02:08<00:00, 149.09it/s]


Calculating Silhouette Score...
Silhouette Score: -0.1150885596871376
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.007796364261525336
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0001778454193104412
Weighted Inter-class Mean Distance: 0.0024329938766973038


In [99]:
metrics_l1_max = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer1', 'protbert', 'max_pooled')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [00:17<00:00, 1107.63it/s]


Calculating Silhouette Score...
Silhouette Score: -0.07282625883817673
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: -0.003125052867028021
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.000311986151480819
Weighted Inter-class Mean Distance: 0.004488251936060317


In [100]:
metrics_l1_cls = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer1', 'protbert', 'cls_pooled')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [09:33<00:00, 33.52it/s]


Calculating Silhouette Score...
Silhouette Score: -0.14525048434734344
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: -0.0026108045793982555
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.00015752627007356094
Weighted Inter-class Mean Distance: 0.0021668486004640885


In [101]:
metrics_l2_prmaxmax = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer2', 'protbert', 'PR_max_of_max_attn_no_prune')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [07:30<00:00, 42.69it/s]


Calculating Silhouette Score...
Silhouette Score: -0.3025088114148178
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: -0.0003484344281414171
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.00017623176967503164
Weighted Inter-class Mean Distance: 0.01775794779808351


In [102]:
metrics_l2_mean = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer2', 'protbert', 'mean_pooled')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [00:16<00:00, 1134.41it/s]


Calculating Silhouette Score...
Silhouette Score: -0.2876802384853363
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.0007331815698318647
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0001787787582851825
Weighted Inter-class Mean Distance: 0.018223239045291825


In [103]:
metrics_l2_max = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer2', 'protbert', 'max_pooled')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [00:18<00:00, 1036.09it/s]


Calculating Silhouette Score...
Silhouette Score: -0.28426605463027954
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.0005392202852170829
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.00031365290302227294
Weighted Inter-class Mean Distance: 0.0338204741864123


In [104]:
metrics_l2_cls = evaluate_embeddings(df_to_embed, 'id_uniprot', 'layer2', 'protbert', 'cls_pooled')

Processing UniProt Accessions: 100%|██████████| 19213/19213 [00:17<00:00, 1104.05it/s]


Calculating Silhouette Score...
Silhouette Score: -0.4940852224826813
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: -0.003695595226178046
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.00015851444531434023
Weighted Inter-class Mean Distance: 0.016412951256405493


### keeping only the enzymes

In [25]:
# Build the enzyme-only frame BEFORE inspecting it: the value_counts() call
# used to sit in a cell ahead of this definition and failed with a NameError
# on a fresh kernel. Rows whose top-level EC class is '0' are dropped and the
# remaining rows are renumbered 0..n-1 (reset_index(drop=True) replaces the
# former reset_index() followed by an in-place drop of the 'index' column).
# NOTE(review): `df_to_embed` is not defined in the visible cells — presumably
# one of the df_to_embed_<PLM> frames; confirm which.
df_to_embed_enzymes = df_to_embed[df_to_embed['layer1'] != '0'].reset_index(drop=True)

# Class balance across the remaining top-level EC classes.
df_to_embed_enzymes['layer1'].value_counts()

In [126]:
metrics_l1_pp_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer1', 'protbert', 'PR_max_of_max_attn_no_prune')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:55<00:00, 67.21it/s]  


Calculating Silhouette Score...
Silhouette Score: -0.10608668079179788
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.0020637244486957176
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.001566821876271904
Weighted Inter-class Mean Distance: 0.004480832477092202


In [137]:
dist_rat_l1_pp_enz = metrics_l1_pp_enz['Weighted Inter-class Mean Distance']/metrics_l1_pp_enz['Weighted Intra-class Mean Distance']
dist_rat_l1_pp_enz

2.8598225139375097

In [127]:
metrics_l1_mean_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer1', 'protbert', 'mean_pooled')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [02:31<00:00, 24.42it/s]


Calculating Silhouette Score...
Silhouette Score: -0.10003482550382614
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.003835782351043866
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0016223856659628927
Weighted Inter-class Mean Distance: 0.00466719811169207


In [138]:
dist_rat_l1_mean_enz = metrics_l1_mean_enz['Weighted Inter-class Mean Distance']/metrics_l1_mean_enz['Weighted Intra-class Mean Distance']
dist_rat_l1_mean_enz

2.8767500906894834

In [128]:
metrics_l1_max_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer1', 'protbert', 'max_pooled')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [02:22<00:00, 26.01it/s]


Calculating Silhouette Score...
Silhouette Score: -0.07811427116394043
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.009102623737933373
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.002971625962658941
Weighted Inter-class Mean Distance: 0.008746740850246213


In [129]:
metrics_l1_cls_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer1', 'protbert', 'cls_pooled')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [02:14<00:00, 27.42it/s]


Calculating Silhouette Score...
Silhouette Score: -0.12770624458789825
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.010677619740417083
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0015396620516485574
Weighted Inter-class Mean Distance: 0.004580777054813423


#### layer 2

In [130]:
metrics_l2_pp_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer2', 'protbert', 'PR_max_of_max_attn_no_prune')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 1009.80it/s]


Calculating Silhouette Score...
Silhouette Score: -0.287084608153293
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.01960865959208915
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.004987454157133807
Weighted Inter-class Mean Distance: 0.02971157340733909


In [139]:
dist_rat_l2_pp_enz = metrics_l2_pp_enz['Weighted Inter-class Mean Distance']/metrics_l2_pp_enz['Weighted Intra-class Mean Distance']
dist_rat_l2_pp_enz

5.957262457207978

In [131]:
metrics_l2_mean_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer2', 'protbert', 'mean_pooled')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 1017.98it/s]


Calculating Silhouette Score...
Silhouette Score: -0.278257817029953
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.018933619643764978
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.005172832759429458
Weighted Inter-class Mean Distance: 0.030798427351466717


In [140]:
dist_rat_l2_mean_enz = metrics_l2_mean_enz['Weighted Inter-class Mean Distance']/metrics_l2_mean_enz['Weighted Intra-class Mean Distance']
dist_rat_l2_mean_enz

5.953880356043766

In [132]:
metrics_l2_max_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer2', 'protbert', 'max_pooled')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 1073.08it/s]


Calculating Silhouette Score...
Silhouette Score: -0.22162757813930511
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.01630447247220074
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.00949911947475776
Weighted Inter-class Mean Distance: 0.058191262052094635


In [153]:
dist_rat_l2_max_enz = metrics_l2_max_enz['Weighted Inter-class Mean Distance']/metrics_l2_max_enz['Weighted Intra-class Mean Distance']
dist_rat_l2_max_enz

6.125963801879499

In [134]:
metrics_l2_cls_enz = evaluate_embeddings(df_to_embed_enzymes, 'id_uniprot', 'layer2', 'protbert', 'cls_pooled')

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 993.51it/s] 


Calculating Silhouette Score...
Silhouette Score: -0.5086291432380676
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.014638635798983788
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0049541699029006896
Weighted Inter-class Mean Distance: 0.03043317215948624


In [154]:
dist_rat_l2_cls_enz = metrics_l2_cls_enz['Weighted Inter-class Mean Distance']/metrics_l2_cls_enz['Weighted Intra-class Mean Distance']
dist_rat_l2_cls_enz

6.142940746070796

## Repeating the evaluation with ESM embeddings

#### layer 1

In [150]:
# 'esm' embeddings pooled with 'PR_max_of_max_attn_no_prune', scored against
# the 'layer1' labels; the cell displays the inter-/intra-class distance ratio.
metrics_esm_l1_prmaxmax_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer1',
    'esm',
    'PR_max_of_max_attn_no_prune',
)
dist_rat_esm_l1_prmaxmax_enz = (
    metrics_esm_l1_prmaxmax_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l1_prmaxmax_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l1_prmaxmax_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 1001.68it/s]


Calculating Silhouette Score...
Silhouette Score: -0.019041755610295624
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.006732600723152995
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0029328047556619052
Weighted Inter-class Mean Distance: 0.008644180244919038


2.947410743327212

In [151]:
# 'esm' embeddings pooled with 'mean_pooled', scored against the 'layer1'
# labels; the cell displays the inter-/intra-class distance ratio.
metrics_esm_l1_mean_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer1',
    'esm',
    'mean_pooled',
)
dist_rat_esm_l1_mean_enz = (
    metrics_esm_l1_mean_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l1_mean_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l1_mean_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [02:23<00:00, 25.86it/s]


Calculating Silhouette Score...
Silhouette Score: -0.02154502645134926
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.013298503245345988
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0028980607993513334
Weighted Inter-class Mean Distance: 0.008530645984820841


2.943570399465131

In [160]:
# 'esm' embeddings pooled with 'max_pooled', scored against the 'layer1' labels.
# NOTE(review): the recorded output of this cell is a ValueError ("inhomogeneous
# shape", 3466 embeddings) raised during the call, so neither variable below was
# assigned in that run. Presumably at least one stored ESM max-pooled embedding
# has a different length -- verify inside evaluate_embeddings.
metrics_esm_l1_max_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer1',
    'esm',
    'max_pooled',
)
dist_rat_esm_l1_max_enz = (
    metrics_esm_l1_max_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l1_max_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l1_max_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 993.03it/s] 


len(embeddings) 3466
len(embeddings[-1]) 1280
len(embeddings[-2]) 1280
Calculating Silhouette Score...


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3466,) + inhomogeneous part.

In [161]:
# 'esm' embeddings pooled with 'cls_pooled', scored against the 'layer1'
# labels; the cell displays the inter-/intra-class distance ratio.
metrics_esm_l1_cls_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer1',
    'esm',
    'cls_pooled',
)
dist_rat_esm_l1_cls_enz = (
    metrics_esm_l1_cls_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l1_cls_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l1_cls_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [02:26<00:00, 25.18it/s]


Calculating Silhouette Score...
Silhouette Score: -0.015488059259951115
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.031903513645028414
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.0015022047096274658
Weighted Inter-class Mean Distance: 0.004368458245314616


2.908031253874818

#### layer 2

In [162]:
# 'esm' embeddings pooled with 'PR_max_of_max_attn_no_prune', scored against
# the 'layer2' labels; the cell displays the inter-/intra-class distance ratio.
metrics_esm_l2_prmaxmax_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer2',
    'esm',
    'PR_max_of_max_attn_no_prune',
)
dist_rat_esm_l2_prmaxmax_enz = (
    metrics_esm_l2_prmaxmax_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l2_prmaxmax_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l2_prmaxmax_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [01:10<00:00, 52.41it/s]


Calculating Silhouette Score...
Silhouette Score: -0.21647587586695907
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.0467300139769061
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.009282599005163216
Weighted Inter-class Mean Distance: 0.05769846416039439


6.215766093989522

In [163]:
# 'esm' embeddings pooled with 'mean_pooled', scored against the 'layer2'
# labels; the cell displays the inter-/intra-class distance ratio.
metrics_esm_l2_mean_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer2',
    'esm',
    'mean_pooled',
)
dist_rat_esm_l2_mean_enz = (
    metrics_esm_l2_mean_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l2_mean_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l2_mean_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [01:00<00:00, 61.07it/s]


Calculating Silhouette Score...
Silhouette Score: -0.2386271059513092
Clustering embeddings with K-means...




Calculating Adjusted Rand Index (ARI)...
Adjusted Rand Index: 0.04893139457065701
Calculating Pairwise Distances...
Calculating Weighted Mean Distances...
Weighted Intra-class Mean Distance: 0.009241536606187937
Weighted Inter-class Mean Distance: 0.05676108437827059


6.141953096876182

In [164]:
# 'esm' embeddings pooled with 'max_pooled', scored against the 'layer2' labels.
# NOTE(review): the recorded output of this cell is a ValueError ("inhomogeneous
# shape", 3466 embeddings) raised during the call, so neither variable below was
# assigned in that run. Presumably at least one stored ESM max-pooled embedding
# has a different length -- verify inside evaluate_embeddings.
metrics_esm_l2_max_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer2',
    'esm',
    'max_pooled',
)
dist_rat_esm_l2_max_enz = (
    metrics_esm_l2_max_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l2_max_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l2_max_enz

Processing UniProt Accessions: 100%|██████████| 3700/3700 [00:03<00:00, 1017.15it/s]


len(embeddings) 3466
len(embeddings[-1]) 1280
len(embeddings[-2]) 1280
Calculating Silhouette Score...


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (3466,) + inhomogeneous part.

In [1]:
# 'esm' embeddings pooled with 'cls_pooled', scored against the 'layer2' labels.
# NOTE(review): the recorded run of this cell (execution count 1) failed with
# "NameError: name 'evaluate_embeddings' is not defined", i.e. it was executed
# before whatever defines evaluate_embeddings and df_to_embed_enzymes. Re-run
# the defining cells first; where evaluate_embeddings comes from is not visible
# in this part of the notebook -- TODO confirm.
metrics_esm_l2_cls_enz = evaluate_embeddings(
    df_to_embed_enzymes,
    'id_uniprot',
    'layer2',
    'esm',
    'cls_pooled',
)
dist_rat_esm_l2_cls_enz = (
    metrics_esm_l2_cls_enz['Weighted Inter-class Mean Distance']
    / metrics_esm_l2_cls_enz['Weighted Intra-class Mean Distance']
)
dist_rat_esm_l2_cls_enz

NameError: name 'evaluate_embeddings' is not defined

## All combinations in a single loop

In [43]:
# Axes of the looped evaluation below. Every combination of model, label
# level, distance metric and pooling method is evaluated once.

# Column holding the UniProt accession of each protein.
uniprot_col = 'id_uniprot'

# Embedding models, passed on as plm_type.
# To restrict the sweep to ESM only, set PLMs = ['esm'].
PLMs = [
    'protbert',
    'esm',
]

# Label columns to evaluate against.
levels = [
    'layer1',
    'layer2',
]

# Distance metrics, passed on as distance_metric.
metrics = [
    "euclidean",
    "cosine",
]

# Pooling methods, passed on as pooling_method.
pooling_methods = [
    "PR_max_of_max_attn_no_prune",
    "mean_pooled",
    "max_pooled",
    "cls_pooled",
]

### with just enzymes

In [23]:
# Enzyme-only ProtBERT frame: keep rows whose top-level EC class is not '0'.
# drop=True discards the old row labels instead of inserting them as a
# spurious 'index' column, giving a clean 0..n-1 index.
df_to_embed_protbert_enz = (
    df_to_embed_protbert[df_to_embed_protbert['layer1'] != '0']
    .reset_index(drop=True)
)

In [24]:
# Enzyme-only ESM frame: keep rows whose top-level EC class is not '0'.
# drop=True discards the old row labels instead of inserting them as a
# spurious 'index' column, giving a clean 0..n-1 index.
df_to_embed_esm_enz = (
    df_to_embed_esm[df_to_embed_esm['layer1'] != '0']
    .reset_index(drop=True)
)

In [None]:
# Sweep every (PLM, label level, distance metric, pooling method) combination on
# the enzyme-only frames and write one tab-separated row per combination:
#   PLM, level, metric, pooling, "<mean> +/- <std> &<ratios>"
num_iter = 20
filename = f"EC_subsampled_distance_ratios_enzymes_only_in_level1_{num_iter}iterations.txt"
# For a quick smoke test, write to 'test_esm.txt' instead.

# open(..., "w") does not create missing directories, so make sure the target
# directory exists before starting the (long-running) sweep.
os.makedirs("output", exist_ok=True)

with open(f"output/{filename}", "w") as f:
    for PLM in PLMs:
        if PLM == 'esm':
            df_to_use = df_to_embed_esm_enz
        else:
            df_to_use = df_to_embed_protbert_enz
        # Floor-divide by the iteration count first, then double. This is the
        # same value as the original int(len(df)//num_iter*2), with the
        # precedence made explicit and the redundant int() removed.
        sample_size = (len(df_to_use) // num_iter) * 2
        for level in levels:
            for metric in metrics:
                for pooling in pooling_methods:
                    result = utils_clusters.evaluate_embeddings_for_dist_ratio(df_to_use,
                                                  uniprot_col = uniprot_col,
                                                  label_col = level,
                                                  plm_type = PLM,
                                                  pooling_method = pooling,
                                                  distance_metric=metric,
                                                  verbose=False,
                                                  sampling_mode = True,
                                                  num_iterations = num_iter,
                                                  sample_size = sample_size,
                                                  min_samples_per_class=5
                                                 )
                    mean = result["ratio_of_inter_to_intra_mean"]
                    std = result["ratio_of_inter_to_intra_std"]
                    ratios = result["ratios"]
                    f.write(f"{PLM}\t{level}\t{metric}\t{pooling}\t{mean} +/- {std} &{ratios}\n")
                    # Push each finished row to disk so an interrupted or
                    # failing sweep still leaves the completed results behind.
                    f.flush()

Processing UniProt Accessions:  39%|███▉      | 145/371 [00:01<00:02, 84.61it/s]  

In [41]:
# Display the enzyme-only ESM frame (rows of df_to_embed_esm with layer1 != '0').
df_to_embed_esm_enz

Unnamed: 0,id_uniprot,ec_number,layer1,layer2,layer3
22,Q13131,2.7.11.1,2,2.7,2.7.11
23,Q5EG47,2.7.11.1,2,2.7,2.7.11
24,P54646,2.7.11.1,2,2.7,2.7.11
25,Q8BRK8,2.7.11.1,2,2.7,2.7.11
27,P08910,3.1.1.23,3,3.1,3.1.1
...,...,...,...,...,...
19335,Q8NEG5,2.3.2.27,2,2.3,2.3.2
19338,P90897,3.6.4.13,3,3.6,3.6.4
19346,Q4KMD7,2.7.7.19,2,2.7,2.7.7
19350,Q8GYW0,3.4.19.12,3,3.4,3.4.19
