In [40]:
import sys
import os
import json
import torch

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix, save_npz, load_npz
from scipy.stats import mannwhitneyu

In [41]:
interpro_annotations_nonzero = pd.read_csv("metadata/interpro_entry_list_mapping_nonzero.csv")
ptn_fam_nonzero_tensor = torch.load("metadata/ptn_fam_tensor_nonzero.pt")

latents_family_effect_size = torch.load("latents_family_effect_size_circuit.pt")
latents_family_pvals = torch.load("latents_family_pvals_circuit.pt")

In [42]:
print(f"Interpro Annotations: {interpro_annotations_nonzero.shape}")
print(f"ptn_fam_nonzero_tensor: {ptn_fam_nonzero_tensor.shape}")
print(f"Effect sizes: {latents_family_effect_size.shape}")
print(f"MWU pvals: {latents_family_pvals.shape}")

Interpro Annotations: (10096, 3)
ptn_fam_nonzero_tensor: torch.Size([10000, 10096])
Effect sizes: torch.Size([383, 10096, 4])
MWU pvals: torch.Size([383, 10096])


In [43]:
interpro_annotations_nonzero.head()

Unnamed: 0,ENTRY_AC,ENTRY_TYPE,ENTRY_NAME
0,IPR000126,Active_site,"Serine proteases, V8 family, serine active site"
1,IPR000169,Active_site,"Cysteine peptidase, cysteine active site"
2,IPR000189,Active_site,"Prokaryotic transglycosylase, active site"
3,IPR001252,Active_site,"Malate dehydrogenase, active site"
4,IPR001345,Active_site,"Phosphoglycerate/bisphosphoglycerate mutase, a..."


In [65]:
interpro_annotations_nonzero[interpro_annotations_nonzero['ENTRY_NAME'].str.contains("SAM")]

Unnamed: 0,ENTRY_AC,ENTRY_TYPE,ENTRY_NAME
324,IPR018063,Conserved_site,"SAM-dependent methyltransferase RsmI, conserve..."
949,IPR001678,Domain,SAM-dependent methyltransferase RsmB-F/NOP2-ty...
1287,IPR004107,Domain,"Integrase, SAM-like, N-terminal"
1545,IPR006638,Domain,"Elp3/MiaA/NifB-like, radical SAM core domain"
1601,IPR007197,Domain,Radical SAM
2467,IPR022642,Domain,"MCP methyltransferase, CheR-type, SAM-binding ..."
2714,IPR030382,Domain,SAM-dependent methyltransferase TRM5/TYW2-type
3034,IPR037635,Domain,"VTS1, SAM domain"
3247,IPR042650,Domain,"ASZ1, SAM domain"
3401,IPR047238,Domain,Ankyrin repeat and SAM domain-containing prote...


In [52]:
interpro_ids_for_manual = {"top2": ["IPR001757", # HATPase
                                    "IPR006086", # XPG-I domain
                                    "IPR001752"  # Kinesin
                                   ],
                           "metx":  [
                               "IPR029058", # Alpha/Beta hydrolase fold
                               "",
                               "",
                           ]}

In [6]:
with open("../../jatin/plm_circuits/layer_latent_dict_metx.json", 'r') as file:
    metx_latents = json.load(file)

with open("../../jatin/plm_circuits/layer_latent_dict_top2.json", 'r') as file:
    top2_latents = json.load(file)

In [7]:
offsets = {k: i*4096 for i,k in enumerate(metx_latents.keys())}
full_indices_metx = [j+offsets[layer_id] for layer_id in metx_latents.keys() for j in metx_latents[layer_id]]
full_indices_top2 = [j+offsets[layer_id] for layer_id in top2_latents.keys() for j in top2_latents[layer_id]]
latent_ids_both_proteins = list(set(full_indices_top2+full_indices_metx))
latent_ids_both_proteins.sort()
len(latent_ids_both_proteins) # yes, only 383 unique

383

In [8]:
latent_ids_both_proteins[0:10]

[100, 181, 237, 340, 443, 495, 527, 601, 794, 798]

In [9]:
n_tests = 28672*10096

In [10]:
n_tests

289472512

In [11]:
np.argmin(latents_family_pvals[0])

tensor(8760)

In [20]:
annotated_latents = []
for latent_i in range(latents_family_pvals.shape[0]):
    for family_j in range(latents_family_pvals.shape[1]):
        if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
            if latents_family_effect_size[latent_i,family_j,1] < 0.05:
                l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/ (latents_family_effect_size[latent_i,family_j,1]+0.00001))
                if l2fc >= 4:  # latents_family_effect_size[latent_i,family_j,0] > 1.05: #
                    annotated_latents += [latent_ids_both_proteins[latent_i]]
                    print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                    print(f"    Family {interpro_annotations_nonzero.iloc[family_j]["ENTRY_NAME"]}")
                    print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                    print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                    print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")
               

  l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/ (latents_family_effect_size[latent_i,family_j,1]+0.001))


Latent 1297 is sig assoc w family 3824
    Family Large ribosomal subunit protein bL36
    Mean in-family: 0.07311572134494781
    Mean out-of-family: 0.0033203079365193844
    Log2FC: 4.460792541503906
Latent 1297 is sig assoc w family 9135
    Family Large ribosomal subunit protein bL36 superfamily
    Mean in-family: 0.07311572134494781
    Mean out-of-family: 0.0033203079365193844
    Log2FC: 4.460792541503906


  print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")


Latent 3717 is sig assoc w family 128
    Family Metallothionein, vertebrate, metal binding site
    Mean in-family: 0.04005880653858185
    Mean out-of-family: 0.0012135858414694667
    Log2FC: 5.044771194458008
Latent 3717 is sig assoc w family 3724
    Family Metallothionein, vertebrate
    Mean in-family: 0.04005880653858185
    Mean out-of-family: 0.0012135858414694667
    Log2FC: 5.044771194458008
Latent 3717 is sig assoc w family 3767
    Family Protamine P1
    Mean in-family: 0.04811539128422737
    Mean out-of-family: 0.0011923930142074823
    Log2FC: 5.334566593170166
Latent 3717 is sig assoc w family 3824
    Family Large ribosomal subunit protein bL36
    Mean in-family: 0.08203448355197906
    Mean out-of-family: 0.001079520327039063
    Log2FC: 6.247768402099609
Latent 3717 is sig assoc w family 8849
    Family Metallothionein domain superfamily
    Mean in-family: 0.04005880653858185
    Mean out-of-family: 0.0012135858414694667
    Log2FC: 5.044771194458008
Latent 3717

  l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/ (latents_family_effect_size[latent_i,family_j,1]+0.001))


Latent 9274 is sig assoc w family 9559
    Family Histidine kinase/HSP90-like ATPase superfamily
    Mean in-family: 0.12935441732406616
    Mean out-of-family: 0.00605860585346818
    Log2FC: 4.416199684143066
Latent 9313 is sig assoc w family 536
    Family Short-chain dehydrogenase/reductase, conserved site
    Mean in-family: 0.7055881023406982
    Mean out-of-family: 0.035321809351444244
    Log2FC: 4.320195198059082
Latent 9313 is sig assoc w family 4311
    Family Short-chain dehydrogenase/reductase SDR
    Mean in-family: 0.6978220343589783
    Mean out-of-family: 0.0347372405230999
    Log2FC: 4.328303813934326
Latent 9396 is sig assoc w family 58
    Family ATPase, alpha/beta subunit, nucleotide-binding domain, active site
    Mean in-family: 0.2765734791755676
    Mean out-of-family: 0.004104959778487682
    Log2FC: 6.074151039123535
Latent 9396 is sig assoc w family 274
    Family DNA mismatch repair, conserved site
    Mean in-family: 1.4902369976043701
    Mean out-of-fam

In [13]:
ann_latents_list = list(set(annotated_latents))

In [14]:
len(ann_latents_list)

84

In [15]:
len(set(full_indices_metx))

273

In [16]:
len(set(full_indices_metx).intersection(ann_latents_list))

59

In [17]:
len(set(full_indices_top2))

134

In [18]:
len(set(full_indices_top2).intersection(ann_latents_list))

27

In [34]:
# For a table:
# Layer | Latent ID (#) | IntroPro ID | InterPro Name | out-of-family | in-family | Bonf p-value

In [35]:
metx_results_df_dict = {}
for latent_i in range(latents_family_pvals.shape[0]):
    for family_j in range(latents_family_pvals.shape[1]):
        if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
            if latents_family_effect_size[latent_i,family_j,1] < 0.05:
                l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/(latents_family_effect_size[latent_i,family_j,1]+0.00001))
                if l2fc >= 4:  # latents_family_effect_size[latent_i,family_j,0] > 1.05:

                    df_id = str(latent_i)+"_"+str(family_j)
                    cur_lat_id = latent_ids_both_proteins[latent_i]
                    layer_id = ((cur_lat_id // 4096)+1)*4
                    lat_id = cur_lat_id % 4096

                    case_ptn = "MetX"
                    if latent_ids_both_proteins[latent_i] in full_indices_top2:
                        case_ptn="Top2"

                    metx_results_df_dict[df_id] = {
                        "Case": case_ptn,
                        "Layer": layer_id,
                        "Latent ID": lat_id,
                        "InterPro ID": interpro_annotations_nonzero.iloc[family_j]["ENTRY_AC"],
                        "InterPro Name": interpro_annotations_nonzero.iloc[family_j]["ENTRY_NAME"],
                        "A_in": float(latents_family_effect_size[latent_i,family_j,0]),
                        "A_out": float(latents_family_effect_size[latent_i,family_j,1]),
                        "L2FC": float(l2fc),
                        "Bonferroni adj. p-value": float(latents_family_pvals[latent_i,family_j]*n_tests)
                    }

  l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/(latents_family_effect_size[latent_i,family_j,1]+0.00001))
  l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/(latents_family_effect_size[latent_i,family_j,1]+0.00001))


In [36]:
df_metx_results = pd.DataFrame.from_dict(metx_results_df_dict, orient='index')
df_metx_results.head()

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,A_in,A_out,L2FC,Bonferroni adj. p-value
13_3824,Top2,4,1297,IPR000473,Large ribosomal subunit protein bL36,0.073116,0.00332,4.456454,6.988324e-07
13_5069,Top2,4,1297,IPR005996,"Large ribosomal subunit protein uL30, bacteria...",0.057346,0.003392,4.075256,7.427055e-05
13_9135,Top2,4,1297,IPR035977,Large ribosomal subunit protein bL36 superfamily,0.073116,0.00332,4.456454,6.988324e-07
46_128,Top2,4,3717,IPR018064,"Metallothionein, vertebrate, metal binding site",0.040059,0.001214,5.032932,5.300378e-05
46_540,Top2,4,3717,IPR020939,"Large ribosomal subunit protein bL34, conserve...",0.019646,0.001192,4.030718,6.362296e-09


In [38]:
df_metx_results.query("Case == 'MetX' and L2FC > 5 and A_in >= 0.5")#.sort_values(by="Bonferroni adj. p-value")

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,A_in,A_out,L2FC,Bonferroni adj. p-value
80_659,MetX,12,2112,IPR000073,Alpha/beta hydrolase fold-1,0.604529,0.007763,6.281209,3.070172e-14
80_9059,MetX,12,2112,IPR029058,Alpha/Beta hydrolase fold,0.559808,0.005224,6.74091,0.0
82_5515,MetX,12,2302,IPR010084,Beta-hydroxyacyl-(acyl-carrier-protein) dehydr...,0.549541,0.002915,7.553611,0.0002303711
82_5813,MetX,12,2302,IPR013114,"Beta-hydroxydecanoyl thiol ester dehydrase, Fa...",0.565334,0.00273,7.688618,8.962103e-08
136_659,MetX,16,1504,IPR000073,Alpha/beta hydrolase fold-1,1.283272,0.026213,5.612868,8.708278e-06
136_9059,MetX,16,1504,IPR029058,Alpha/Beta hydrolase fold,1.323345,0.020013,6.046389,0.0
145_9059,MetX,16,2175,IPR029058,Alpha/Beta hydrolase fold,1.561086,0.022779,6.098062,0.0
170_4594,MetX,16,3765,IPR003824,Undecaprenyl-diphosphatase UppP,1.132152,0.032733,5.111751,2.038676e-06
177_797,MetX,20,142,IPR000873,AMP-dependent synthetase/ligase domain,2.935064,0.044304,6.04948,0.01535564
177_2169,MetX,20,142,IPR015590,Aldehyde dehydrogenase domain,2.456921,0.043235,5.828182,1.386195e-07


In [31]:
df_metx_results.query("Case == 'Top2'")#.sort_values(by="Bonferroni adj. p-value")

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,Activation w/ domain,Activation w/o domain,L2FC,Bonferroni adj. p-value
46_9135,Top2,4,3717,IPR035977,Large ribosomal subunit protein bL36 superfamily,0.082034,0.001080,6.234466,0.000000
46_3824,Top2,4,3717,IPR000473,Large ribosomal subunit protein bL36,0.082034,0.001080,6.234466,0.000000
72_781,Top2,12,1204,IPR000793,"ATP synthase, alpha subunit, C-terminal",0.297229,0.004520,6.036056,0.000000
72_672,Top2,12,1204,IPR000194,"ATPase, F1/V1/A1 complex, alpha/beta subunit, ...",0.276573,0.004105,6.070641,0.000000
72_58,Top2,12,1204,IPR020003,"ATPase, alpha/beta subunit, nucleotide-binding...",0.276573,0.004105,6.070641,0.000000
...,...,...,...,...,...,...,...,...,...
214_9912,Top2,20,1966,IPR042221,"Leucyl/phenylalanyl-tRNA-protein transferase, ...",6.532168,0.023353,8.127212,0.021919
214_1058,Top2,20,1966,IPR002500,Phosphoadenosine phosphosulphate reductase domain,1.451269,0.026912,5.752396,0.028800
84_669,Top2,12,2474,IPR000182,GNAT domain,1.381740,0.022060,5.968279,0.034206
214_1909,Top2,20,1966,IPR012795,"tRNA(Ile)-lysidine synthase, N-terminal",0.617451,0.027496,4.488513,0.042795


In [39]:
df_metx_results.to_csv('circuit_domain_detectors.csv', index=False)

In [40]:
print(df_metx_results.query("Case == 'Top2'").drop("Case", axis=1).to_latex(
    index=False, float_format="{:.2g}".format))

\begin{tabular}{rrllrrr}
\toprule
Layer & Latent ID & InterPro ID & InterPro Name & Activation w/ domain & Activation w/o domain & Bonferroni adj. p-value \\
\midrule
12 & 1204 & IPR014762 & DNA mismatch repair, conserved site & 1.5 & 0.0045 & 1.6e-07 \\
12 & 1204 & IPR019805 & Heat shock protein Hsp90, conserved site & 1.5 & 0.0042 & 2.9e-13 \\
12 & 1204 & IPR013507 & DNA mismatch repair protein, S5 domain 2-like & 1.5 & 0.0045 & 1.6e-07 \\
12 & 1204 & IPR014790 & MutL, C-terminal, dimerisation & 1.5 & 0.0045 & 1.6e-07 \\
12 & 1204 & IPR020575 & Heat shock protein Hsp90, N-terminal & 1.4 & 0.0041 & 3.9e-16 \\
12 & 1204 & IPR001404 & Heat shock protein Hsp90 family & 1.4 & 0.0041 & 3.9e-16 \\
12 & 1204 & IPR002099 & DNA mismatch repair protein MutL/Mlh/PMS & 1.5 & 0.0045 & 1.6e-07 \\
12 & 1204 & IPR020667 & DNA mismatch repair protein, MutL & 1.5 & 0.0045 & 1.6e-07 \\
12 & 1204 & IPR038973 & DNA mismatch repair protein MutL/Mlh/Pms-like & 1.5 & 0.0045 & 1.6e-07 \\
12 & 1204 & IPR037196

In [41]:
print(df_metx_results.query("Case == 'MetX'").drop("Case", axis=1).to_latex(
    index=False, float_format="{:.2g}".format))

\begin{tabular}{rrllrrr}
\toprule
Layer & Latent ID & InterPro ID & InterPro Name & Activation w/ domain & Activation w/o domain & Bonferroni adj. p-value \\
\midrule
16 & 1504 & IPR000073 & Alpha/beta hydrolase fold-1 & 1.3 & 0.026 & 8.7e-06 \\
16 & 1504 & IPR029058 & Alpha/Beta hydrolase fold & 1.3 & 0.02 & 0 \\
16 & 2175 & IPR029058 & Alpha/Beta hydrolase fold & 1.6 & 0.023 & 0 \\
16 & 2836 & IPR029058 & Alpha/Beta hydrolase fold & 1.3 & 0.045 & 3e-35 \\
16 & 3765 & IPR003824 & Undecaprenyl-diphosphatase UppP & 1.1 & 0.033 & 2e-06 \\
20 & 142 & IPR000873 & AMP-dependent synthetase/ligase domain & 2.9 & 0.044 & 0.015 \\
20 & 142 & IPR015590 & Aldehyde dehydrogenase domain & 2.5 & 0.043 & 1.4e-07 \\
20 & 142 & IPR016161 & Aldehyde/histidinol dehydrogenase & 2.5 & 0.042 & 1.7e-10 \\
20 & 142 & IPR016162 & Aldehyde dehydrogenase, N-terminal & 2.5 & 0.043 & 1.4e-07 \\
20 & 142 & IPR016163 & Aldehyde dehydrogenase, C-terminal & 2.5 & 0.043 & 1.4e-07 \\
20 & 142 & IPR045851 & AMP-binding e