In [1]:
import sys
import os
import json
import torch

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix, save_npz, load_npz
from scipy.stats import mannwhitneyu

In [2]:
base_path = '/project/pi_annagreen_umass_edu/bryn/plm_circuit_enrichment/'

In [3]:
interpro_annotations_nonzero = pd.read_csv(base_path + "metadata/interpro_entry_list_mapping_nonzero.csv")
ptn_fam_nonzero_tensor = torch.load(base_path + "metadata/ptn_fam_tensor_nonzero.pt")

latents_family_effect_size = torch.load(base_path + "latents_family_effect_size_circuit.pt")
latents_family_pvals = torch.load(base_path + "latents_family_pvals_circuit.pt")

  ptn_fam_nonzero_tensor = torch.load(base_path + "metadata/ptn_fam_tensor_nonzero.pt")
  latents_family_effect_size = torch.load(base_path + "latents_family_effect_size_circuit.pt")
  latents_family_pvals = torch.load(base_path + "latents_family_pvals_circuit.pt")


In [4]:
print(f"Interpro Annotations: {interpro_annotations_nonzero.shape}")
print(f"ptn_fam_nonzero_tensor: {ptn_fam_nonzero_tensor.shape}")
print(f"Effect sizes: {latents_family_effect_size.shape}")
print(f"MWU pvals: {latents_family_pvals.shape}")

Interpro Annotations: (10096, 3)
ptn_fam_nonzero_tensor: torch.Size([10000, 10096])
Effect sizes: torch.Size([383, 10096, 4])
MWU pvals: torch.Size([383, 10096])


In [5]:
interpro_annotations_nonzero.head()

Unnamed: 0,ENTRY_AC,ENTRY_TYPE,ENTRY_NAME
0,IPR000126,Active_site,"Serine proteases, V8 family, serine active site"
1,IPR000169,Active_site,"Cysteine peptidase, cysteine active site"
2,IPR000189,Active_site,"Prokaryotic transglycosylase, active site"
3,IPR001252,Active_site,"Malate dehydrogenase, active site"
4,IPR001345,Active_site,"Phosphoglycerate/bisphosphoglycerate mutase, a..."


In [6]:
with open(base_path + "../../jatin/plm_circuits/layer_latent_dict_metx.json", 'r') as file:
    metx_latents = json.load(file)

with open(base_path + "../../jatin/plm_circuits/layer_latent_dict_top2.json", 'r') as file:
    top2_latents = json.load(file)

In [14]:
offsets = {k: i*4096 for i,k in enumerate(metx_latents.keys())}
full_indices_metx = [j+offsets[layer_id] for layer_id in metx_latents.keys() for j in metx_latents[layer_id]]
full_indices_top2 = [j+offsets[layer_id] for layer_id in top2_latents.keys() for j in top2_latents[layer_id]]
latent_ids_both_proteins = list(set(full_indices_top2+full_indices_metx))
latent_ids_both_proteins.sort()
len(latent_ids_both_proteins) # yes, only 383 unique

383

In [18]:
n_tests = 28672*10096
epsilon = 1e-4
annotated_latents = []
for latent_i in range(latents_family_pvals.shape[0]):
    for family_j in range(latents_family_pvals.shape[1]):
        if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
            if latents_family_effect_size[latent_i,family_j,1] < 0.05:
                l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/ (latents_family_effect_size[latent_i,family_j,1]+epsilon))
                if l2fc >= 4:  # latents_family_effect_size[latent_i,family_j,0] > 1.05: #
                    annotated_latents += [latent_ids_both_proteins[latent_i]]
                    # print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                    # print(f"    Family {interpro_annotations_nonzero.iloc[family_j]["ENTRY_NAME"]}")
                    # print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                    # print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                    # print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]+epsilon/latents_family_effect_size[latent_i,family_j,1]+epsilon)}")
               

In [9]:
ann_latents_list = list(set(annotated_latents))

In [20]:
metx_results_df_dict = {}
for latent_i in range(latents_family_pvals.shape[0]):
    for family_j in range(latents_family_pvals.shape[1]):
        if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
            if latents_family_effect_size[latent_i,family_j,1] < 0.05:
                ratio = (latents_family_effect_size[latent_i,family_j,0]+epsilon)/ (latents_family_effect_size[latent_i,family_j,1]+epsilon)
                l2fc = np.log2(ratio)
                if l2fc >= 4:  # latents_family_effect_size[latent_i,family_j,0] > 1.05:

                    df_id = str(latent_i)+"_"+str(family_j)
                    cur_lat_id = latent_ids_both_proteins[latent_i]
                    layer_id = ((cur_lat_id // 4096)+1)*4
                    lat_id = cur_lat_id % 4096

                    case_ptn = "MetX"
                    if latent_ids_both_proteins[latent_i] in full_indices_top2:
                        case_ptn="Top2"

                    metx_results_df_dict[df_id] = {
                        "Case": case_ptn,
                        "Layer": layer_id,
                        "Latent ID": lat_id,
                        "InterPro ID": interpro_annotations_nonzero.iloc[family_j]["ENTRY_AC"],
                        "InterPro Name": interpro_annotations_nonzero.iloc[family_j]["ENTRY_NAME"],
                        "A_in": float(latents_family_effect_size[latent_i,family_j,0]),
                        "A_out": float(latents_family_effect_size[latent_i,family_j,1]),
                        "L2FC": float(l2fc),
                        "Ratio": float(ratio),
                        "Bonferroni adj. p-value": float(latents_family_pvals[latent_i,family_j]*n_tests)
                    }

In [21]:
df_metx_results = pd.DataFrame.from_dict(metx_results_df_dict, orient='index')
df_metx_results.head()

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,A_in,A_out,L2FC,Ratio,Bonferroni adj. p-value
13_3824,Top2,4,1297,IPR000473,Large ribosomal subunit protein bL36,0.073116,0.00332,4.419955,21.406178,6.988324e-07
13_5069,Top2,4,1297,IPR005996,"Large ribosomal subunit protein uL30, bacteria...",0.057346,0.003392,4.040099,16.450954,7.427055e-05
13_9135,Top2,4,1297,IPR035977,Large ribosomal subunit protein bL36 superfamily,0.073116,0.00332,4.419955,21.406178,6.988324e-07
46_128,Top2,4,3717,IPR018064,"Metallothionein, vertebrate, metal binding site",0.040059,0.001214,4.934134,30.571896,5.300378e-05
46_3724,Top2,4,3717,IPR000006,"Metallothionein, vertebrate",0.040059,0.001214,4.934134,30.571896,5.300378e-05


In [29]:
df_metx_results.query("Case == 'Top2' and L2FC > 6") # and A_in >= 0.1")#.sort_values(by="Bonferroni adj. p-value")

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,A_in,A_out,L2FC,Ratio,Bonferroni adj. p-value
46_3824,Top2,4,3717,IPR000473,Large ribosomal subunit protein bL36,0.082034,0.00108,6.121716,69.633804,0.0
46_9135,Top2,4,3717,IPR035977,Large ribosomal subunit protein bL36 superfamily,0.082034,0.00108,6.121716,69.633804,0.0
72_58,Top2,12,1204,IPR020003,"ATPase, alpha/beta subunit, nucleotide-binding...",0.276573,0.004105,6.039948,65.796936,0.0
72_274,Top2,12,1204,IPR014762,"DNA mismatch repair, conserved site",1.490237,0.004479,8.346327,325.457825,1.594212e-07
72_457,Top2,12,1204,IPR019805,"Heat shock protein Hsp90, conserved site",1.45594,0.004206,8.401531,338.15271,2.943538e-13
72_672,Top2,12,1204,IPR000194,"ATPase, F1/V1/A1 complex, alpha/beta subunit, ...",0.276573,0.004105,6.039948,65.796936,0.0
72_781,Top2,12,1204,IPR000793,"ATP synthase, alpha subunit, C-terminal",0.297229,0.00452,6.008157,64.362862,0.0
72_1283,Top2,12,1204,IPR004100,"ATPase, F1/V1/A1 complex, alpha/beta subunit, ...",0.275467,0.004137,6.023296,65.04184,0.0
72_2000,Top2,12,1204,IPR013507,"DNA mismatch repair protein, S5 domain 2-like",1.490237,0.004479,8.346327,325.457825,1.594212e-07
72_2113,Top2,12,1204,IPR014790,"MutL, C-terminal, dimerisation",1.490237,0.004479,8.346327,325.457825,1.594212e-07


In [30]:
print(df_metx_results.query("Case == 'Top2' and L2FC > 6").drop("Case", axis=1).to_latex(
    index=False, float_format="{:.2g}".format))

\begin{tabular}{rrllrrrrr}
\toprule
Layer & Latent ID & InterPro ID & InterPro Name & A_in & A_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
4 & 3717 & IPR000473 & Large ribosomal subunit protein bL36 & 0.082 & 0.0011 & 6.1 & 70 & 0 \\
4 & 3717 & IPR035977 & Large ribosomal subunit protein bL36 superfamily & 0.082 & 0.0011 & 6.1 & 70 & 0 \\
12 & 1204 & IPR020003 & ATPase, alpha/beta subunit, nucleotide-binding domain, active site & 0.28 & 0.0041 & 6 & 66 & 0 \\
12 & 1204 & IPR014762 & DNA mismatch repair, conserved site & 1.5 & 0.0045 & 8.3 & 3.3e+02 & 1.6e-07 \\
12 & 1204 & IPR019805 & Heat shock protein Hsp90, conserved site & 1.5 & 0.0042 & 8.4 & 3.4e+02 & 2.9e-13 \\
12 & 1204 & IPR000194 & ATPase, F1/V1/A1 complex, alpha/beta subunit, nucleotide-binding domain & 0.28 & 0.0041 & 6 & 66 & 0 \\
12 & 1204 & IPR000793 & ATP synthase, alpha subunit, C-terminal & 0.3 & 0.0045 & 6 & 64 & 0 \\
12 & 1204 & IPR004100 & ATPase, F1/V1/A1 complex, alpha/beta subunit, N-terminal domain

In [35]:
latex = (
    df_metx_results.query("Case == 'Top2' and A_out < 0.05 and A_in > 1.05")
    .drop(columns=["Case"])
    .to_latex(index=False, float_format="{:.2g}".format, escape=False)
)
latex = (
    r"\setlength{\tabcolsep}{4pt}\small" "\n"
    r"\resizebox{\linewidth}{!}{%" "\n" + latex + "\n}"
)
print(latex)


\setlength{\tabcolsep}{4pt}\small
\resizebox{\linewidth}{!}{%
\begin{tabular}{rrllrrrrr}
\toprule
Layer & Latent ID & InterPro ID & InterPro Name & A_in & A_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
12 & 1204 & IPR014762 & DNA mismatch repair, conserved site & 1.5 & 0.0045 & 8.3 & 3.3e+02 & 1.6e-07 \\
12 & 1204 & IPR019805 & Heat shock protein Hsp90, conserved site & 1.5 & 0.0042 & 8.4 & 3.4e+02 & 2.9e-13 \\
12 & 1204 & IPR013507 & DNA mismatch repair protein, S5 domain 2-like & 1.5 & 0.0045 & 8.3 & 3.3e+02 & 1.6e-07 \\
12 & 1204 & IPR014790 & MutL, C-terminal, dimerisation & 1.5 & 0.0045 & 8.3 & 3.3e+02 & 1.6e-07 \\
12 & 1204 & IPR020575 & Heat shock protein Hsp90, N-terminal & 1.4 & 0.0041 & 8.4 & 3.4e+02 & 3.9e-16 \\
12 & 1204 & IPR001404 & Heat shock protein Hsp90 family & 1.4 & 0.0041 & 8.4 & 3.4e+02 & 3.9e-16 \\
12 & 1204 & IPR002099 & DNA mismatch repair protein MutL/Mlh/PMS & 1.5 & 0.0045 & 8.3 & 3.3e+02 & 1.6e-07 \\
12 & 1204 & IPR020667 & DNA mismatch repair pr

In [43]:

import numpy as np
import pandas as pd

def _normalize_pairs(layer_latents):
    """Accept {layer:[latents]} or [(layer, latent), ...] and return list of (layer, latent)."""
    if isinstance(layer_latents, dict):
        pairs = []
        for layer, lat_list in layer_latents.items():
            for lat in lat_list:
                pairs.append((int(layer), int(lat)))
        return pairs
    # assume iterable of tuples
    return [(int(l), int(z)) for (l, z) in layer_latents]

def get_top_interpro_for_latents(
    layer_latents,
    top_k=5,
    alpha=0.05,
    l2fc_min=1.0,
    aout_max=0.05,
    epsilon=1e-4,
    n_total_latents_for_bonferroni=28672  # matches your earlier n_tests = 28672 * 10096
):
    """
    For each (layer, latent) requested, return top-k InterPro annotations that pass:
      p < alpha / (n_total_latents_for_bonferroni * N_families), A_out < aout_max, L2FC >= l2fc_min.
    Two rankings per latent: by L2FC and by A_in.
    """
    pairs = _normalize_pairs(layer_latents)

    # Precompute constants/sets
    N_fams = latents_family_pvals.shape[1]
    n_tests = n_total_latents_for_bonferroni * N_fams
    bonf_thr = alpha / n_tests

    set_top2 = set(full_indices_top2)
    set_metx = set(full_indices_metx)

    # Consistent column order
    cols = [
        "Case", "Layer", "Latent ID", "InterPro ID", "InterPro Name",
        "A_in", "A_out", "L2FC", "Ratio", "Bonferroni adj. p-value"
    ]

    out = {}  # (layer, latent) -> {"by_l2fc": df, "by_ain": df}

    for layer, latent in pairs:
        flat_ind = offsets[str(layer)] + latent
        if flat_ind not in latent_ids_both_proteins:
            # Not in your 383-index universe—return empty tables
            out[(layer, latent)] = {
                "by_l2fc": pd.DataFrame(columns=cols),
                "by_ain": pd.DataFrame(columns=cols),
            }
            continue

        latent_i = latent_ids_both_proteins.index(flat_ind)
        rows = []

        for fam_j in range(N_fams):
            p = float(latents_family_pvals[latent_i, fam_j])
            if p >= bonf_thr:
                continue

            ain = float(latents_family_effect_size[latent_i, fam_j, 0])
            aout = float(latents_family_effect_size[latent_i, fam_j, 1])

            if aout_max is not None and not (aout < aout_max):
                continue

            ratio = (ain + epsilon) / (aout + epsilon)
            l2fc = float(np.log2(ratio))
            if l2fc_min is not None and not (l2fc >= l2fc_min):
                continue

            # Case label like your earlier code (Top2 if present there; otherwise MetX)
            if flat_ind in set_top2:
                case = "Top2"
            elif flat_ind in set_metx:
                case = "MetX"
            else:
                case = "Unknown"

            rows.append({
                "Case": case,
                "Layer": layer,
                "Latent ID": latent,
                "InterPro ID": interpro_annotations_nonzero.iloc[fam_j]["ENTRY_AC"],
                "InterPro Name": interpro_annotations_nonzero.iloc[fam_j]["ENTRY_NAME"],
                "A_in": ain,
                "A_out": aout,
                "L2FC": l2fc,
                "Ratio": float(ratio),
                "Bonferroni adj. p-value": float(p * n_tests),
            })

        df_all = pd.DataFrame(rows, columns=cols)

        if df_all.empty:
            out[(layer, latent)] = {
                "by_l2fc": df_all.copy(),
                "by_ain": df_all.copy(),
            }
        else:
            out[(layer, latent)] = {
                "by_l2fc": df_all.sort_values("L2FC", ascending=False).head(top_k).reset_index(drop=True),
                "by_ain": df_all.sort_values("A_in", ascending=False).head(top_k).reset_index(drop=True),
            }
    return out

# --- Example usage ---
# layer_latents = {12: [1256, 1204], 16: [631]}
# results = get_top_interpro_for_latents(layer_latents, top_k=5)
# for (layer, latent), tabs in results.items():
#     print(f"\nLayer {layer}, Latent {latent} — top by L2FC")
#     print(tabs["by_l2fc"].to_latex(index=False, float_format="{:.2g}".format, escape=False))
#     print(f"\nLayer {layer}, Latent {latent} — top by A_in")
#     print(tabs["by_ain"].to_latex(index=False, float_format="{:.2g}".format, escape=False))


In [40]:
feature_clusters = {
    # Layer 4 clusters
    4: {
        "direct_motif_detectors": {340: "FX'", 237: "FX'", 3788: "X'XI", 798: "D'XXGN", 1690: "X'XXF"},  # Example: specific (token, latent) pairs
        "indirect_motif_detectors": {2277: "G", 2311: "X'XM", 3634: "X'XXXG", 1682: "PXXXXXX'", 3326: "H"},
        "motif_detectors": {340: "FX'", 237: "FX'", 3788: "X'XI", 798: "D'XXGN", 1690: "X'XXF", 2277: "G", 2311: "X'XM", 3634: "X'XXXG", 1682: "PXXXXXX'", 3326: "H"}, 
    },
    
    # # Layer 8 clusters  
    8: {
        "annotated_domain_detector": {488:"AB_Hydrolase_fold"}, #, 2693: "AB_Hydrolase_fold", 1244:"AB_Hydrolase_fold"},
        "misc_domain_detector": {2677:"FAD/NAD", 2775:"Transketolase", 2166:"DHFR"},
        "motif_detectors": {488:"AB_Hydrolase_fold", 2677:"FAD/NAD", 2775:"Transketolase", 2166:"DHFR"}, #, 2693: "AB_Hydrolase_fold", 1244:"AB_Hydrolase_fold"},
    },
    
    # # Layer 12 clusters
    12: {
        "annotated_domain_detector": {2112: "AB_Hydrolase_fold"},
        "misc_domain_detector": {3536:"SAM_mtases", 1256: "FAM", 2797: "Aldolase", 3794: "SAM_mtases", 3035: "WD40"},
        "motif_detectors": {2112: "AB_Hydrolase_fold", 3536:"SAM_mtases", 1256: "FAM", 2797: "Aldolase", 3794: "SAM_mtases", 3035: "WD40"},
    },
    # Add more layers and clusters as needed...
}

feature_clusters_1pvg = {
    4: {
        "direct_motif_detectors": {1509:"E", 2511:"X'XQ", 2112:"YXX'", 3069: "GX'", 3544: "C", 2929: "N"},
        "indirect_motif_detectors": {3170: "X'N", 3717:"V", 527: "DX'", 3229: "IXX'", 1297: "I", 1468: "X'XXN", 1196: "D"},
        "motif_detectors": {1509:"E", 2511:"X'XQ", 2112:"YXX'", 3069: "GX'", 3544: "C", 2929: "N", 3170: "X'N", 3717:"V", 527: "DX'", 3229: "IXX'", 1297: "I", 1468: "X'XXN", 1196: "D"}
    },
    8: {
        # "direct_motif_detectors": {},
        # "indirect_motif_detectors": {1916: "NX'XXNA"},
        # "motif_detectors": {}, 
        "annotated_domain_detector": {2529:"Hatpase_C", 3159: "Hatpase_C", 3903: "Hatpase_C", 1055: "Hatpase_C", 2066: "Hatpase_C"},
    },
    12: {
        "annotated_domain_detector": {3943: "Hatpase_C", 1796: "Hatpase_C", 1204: "Hatpase_C", 1145:  "Hatpase_C"},
        "misc_domain_detector": {1082: "XPG-I", 2472: "Kinesin"},
        "domain_detectors": {3943: "Hatpase_C", 1796: "Hatpase_C", 1204: "Hatpase_C", 1145:  "Hatpase_C", 1082: "XPG-I", 2472: "Kinesin"},
    },
    16: {
        "annotated_domain_detector": {3077: "Hatpase_C", 1353: "Hatpase_C", 1597: "Hatpase_C", 1814: "Hatpase_C", 3994: "Ribosomal", 1166: "Hatpase_C"},
        # "misc_domain_detector": {},
        # "domain_detectors": {3077: "Hatpase_C", 1353: "Hatpase_C", 1597: "Hatpase_C", 1814: "Hatpase_C", 3994: "Ribosomal", 1166: "Hatpase_C"},
    }
}

In [None]:
feature_clusters_1pvg_onelevel = {    4: {1509:"E", 2511:"X'XQ", 2112:"YXX'", 3069: "GX'", 3544: "C", 2929: "N", 3170: "X'N", 3717:"V", 527: "DX'", 3229: "IXX'", 1297: "I", 1468: "X'XXN", 1196: "D"},
    8: {
        2529:"Hatpase_C", 3159: "Hatpase_C", 3903: "Hatpase_C", 1055: "Hatpase_C", 2066: "Hatpase_C"
    },
    12: {3943: "Hatpase_C", 1796: "Hatpase_C", 1204: "Hatpase_C", 1145:  "Hatpase_C", 1082: "XPG-I", 2472: "Kinesin"
    },
    16: {3077: "Hatpase_C", 1353: "Hatpase_C", 1597: "Hatpase_C", 1814: "Hatpase_C", 3994: "Ribosomal", 1166: "Hatpase_C"}}

feature_clusters_2b61_onelevel = {4: {340: "FX'", 237: "FX'", 3788: "X'XI", 798: "D'XXGN", 1690: "X'XXF", 2277: "G", 2311: "X'XM", 3634: "X'XXXG", 1682: "PXXXXXX'", 3326: "H"
    },
    8: {
        488:"AB_Hydrolase_fold", 2677:"FAD/NAD", 2775:"Transketolase", 2166:"DHFR"
    },
    12: {
        2112: "AB_Hydrolase_fold", 3536:"SAM_mtases", 1256: "FAM", 2797: "Aldolase", 3794: "SAM_mtases", 3035: "WD40"
    }}

In [41]:
feature_clusters_flat = {
    layer: list(set().union(*[latents.keys() for latents in clusters.values()]))
    for layer, clusters in feature_clusters.items()}

# Convert feature_clusters_1pvg dict to just layer: list of latents 
feature_clusters_1pvg_flat = {
    layer: list(set().union(*[latents.keys() for latents in clusters.values()]))
    for layer, clusters in feature_clusters_1pvg.items()}

In [42]:
feature_clusters_1pvg_flat

{4: [2112,
  3170,
  1509,
  3717,
  1196,
  2511,
  527,
  2929,
  1297,
  3229,
  3544,
  1468,
  3069],
 8: [2529, 2066, 3903, 3159, 1055],
 12: [1796, 3943, 2472, 1204, 1145, 1082],
 16: [3077, 1353, 1166, 1814, 3994, 1597]}

In [None]:
feature_clusters_1pvg[4][]

In [51]:
def to_resized_latex(df, floatfmt="{:.2g}".format):
    s = df.to_latex(index=False, float_format=floatfmt, escape=False)
    s = s.replace("_", r"\_")  # escape underscores in headers/cells
    return (
        r"\setlength{\tabcolsep}{4pt}\small" "\n"
        r"\resizebox{\linewidth}{!}{" "\n" + s + "\n}"
    )

results = get_top_interpro_for_latents(feature_clusters_flat, top_k=5)

for (layer, latent), tabs in results.items():
    by_l2fc = tabs["by_l2fc"]
    by_ain  = tabs["by_ain"]

    if by_l2fc.empty and by_ain.empty:
        continue

    if not by_l2fc.empty:
        print(f"\nLayer {layer}, Latent {latent}, manual: {feature_clusters_2b61_onelevel[layer][latent]} — top by L2FC")
        print(to_resized_latex(by_l2fc))

    if not by_ain.empty:
        print(f"\nLayer {layer}, Latent {latent}, manual: {feature_clusters_2b61_onelevel[layer][latent]} — top by A\_in")
        print(to_resized_latex(by_ain))



Layer 4, Latent 3326, manual: H — top by L2FC
\setlength{\tabcolsep}{4pt}\small
\resizebox{\linewidth}{!}{
\begin{tabular}{lrrllrrrrr}
\toprule
Case & Layer & Latent ID & InterPro ID & InterPro Name & A\_in & A\_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
MetX & 4 & 3326 & IPR036638 & Helix-loop-helix DNA-binding domain superfamily & 0.036 & 0.005 & 2.8 & 7.2 & 0.0032 \\
MetX & 4 & 3326 & IPR001356 & Homeodomain & 0.035 & 0.0049 & 2.8 & 7 & 2.8e-09 \\
MetX & 4 & 3326 & IPR011598 & Myc-type, basic helix-loop-helix (bHLH) domain & 0.035 & 0.005 & 2.8 & 6.9 & 0.0011 \\
MetX & 4 & 3326 & IPR017970 & Homeobox, conserved site & 0.035 & 0.0049 & 2.8 & 6.9 & 0.00012 \\
MetX & 4 & 3326 & IPR036236 & Zinc finger C2H2 superfamily & 0.035 & 0.0049 & 2.8 & 6.9 & 4.5e-10 \\
\bottomrule
\end{tabular}

}

Layer 4, Latent 3326, manual: H — top by A\_in
\setlength{\tabcolsep}{4pt}\small
\resizebox{\linewidth}{!}{
\begin{tabular}{lrrllrrrrr}
\toprule
Case & Layer & Latent ID & InterPro ID &

In [44]:
results = get_top_interpro_for_latents(feature_clusters_1pvg_flat, top_k=5)
for (layer, latent), tabs in results.items():
    print(f"\nLayer {layer}, Latent {latent} — top by L2FC")
    print(tabs["by_l2fc"].to_latex(index=False, float_format="{:.2g}".format, escape=False))
    print(f"\nLayer {layer}, Latent {latent} — top by A_in")
    print(tabs["by_ain"].to_latex(index=False, float_format="{:.2g}".format, escape=False))


Layer 4, Latent 2112 — top by L2FC
\begin{tabular}{llllllllll}
\toprule
Case & Layer & Latent ID & InterPro ID & InterPro Name & A_in & A_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
\bottomrule
\end{tabular}


Layer 4, Latent 2112 — top by A_in
\begin{tabular}{llllllllll}
\toprule
Case & Layer & Latent ID & InterPro ID & InterPro Name & A_in & A_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
\bottomrule
\end{tabular}


Layer 4, Latent 3170 — top by L2FC
\begin{tabular}{llllllllll}
\toprule
Case & Layer & Latent ID & InterPro ID & InterPro Name & A_in & A_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
\bottomrule
\end{tabular}


Layer 4, Latent 3170 — top by A_in
\begin{tabular}{llllllllll}
\toprule
Case & Layer & Latent ID & InterPro ID & InterPro Name & A_in & A_out & L2FC & Ratio & Bonferroni adj. p-value \\
\midrule
\bottomrule
\end{tabular}


Layer 4, Latent 1509 — top by L2FC
\begin{tabular}{llllllllll}
\toprule
Case & Layer & Latent ID & InterPro

In [36]:
def get_top_families_for_latents(layer_latent_pairs, 
                                latents_family_effect_size, 
                                latents_family_pvals, 
                                interpro_annotations_nonzero,
                                latent_ids_both_proteins,
                                n_tests,
                                top_k=5, 
                                require_pvalue=False, 
                                pvalue_threshold=0.05):
    """
    Get top K families for given layer-latent pairs based on both L2FC and raw activation.
    
    Parameters:
    -----------
    layer_latent_pairs : list of tuples
        List of (layer, latent_id) pairs. Layer should be the actual layer (4, 8, 12, etc.)
        and latent_id should be the latent index within that layer (0-4095).
    latents_family_effect_size : torch.Tensor
        Effect size tensor of shape (383, 10096, 4)
    latents_family_pvals : torch.Tensor  
        P-values tensor of shape (383, 10096)
    interpro_annotations_nonzero : pd.DataFrame
        DataFrame with InterPro annotations
    latent_ids_both_proteins : list
        Mapping from tensor indices to actual latent IDs
    n_tests : int
        Number of tests for Bonferroni correction
    top_k : int
        Number of top families to return (default 5)
    require_pvalue : bool
        Whether to filter by p-value threshold (default False)
    pvalue_threshold : float
        P-value threshold for Bonferroni correction (default 0.05)
    
    Returns:
    --------
    dict: Results for each latent with both L2FC and activation-based rankings
    """
    
    # Create mapping from actual latent ID to tensor index
    latent_id_to_index = {latent_id: i for i, latent_id in enumerate(latent_ids_both_proteins)}
    
    results = {}
    
    for layer, latent_id in layer_latent_pairs:
        # Convert layer and latent_id to actual latent ID used in the data
        actual_latent_id = ((layer // 4) - 1) * 4096 + latent_id
        
        # Check if this latent is in our data
        if actual_latent_id not in latent_id_to_index:
            print(f"Warning: Latent {actual_latent_id} (layer {layer}, latent {latent_id}) not found in data")
            continue
            
        latent_idx = latent_id_to_index[actual_latent_id]
        
        # Get data for this latent
        mean_in_family = latents_family_effect_size[latent_idx, :, 0].numpy()
        mean_out_family = latents_family_effect_size[latent_idx, :, 1].numpy()
        pvals = latents_family_pvals[latent_idx, :].numpy()
        
        # Calculate L2FC (with small epsilon to avoid division by zero)
        l2fc = np.log2(mean_in_family / (mean_out_family + 1e-8))
        
        # Create mask for families that pass p-value test if required
        if require_pvalue:
            bonf_corrected_pvals = pvals * n_tests
            pvalue_mask = bonf_corrected_pvals < pvalue_threshold
        else:
            pvalue_mask = np.ones_like(pvals, dtype=bool)
        
        # Get indices of families that pass the filter
        valid_families = np.where(pvalue_mask)[0]
        
        if len(valid_families) == 0:
            print(f"No families pass p-value threshold for latent {actual_latent_id}")
            results[f"layer_{layer}_latent_{latent_id}"] = {
                'actual_latent_id': actual_latent_id,
                'top_by_l2fc': [],
                'top_by_activation': []
            }
            continue
        
        # Method 1: Top families by L2FC
        l2fc_valid = l2fc[valid_families]
        l2fc_order = np.argsort(l2fc_valid)[::-1]  # Sort descending
        top_l2fc_indices = valid_families[l2fc_order[:top_k]]
        
        # Method 2: Top families by raw activation in family
        activation_valid = mean_in_family[valid_families]
        activation_order = np.argsort(activation_valid)[::-1]  # Sort descending
        top_activation_indices = valid_families[activation_order[:top_k]]
        
        # Create detailed results
        top_l2fc_details = []
        for idx in top_l2fc_indices:
            family_info = {
                'family_idx': int(idx),
                'interpro_id': interpro_annotations_nonzero.iloc[idx]['ENTRY_AC'],
                'interpro_name': interpro_annotations_nonzero.iloc[idx]['ENTRY_NAME'],
                'mean_in_family': float(mean_in_family[idx]),
                'mean_out_family': float(mean_out_family[idx]),
                'l2fc': float(l2fc[idx]),
                'pvalue': float(pvals[idx]),
                'bonf_pvalue': float(pvals[idx] * n_tests)
            }
            top_l2fc_details.append(family_info)
        
        top_activation_details = []
        for idx in top_activation_indices:
            family_info = {
                'family_idx': int(idx),
                'interpro_id': interpro_annotations_nonzero.iloc[idx]['ENTRY_AC'],
                'interpro_name': interpro_annotations_nonzero.iloc[idx]['ENTRY_NAME'],
                'mean_in_family': float(mean_in_family[idx]),
                'mean_out_family': float(mean_out_family[idx]),
                'l2fc': float(l2fc[idx]),
                'pvalue': float(pvals[idx]),
                'bonf_pvalue': float(pvals[idx] * n_tests)
            }
            top_activation_details.append(family_info)
        
        results[f"layer_{layer}_latent_{latent_id}"] = {
            'actual_latent_id': actual_latent_id,
            'top_by_l2fc': top_l2fc_details,
            'top_by_activation': top_activation_details
        }
    
    return results


def print_top_families_summary(results):
    """Print a nice summary of the results"""
    for latent_key, data in results.items():
        print(f"\n=== {latent_key.upper()} (Actual ID: {data['actual_latent_id']}) ===")
        
        print("\nTop 5 by L2FC:")
        for i, family in enumerate(data['top_by_l2fc'], 1):
            print(f"  {i}. {family['interpro_name']}")
            print(f"     ID: {family['interpro_id']}, L2FC: {family['l2fc']:.2f}")
            print(f"     In/Out: {family['mean_in_family']:.3f}/{family['mean_out_family']:.3f}")
            print(f"     Bonf p-val: {family['bonf_pvalue']:.2e}")
        
        print("\nTop 5 by Raw Activation:")
        for i, family in enumerate(data['top_by_activation'], 1):
            print(f"  {i}. {family['interpro_name']}")
            print(f"     ID: {family['interpro_id']}, Activation: {family['mean_in_family']:.3f}")
            print(f"     L2FC: {family['l2fc']:.2f}, Bonf p-val: {family['bonf_pvalue']:.2e}")


# Usage examples:
# Example 1: Get top families for specific layer-latent pairs without p-value filtering
layer_latent_pairs = [
    (12, 1204),  # Layer 12, latent 1204 
    (16, 1504),  # Layer 16, latent 1504
    (20, 142),   # Layer 20, latent 142
]

results_no_pval = get_top_families_for_latents(
    layer_latent_pairs=layer_latent_pairs,
    latents_family_effect_size=latents_family_effect_size,
    latents_family_pvals=latents_family_pvals,
    interpro_annotations_nonzero=interpro_annotations_nonzero,
    latent_ids_both_proteins=latent_ids_both_proteins,
    n_tests=n_tests,
    top_k=5,
    require_pvalue=False
)

print_top_families_summary(results_no_pval)


# Example 2: Get top families with p-value filtering
results_with_pval = get_top_families_for_latents(
    layer_latent_pairs=layer_latent_pairs,
    latents_family_effect_size=latents_family_effect_size,
    latents_family_pvals=latents_family_pvals,
    interpro_annotations_nonzero=interpro_annotations_nonzero,
    latent_ids_both_proteins=latent_ids_both_proteins,
    n_tests=n_tests,
    top_k=5,
    require_pvalue=True,
    pvalue_threshold=0.05
)

print("\n" + "="*80)
print("RESULTS WITH P-VALUE FILTERING:")
print("="*80)
print_top_families_summary(results_with_pval)


# Example 3: Convert to DataFrame for easier analysis
def results_to_dataframe(results, method='l2fc'):
    """Convert results to a pandas DataFrame"""
    rows = []
    for latent_key, data in results.items():
        layer = int(latent_key.split('_')[1])
        latent_id = int(latent_key.split('_')[3])
        
        method_key = 'top_by_l2fc' if method == 'l2fc' else 'top_by_activation'
        
        for rank, family in enumerate(data[method_key], 1):
            row = {
                'layer': layer,
                'latent_id': latent_id,
                'actual_latent_id': data['actual_latent_id'],
                'rank': rank,
                'interpro_id': family['interpro_id'],
                'interpro_name': family['interpro_name'],
                'mean_in_family': family['mean_in_family'],
                'mean_out_family': family['mean_out_family'],
                'l2fc': family['l2fc'],
                'pvalue': family['pvalue'],
                'bonf_pvalue': family['bonf_pvalue']
            }
            rows.append(row)
    
    return pd.DataFrame(rows)

# Create DataFrames
df_l2fc = results_to_dataframe(results_no_pval, method='l2fc')
df_activation = results_to_dataframe(results_no_pval, method='activation')

print("\nTop families by L2FC (DataFrame):")
print(df_l2fc.head(10))

print("\nTop families by activation (DataFrame):")
print(df_activation.head(10))


=== LAYER_12_LATENT_1204 (Actual ID: 9396) ===

Top 5 by L2FC:
  1. Heat shock protein Hsp90, N-terminal
     ID: IPR020575, L2FC: 8.45
     In/Out: 1.425/0.004
     Bonf p-val: 3.94e-16
  2. Heat shock protein Hsp90 family
     ID: IPR001404, L2FC: 8.45
     In/Out: 1.425/0.004
     Bonf p-val: 3.94e-16
  3. Heat shock protein Hsp90, conserved site
     ID: IPR019805, L2FC: 8.44
     In/Out: 1.456/0.004
     Bonf p-val: 2.94e-13
  4. DNA mismatch repair protein MutL/Mlh/Pms-like
     ID: IPR038973, L2FC: 8.38
     In/Out: 1.490/0.004
     Bonf p-val: 1.59e-07
  5. DNA mismatch repair protein, MutL
     ID: IPR020667, L2FC: 8.38
     In/Out: 1.490/0.004
     Bonf p-val: 1.59e-07

Top 5 by Raw Activation:
  1. MutL, C-terminal domain superfamily
     ID: IPR037198, Activation: 1.490
     L2FC: 8.38, Bonf p-val: 1.59e-07
  2. DNA mismatch repair protein, MutL
     ID: IPR020667, Activation: 1.490
     L2FC: 8.38, Bonf p-val: 1.59e-07
  3. DNA mismatch repair protein MutL/Mlh/Pms-like
  

  l2fc = np.log2(mean_in_family / (mean_out_family + 1e-8))


In [12]:
offsets

{'4': 0,
 '8': 4096,
 '12': 8192,
 '16': 12288,
 '20': 16384,
 '24': 20480,
 '28': 24576}

In [13]:
latent_ids_both_proteins[0:10]

[100, 181, 237, 340, 443, 495, 527, 601, 794, 798]

In [14]:
2112+8192

10304

In [15]:
latent_ids_both_proteins

[100,
 181,
 237,
 340,
 443,
 495,
 527,
 601,
 794,
 798,
 963,
 1096,
 1196,
 1297,
 1374,
 1468,
 1474,
 1509,
 1682,
 1690,
 1712,
 1807,
 1949,
 2005,
 2112,
 2277,
 2311,
 2340,
 2443,
 2511,
 2850,
 2929,
 2947,
 2983,
 3069,
 3153,
 3170,
 3177,
 3229,
 3326,
 3351,
 3544,
 3612,
 3634,
 3651,
 3701,
 3717,
 3764,
 3788,
 3832,
 4527,
 4584,
 5671,
 6262,
 6582,
 6758,
 6773,
 6871,
 7415,
 7480,
 8017,
 8403,
 8531,
 8757,
 8859,
 8861,
 8990,
 8995,
 9091,
 9096,
 9274,
 9313,
 9396,
 9448,
 9569,
 9606,
 9710,
 9988,
 10010,
 10249,
 10304,
 10342,
 10494,
 10664,
 10666,
 10669,
 10670,
 10704,
 10820,
 10989,
 11037,
 11227,
 11431,
 11690,
 11728,
 11735,
 11749,
 11790,
 11818,
 11943,
 11986,
 12000,
 12041,
 12135,
 12152,
 12211,
 12236,
 12334,
 12445,
 12645,
 12675,
 12679,
 12768,
 12894,
 12919,
 12996,
 13025,
 13088,
 13099,
 13146,
 13216,
 13356,
 13363,
 13454,
 13491,
 13497,
 13524,
 13568,
 13641,
 13668,
 13676,
 13687,
 13735,
 13737,
 13740,
 13748,
 

In [16]:
latent_ids_both_proteins.index(10304)

80

In [17]:
latent_ids_both_proteins[80]

10304

In [18]:
latents_family_pvals[80].shape

torch.Size([10096])

In [59]:
domain_annotations_threshold_3 = {}
domain_annotations_threshold_4 = {}
for pair in metx_latents.items():
    layer_ind = pair[0]
    for latent_ind in pair[1]:
        flat_ind = offsets[str(layer_ind)] + latent_ind
        epsilon = 1e-4
        latent_i = latent_ids_both_proteins.index(flat_ind)
        domain_annotations_threshold_3[f"{layer_ind}_{latent_ind}"] = []
        domain_annotations_threshold_4[f"{layer_ind}_{latent_ind}"] = []
        for family_j in range(latents_family_pvals.shape[1]):
            if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
                if latents_family_effect_size[latent_i,family_j,1] < 0.05 and latents_family_effect_size[latent_i,family_j,0] > 0.1:
                    l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
                    if l2fc > 3:
                        domain_annotations_threshold_3[f"{layer_ind}_{latent_ind}"] += [interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME, latents_family_effect_size[latent_i,family_j,0].item(), latents_family_effect_size[latent_i,family_j,1].item(), l2fc.item()]
                    if l2fc > 4:
                        domain_annotations_threshold_4[f"{layer_ind}_{latent_ind}"] += [interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME, latents_family_effect_size[latent_i,family_j,0].item(), latents_family_effect_size[latent_i,family_j,1].item(), l2fc.item()]

In [60]:
import json

# Save domain annotations with threshold 3 to JSON
with open('../results/domain_annotations_metx_threshold_3.json', 'w') as f:
    json.dump(domain_annotations_threshold_3, f, indent=2)

# Save domain annotations with threshold 4 to JSON 
with open('../results/domain_annotations_metx_threshold_4.json', 'w') as f:
    json.dump(domain_annotations_threshold_4, f, indent=2)

In [61]:
domain_annotations_threshold_3 = {}
domain_annotations_threshold_4 = {}
for pair in top2_latents.items():
    layer_ind = pair[0]
    for latent_ind in pair[1]:
        flat_ind = offsets[str(layer_ind)] + latent_ind
        epsilon = 1e-4
        latent_i = latent_ids_both_proteins.index(flat_ind)
        domain_annotations_threshold_3[f"{layer_ind}_{latent_ind}"] = []
        domain_annotations_threshold_4[f"{layer_ind}_{latent_ind}"] = []
        for family_j in range(latents_family_pvals.shape[1]):
            if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
                if latents_family_effect_size[latent_i,family_j,1] < 0.05 and latents_family_effect_size[latent_i,family_j,0] > 0.1:
                    l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
                    if l2fc > 3:
                        domain_annotations_threshold_3[f"{layer_ind}_{latent_ind}"] += [interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME, latents_family_effect_size[latent_i,family_j,0].item(), latents_family_effect_size[latent_i,family_j,1].item(), l2fc.item()]
                    if l2fc > 4:
                        domain_annotations_threshold_4[f"{layer_ind}_{latent_ind}"] += [interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME, latents_family_effect_size[latent_i,family_j,0].item(), latents_family_effect_size[latent_i,family_j,1].item(), l2fc.item()]

In [62]:
import json

# Save domain annotations with threshold 3 to JSON
with open('../results/domain_annotations_top2_threshold_3.json', 'w') as f:
    json.dump(domain_annotations_threshold_3, f, indent=2)

# Save domain annotations with threshold 4 to JSON 
with open('../results/domain_annotations_top2_threshold_4.json', 'w') as f:
    json.dump(domain_annotations_threshold_4, f, indent=2)

In [47]:
domain_annotations

{('4', 2443): [],
 ('4', 3651): [],
 ('4', 963): [],
 ('4', 340): [],
 ('4', 237): [],
 ('4', 1474): [],
 ('4', 794): [],
 ('4', 443): [],
 ('4', 2340): [],
 ('4', 3788): [],
 ('4', 3701): [],
 ('4', 2311): [],
 ('4', 2277): [],
 ('4', 3153): [],
 ('4', 798): [],
 ('4', 3634): [],
 ('4', 1682): [],
 ('4', 1690): [],
 ('4', 3764): [],
 ('4', 3326): [],
 ('4', 1096): [],
 ('4', 3351): [],
 ('4', 1712): [],
 ('4', 181): [],
 ('4', 3177): [],
 ('4', 3832): [],
 ('4', 1807): [],
 ('4', 3612): [],
 ('4', 495): [],
 ('4',
  1297): ['Large ribosomal subunit protein uL30-like, ferredoxin-like fold domain', tensor(0.0521), tensor(0.0034), tensor(3.9415), 'Large ribosomal subunit protein bL36', tensor(0.0731), tensor(0.0033), tensor(4.4604), 'Small ribosomal subunit protein uS14', tensor(0.0292), tensor(0.0034), tensor(3.0965), 'Large ribosomal subunit protein uL30, bacterial-type', tensor(0.0573), tensor(0.0034), tensor(4.0791), 'Large ribosomal subunit protein bL36 superfamily', tensor(0.0731),

In [None]:
n_tests = 28672*10096

layer_ind = 12
latent_ind = 1256   
flat_ind = offsets[str(layer_ind)] + latent_ind
epsilon = 1e-6
latent_i = latent_ids_both_proteins.index(flat_ind) #80
annotated_latents = []
print(layer_ind, latent_ind)
for family_j in range(latents_family_pvals.shape[1]):
    if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
        if latents_family_effect_size[latent_i,family_j,1] < 0.2:
            l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
            # print(f"L2FC: {l2fc}")
            if l2fc > 3: #if latents_family_effect_size[latent_i,family_j,0] > 0.3:
                annotated_latents += [latent_ids_both_proteins[latent_i]]
                print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                print(f"    Family {interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME}")
                print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")

12 1256
Latent 9448 is sig assoc w family 134
    Family EF-Hand 1, calcium-binding site
    Mean in-family: 0.5439120531082153
    Mean out-of-family: 0.04676192253828049
    Log2FC: 3.5399672985076904
Latent 9448 is sig assoc w family 787
    Family Phosphoribosyltransferase domain
    Mean in-family: 0.39505958557128906
    Mean out-of-family: 0.04600566625595093
    Log2FC: 3.102186918258667
Latent 9448 is sig assoc w family 1010
    Family EF-hand domain
    Mean in-family: 0.5556498765945435
    Mean out-of-family: 0.04653194919228554
    Log2FC: 3.577882766723633
Latent 9448 is sig assoc w family 2727
    Family OBG-type guanine nucleotide-binding (G) domain
    Mean in-family: 0.6347910761833191
    Mean out-of-family: 0.04703279212117195
    Log2FC: 3.7545430660247803
Latent 9448 is sig assoc w family 4162
    Family Small GTPase
    Mean in-family: 0.3918114900588989
    Mean out-of-family: 0.047304440289735794
    Log2FC: 3.050112247467041
Latent 9448 is sig assoc w family 4

In [44]:
n_tests = 28672*10096

layer_ind = 12
latent_ind = 1256   
flat_ind = offsets[str(layer_ind)] + latent_ind
epsilon = 1e-6
latent_i = latent_ids_both_proteins.index(flat_ind) #80
annotated_latents = []
print(layer_ind, latent_ind)
for family_j in range(latents_family_pvals.shape[1]):
    if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
        if latents_family_effect_size[latent_i,family_j,1] < 0.2:
            l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
            # print(f"L2FC: {l2fc}")
            if l2fc > 3: #if latents_family_effect_size[latent_i,family_j,0] > 0.3:
                annotated_latents += [latent_ids_both_proteins[latent_i]]
                print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                print(f"    Family {interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME}")
                print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")

12 1256
Latent 9448 is sig assoc w family 134
    Family EF-Hand 1, calcium-binding site
    Mean in-family: 0.5439120531082153
    Mean out-of-family: 0.04676192253828049
    Log2FC: 3.5399672985076904
Latent 9448 is sig assoc w family 787
    Family Phosphoribosyltransferase domain
    Mean in-family: 0.39505958557128906
    Mean out-of-family: 0.04600566625595093
    Log2FC: 3.102186918258667
Latent 9448 is sig assoc w family 1010
    Family EF-hand domain
    Mean in-family: 0.5556498765945435
    Mean out-of-family: 0.04653194919228554
    Log2FC: 3.577882766723633
Latent 9448 is sig assoc w family 2727
    Family OBG-type guanine nucleotide-binding (G) domain
    Mean in-family: 0.6347910761833191
    Mean out-of-family: 0.04703279212117195
    Log2FC: 3.7545430660247803
Latent 9448 is sig assoc w family 4162
    Family Small GTPase
    Mean in-family: 0.3918114900588989
    Mean out-of-family: 0.047304440289735794
    Log2FC: 3.050112247467041
Latent 9448 is sig assoc w family 4

In [None]:
n_tests = 28672*10096

layer_ind = 12
latent_ind = 1256   
flat_ind = offsets[str(layer_ind)] + latent_ind
epsilon = 1e-6
latent_i = latent_ids_both_proteins.index(flat_ind) #80
annotated_latents = []
print(layer_ind, latent_ind)
for family_j in range(latents_family_pvals.shape[1]):
    if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
        if latents_family_effect_size[latent_i,family_j,1] < 0.2:
            l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
            # print(f"L2FC: {l2fc}")
            if l2fc > 3: #if latents_family_effect_size[latent_i,family_j,0] > 0.3:
                annotated_latents += [latent_ids_both_proteins[latent_i]]
                print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                print(f"    Family {interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME}")
                print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")

12 1256
Latent 9448 is sig assoc w family 134
    Family EF-Hand 1, calcium-binding site
    Mean in-family: 0.5439120531082153
    Mean out-of-family: 0.04676192253828049
    Log2FC: 3.5399672985076904
Latent 9448 is sig assoc w family 787
    Family Phosphoribosyltransferase domain
    Mean in-family: 0.39505958557128906
    Mean out-of-family: 0.04600566625595093
    Log2FC: 3.102186918258667
Latent 9448 is sig assoc w family 1010
    Family EF-hand domain
    Mean in-family: 0.5556498765945435
    Mean out-of-family: 0.04653194919228554
    Log2FC: 3.577882766723633
Latent 9448 is sig assoc w family 2727
    Family OBG-type guanine nucleotide-binding (G) domain
    Mean in-family: 0.6347910761833191
    Mean out-of-family: 0.04703279212117195
    Log2FC: 3.7545430660247803
Latent 9448 is sig assoc w family 4162
    Family Small GTPase
    Mean in-family: 0.3918114900588989
    Mean out-of-family: 0.047304440289735794
    Log2FC: 3.050112247467041
Latent 9448 is sig assoc w family 4

In [None]:
n_tests = 28672*10096

layer_ind = 12
latent_ind = 1256   
flat_ind = offsets[str(layer_ind)] + latent_ind
epsilon = 1e-6
latent_i = latent_ids_both_proteins.index(flat_ind) #80
annotated_latents = []
print(layer_ind, latent_ind)
for family_j in range(latents_family_pvals.shape[1]):
    if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
        if latents_family_effect_size[latent_i,family_j,1] < 0.2:
            l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
            # print(f"L2FC: {l2fc}")
            if l2fc > 3: #if latents_family_effect_size[latent_i,family_j,0] > 0.3:
                annotated_latents += [latent_ids_both_proteins[latent_i]]
                print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                print(f"    Family {interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME}")
                print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")

12 1256
Latent 9448 is sig assoc w family 134
    Family EF-Hand 1, calcium-binding site
    Mean in-family: 0.5439120531082153
    Mean out-of-family: 0.04676192253828049
    Log2FC: 3.5399672985076904
Latent 9448 is sig assoc w family 787
    Family Phosphoribosyltransferase domain
    Mean in-family: 0.39505958557128906
    Mean out-of-family: 0.04600566625595093
    Log2FC: 3.102186918258667
Latent 9448 is sig assoc w family 1010
    Family EF-hand domain
    Mean in-family: 0.5556498765945435
    Mean out-of-family: 0.04653194919228554
    Log2FC: 3.577882766723633
Latent 9448 is sig assoc w family 2727
    Family OBG-type guanine nucleotide-binding (G) domain
    Mean in-family: 0.6347910761833191
    Mean out-of-family: 0.04703279212117195
    Log2FC: 3.7545430660247803
Latent 9448 is sig assoc w family 4162
    Family Small GTPase
    Mean in-family: 0.3918114900588989
    Mean out-of-family: 0.047304440289735794
    Log2FC: 3.050112247467041
Latent 9448 is sig assoc w family 4

In [None]:
n_tests = 28672*10096

layer_ind = 12
latent_ind = 1256   
flat_ind = offsets[str(layer_ind)] + latent_ind
epsilon = 1e-6
latent_i = latent_ids_both_proteins.index(flat_ind) #80
annotated_latents = []
print(layer_ind, latent_ind)
for family_j in range(latents_family_pvals.shape[1]):
    if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
        if latents_family_effect_size[latent_i,family_j,1] < 0.2:
            l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
            # print(f"L2FC: {l2fc}")
            if l2fc > 3: #if latents_family_effect_size[latent_i,family_j,0] > 0.3:
                annotated_latents += [latent_ids_both_proteins[latent_i]]
                print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                print(f"    Family {interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME}")
                print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")

12 1256
Latent 9448 is sig assoc w family 134
    Family EF-Hand 1, calcium-binding site
    Mean in-family: 0.5439120531082153
    Mean out-of-family: 0.04676192253828049
    Log2FC: 3.5399672985076904
Latent 9448 is sig assoc w family 787
    Family Phosphoribosyltransferase domain
    Mean in-family: 0.39505958557128906
    Mean out-of-family: 0.04600566625595093
    Log2FC: 3.102186918258667
Latent 9448 is sig assoc w family 1010
    Family EF-hand domain
    Mean in-family: 0.5556498765945435
    Mean out-of-family: 0.04653194919228554
    Log2FC: 3.577882766723633
Latent 9448 is sig assoc w family 2727
    Family OBG-type guanine nucleotide-binding (G) domain
    Mean in-family: 0.6347910761833191
    Mean out-of-family: 0.04703279212117195
    Log2FC: 3.7545430660247803
Latent 9448 is sig assoc w family 4162
    Family Small GTPase
    Mean in-family: 0.3918114900588989
    Mean out-of-family: 0.047304440289735794
    Log2FC: 3.050112247467041
Latent 9448 is sig assoc w family 4

In [None]:
n_tests = 28672*10096

layer_ind = 12
latent_ind = 1256   
flat_ind = offsets[str(layer_ind)] + latent_ind
epsilon = 1e-6
latent_i = latent_ids_both_proteins.index(flat_ind) #80
annotated_latents = []
print(layer_ind, latent_ind)
for family_j in range(latents_family_pvals.shape[1]):
    if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
        if latents_family_effect_size[latent_i,family_j,1] < 0.2:
            l2fc = np.log2((latents_family_effect_size[latent_i,family_j,0]+epsilon)/(latents_family_effect_size[latent_i,family_j,1]+epsilon))
            # print(f"L2FC: {l2fc}")
            if l2fc > 3: #if latents_family_effect_size[latent_i,family_j,0] > 0.3:
                annotated_latents += [latent_ids_both_proteins[latent_i]]
                print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                print(f"    Family {interpro_annotations_nonzero.iloc[family_j].ENTRY_NAME}")
                print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")

12 1256
Latent 9448 is sig assoc w family 134
    Family EF-Hand 1, calcium-binding site
    Mean in-family: 0.5439120531082153
    Mean out-of-family: 0.04676192253828049
    Log2FC: 3.5399672985076904
Latent 9448 is sig assoc w family 787
    Family Phosphoribosyltransferase domain
    Mean in-family: 0.39505958557128906
    Mean out-of-family: 0.04600566625595093
    Log2FC: 3.102186918258667
Latent 9448 is sig assoc w family 1010
    Family EF-hand domain
    Mean in-family: 0.5556498765945435
    Mean out-of-family: 0.04653194919228554
    Log2FC: 3.577882766723633
Latent 9448 is sig assoc w family 2727
    Family OBG-type guanine nucleotide-binding (G) domain
    Mean in-family: 0.6347910761833191
    Mean out-of-family: 0.04703279212117195
    Log2FC: 3.7545430660247803
Latent 9448 is sig assoc w family 4162
    Family Small GTPase
    Mean in-family: 0.3918114900588989
    Mean out-of-family: 0.047304440289735794
    Log2FC: 3.050112247467041
Latent 9448 is sig assoc w family 4

In [None]:
import numpy as np

mu_out = latents_family_effect_size[:, :, 1]
mask = mu_out < 0.05

num_true = int(np.count_nonzero(mask))
total = int(mask.size)
print(f"{num_true} / {total} ({num_true/total:.2%}) latent–family pairs with mu_out < 0.05")

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'builtin_function_or_method'

In [42]:
import numpy as np
import pandas as pd

# Assumes:
# - latents_family_effect_size shape: [num_latents, num_families, 2]
#   [:, :, 0] = in-family mean, [:, :, 1] = out-of-family mean
# - latents_family_pvals shape: [num_latents, num_families]
# - latent_ids_both_proteins is a list mapping latent index -> global latent id
# - interpro_annotations_nonzero has column 'ENTRY_NAME' indexed by family_j

mu_in  = latents_family_effect_size[:, :, 0]
mu_out = latents_family_effect_size[:, :, 1]

# Filter on out-of-family mean < 0.05
mask = mu_out < 0.05

# Optional: small epsilon to avoid inf when mu_out≈0; set to 0.0 for exact ratios
eps = 0.0  # e.g., use 1e-6 if you prefer stabilization

ratio = (mu_in + eps) / (mu_out + eps)
l2fc  = np.log2(ratio)

rows = []
ii, jj = np.where(mask)
for i, j in zip(ii, jj):
    rows.append({
        "latent_idx": int(i),
        "latent_id": int(latent_ids_both_proteins[i]) if i < len(latent_ids_both_proteins) else i,
        "family_idx": int(j),
        "family_name": str(interpro_annotations_nonzero.iloc[j].ENTRY_NAME) if hasattr(interpro_annotations_nonzero.iloc[j], "ENTRY_NAME") else j,
        "mu_in": float(mu_in[i, j]),
        "mu_out": float(mu_out[i, j]),
        "ratio": float(ratio[i, j]),
        "log2fc": float(l2fc[i, j]),
        "pval": float(latents_family_pvals[i, j]) if 'latents_family_pvals' in globals() else np.nan,
    })

df_ratios = pd.DataFrame(rows).sort_values(["ratio", "log2fc"], ascending=[False, False]).reset_index(drop=True)
df_ratios.head(20)

  l2fc  = np.log2(ratio)


KeyboardInterrupt: 

In [6]:
n_tests

289472512

In [7]:
np.argmin(latents_family_pvals[0])

tensor(8760)

In [15]:
for latent_i in range(latents_family_pvals.shape[0]):
    print(latent_i)
    break

0


In [17]:
latents_family_pvals.shape

torch.Size([383, 10096])

In [41]:
annotated_latents = []
for latent_i in range(latents_family_pvals.shape[0]):
    for family_j in range(latents_family_pvals.shape[1]):
        if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
            if latents_family_effect_size[latent_i,family_j,1] < 0.05:
                l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])
                if latents_family_effect_size[latent_i,family_j,0] > 1.05:
                    annotated_latents += [latent_ids_both_proteins[latent_i]]
                    print(f"Latent {latent_ids_both_proteins[latent_i]} is sig assoc w family {family_j}")
                    print(f"    Family {interpro_annotations_nonzero.iloc[family_j]["ENTRY_NAME"]}")
                    print(f"    Mean in-family: {latents_family_effect_size[latent_i,family_j,0]}")
                    print(f"    Mean out-of-family: {latents_family_effect_size[latent_i,family_j,1]}")
                    print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")
               

  l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])
  l2fc = np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])


Latent 9396 is sig assoc w family 274
    Family DNA mismatch repair, conserved site
    Mean in-family: 1.4902369976043701
    Mean out-of-family: 0.004479201976209879
    Log2FC: 8.378084182739258
Latent 9396 is sig assoc w family 457
    Family Heat shock protein Hsp90, conserved site
    Mean in-family: 1.4559404850006104
    Mean out-of-family: 0.00420586671680212
    Log2FC: 8.435332298278809
Latent 9396 is sig assoc w family 2000
    Family DNA mismatch repair protein, S5 domain 2-like
    Mean in-family: 1.4902369976043701
    Mean out-of-family: 0.004479201976209879
    Log2FC: 8.378084182739258
Latent 9396 is sig assoc w family 2113
    Family MutL, C-terminal, dimerisation
    Mean in-family: 1.4902369976043701
    Mean out-of-family: 0.004479201976209879
    Log2FC: 8.378084182739258
Latent 9396 is sig assoc w family 2370
    Family Heat shock protein Hsp90, N-terminal
    Mean in-family: 1.4246454238891602
    Mean out-of-family: 0.004085632972419262
    Log2FC: 8.44582748

  print(f"    Log2FC: {np.log2(latents_family_effect_size[latent_i,family_j,0]/latents_family_effect_size[latent_i,family_j,1])}")


Latent 9988 is sig assoc w family 457
    Family Heat shock protein Hsp90, conserved site
    Mean in-family: 1.5989656448364258
    Mean out-of-family: 0.008104116655886173
    Log2FC: 7.624268054962158
Latent 9988 is sig assoc w family 1229
    Family Signal transduction histidine kinase, dimerisation/phosphoacceptor domain
    Mean in-family: 1.7447575330734253
    Mean out-of-family: 0.008175770752131939
    Log2FC: 7.737456321716309
Latent 9988 is sig assoc w family 1324
    Family Signal transduction histidine kinase-related protein, C-terminal
    Mean in-family: 1.785921335220337
    Mean out-of-family: 0.007973155938088894
    Log2FC: 7.807301998138428
Latent 9988 is sig assoc w family 1401
    Family Histidine kinase domain
    Mean in-family: 1.785921335220337
    Mean out-of-family: 0.007973155938088894
    Log2FC: 7.807301998138428
Latent 9988 is sig assoc w family 2370
    Family Heat shock protein Hsp90, N-terminal
    Mean in-family: 1.5625734329223633
    Mean out-of-f

In [42]:
ann_latents_list = list(set(annotated_latents))

In [43]:
len(ann_latents_list)

30

In [48]:
len(set(full_indices_metx))

273

In [47]:
len(set(full_indices_metx).intersection(ann_latents_list))

18

In [49]:
len(set(full_indices_top2))

134

In [46]:
len(set(full_indices_top2).intersection(ann_latents_list))

12

In [50]:
# For a table:
# Layer | Latent ID (#) | IntroPro ID | InterPro Name | out-of-family | in-family | Bonf p-value

In [64]:
metx_results_df_dict = {}
for latent_i in range(latents_family_pvals.shape[0]):
    for family_j in range(latents_family_pvals.shape[1]):
        if latents_family_pvals[latent_i,family_j] < (0.05/n_tests):
            if latents_family_effect_size[latent_i,family_j,1] < 0.05:
                if latents_family_effect_size[latent_i,family_j,0] > 1.05:

                    df_id = str(latent_i)+"_"+str(family_j)
                    cur_lat_id = latent_ids_both_proteins[latent_i]
                    layer_id = (cur_lat_id // 4096)*4
                    lat_id = cur_lat_id % 4096

                    case_ptn = "MetX"
                    if latent_ids_both_proteins[latent_i] in full_indices_top2:
                        case_ptn="Top2"

                    metx_results_df_dict[df_id] = {
                        "Case": case_ptn,
                        "Layer": layer_id,
                        "Latent ID": lat_id,
                        "InterPro ID": interpro_annotations_nonzero.iloc[family_j]["ENTRY_AC"],
                        "InterPro Name": interpro_annotations_nonzero.iloc[family_j]["ENTRY_NAME"],
                        "Activation w/ domain": float(latents_family_effect_size[latent_i,family_j,0]),
                        "Activation w/o domain": float(latents_family_effect_size[latent_i,family_j,1]),
                        "Bonferroni adj. p-value": float(latents_family_pvals[latent_i,family_j]*n_tests)
                    }

In [65]:
df_metx_results = pd.DataFrame.from_dict(metx_results_df_dict, orient='index')
df_metx_results.head()

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,Activation w/ domain,Activation w/o domain,Bonferroni adj. p-value
72_274,Top2,8,1204,IPR014762,"DNA mismatch repair, conserved site",1.490237,0.004479,1.594212e-07
72_457,Top2,8,1204,IPR019805,"Heat shock protein Hsp90, conserved site",1.45594,0.004206,2.943538e-13
72_2000,Top2,8,1204,IPR013507,"DNA mismatch repair protein, S5 domain 2-like",1.490237,0.004479,1.594212e-07
72_2113,Top2,8,1204,IPR014790,"MutL, C-terminal, dimerisation",1.490237,0.004479,1.594212e-07
72_2370,Top2,8,1204,IPR020575,"Heat shock protein Hsp90, N-terminal",1.424645,0.004086,3.936193e-16


In [72]:
df_metx_results.query("Case == 'MetX'") #.sort_values(by="Bonferroni adj. p-value")

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,Activation w/ domain,Activation w/o domain,Bonferroni adj. p-value
136_659,MetX,12,1504,IPR000073,Alpha/beta hydrolase fold-1,1.283272,0.026213,8.708278e-06
136_9059,MetX,12,1504,IPR029058,Alpha/Beta hydrolase fold,1.323345,0.020013,0.0
145_9059,MetX,12,2175,IPR029058,Alpha/Beta hydrolase fold,1.561086,0.022779,0.0
154_9059,MetX,12,2836,IPR029058,Alpha/Beta hydrolase fold,1.325072,0.045009,3.001717e-35
170_4594,MetX,12,3765,IPR003824,Undecaprenyl-diphosphatase UppP,1.132152,0.032733,2.038676e-06
177_797,MetX,16,142,IPR000873,AMP-dependent synthetase/ligase domain,2.935064,0.044304,0.01535564
177_2169,MetX,16,142,IPR015590,Aldehyde dehydrogenase domain,2.456921,0.043235,1.386195e-07
177_8825,MetX,16,142,IPR016161,Aldehyde/histidinol dehydrogenase,2.45572,0.04227,1.722333e-10
177_8826,MetX,16,142,IPR016162,"Aldehyde dehydrogenase, N-terminal",2.456921,0.043235,1.386195e-07
177_8827,MetX,16,142,IPR016163,"Aldehyde dehydrogenase, C-terminal",2.456921,0.043235,1.386195e-07


In [73]:
df_metx_results.query("Case == 'Top2'") #.sort_values(by="Bonferroni adj. p-value")

Unnamed: 0,Case,Layer,Latent ID,InterPro ID,InterPro Name,Activation w/ domain,Activation w/o domain,Bonferroni adj. p-value
72_274,Top2,8,1204,IPR014762,"DNA mismatch repair, conserved site",1.490237,0.004479,1.594212e-07
72_457,Top2,8,1204,IPR019805,"Heat shock protein Hsp90, conserved site",1.45594,0.004206,2.943538e-13
72_2000,Top2,8,1204,IPR013507,"DNA mismatch repair protein, S5 domain 2-like",1.490237,0.004479,1.594212e-07
72_2113,Top2,8,1204,IPR014790,"MutL, C-terminal, dimerisation",1.490237,0.004479,1.594212e-07
72_2370,Top2,8,1204,IPR020575,"Heat shock protein Hsp90, N-terminal",1.424645,0.004086,3.936193e-16
72_4065,Top2,8,1204,IPR001404,Heat shock protein Hsp90 family,1.424645,0.004086,3.936193e-16
72_4238,Top2,8,1204,IPR002099,DNA mismatch repair protein MutL/Mlh/PMS,1.490237,0.004479,1.594212e-07
72_6308,Top2,8,1204,IPR020667,"DNA mismatch repair protein, MutL",1.490237,0.004479,1.594212e-07
72_7213,Top2,8,1204,IPR038973,DNA mismatch repair protein MutL/Mlh/Pms-like,1.490237,0.004479,1.594212e-07
72_9682,Top2,8,1204,IPR037196,"HSP90, C-terminal domain",1.205773,0.004381,3.014633e-13


In [71]:
df_metx_results.to_csv('circuit_domain_detectors.csv', index=False)

In [83]:
print(df_metx_results.query("Case == 'Top2'").drop("Case", axis=1).to_latex(
    index=False, float_format="{:.2g}".format))

\begin{tabular}{rrllrrr}
\toprule
Layer & Latent ID & InterPro ID & InterPro Name & Activation w/ domain & Activation w/o domain & Bonferroni adj. p-value \\
\midrule
8 & 1204 & IPR014762 & DNA mismatch repair, conserved site & 1.5 & 0.0045 & 1.6e-07 \\
8 & 1204 & IPR019805 & Heat shock protein Hsp90, conserved site & 1.5 & 0.0042 & 2.9e-13 \\
8 & 1204 & IPR013507 & DNA mismatch repair protein, S5 domain 2-like & 1.5 & 0.0045 & 1.6e-07 \\
8 & 1204 & IPR014790 & MutL, C-terminal, dimerisation & 1.5 & 0.0045 & 1.6e-07 \\
8 & 1204 & IPR020575 & Heat shock protein Hsp90, N-terminal & 1.4 & 0.0041 & 3.9e-16 \\
8 & 1204 & IPR001404 & Heat shock protein Hsp90 family & 1.4 & 0.0041 & 3.9e-16 \\
8 & 1204 & IPR002099 & DNA mismatch repair protein MutL/Mlh/PMS & 1.5 & 0.0045 & 1.6e-07 \\
8 & 1204 & IPR020667 & DNA mismatch repair protein, MutL & 1.5 & 0.0045 & 1.6e-07 \\
8 & 1204 & IPR038973 & DNA mismatch repair protein MutL/Mlh/Pms-like & 1.5 & 0.0045 & 1.6e-07 \\
8 & 1204 & IPR037196 & HSP90, 

In [84]:
print(df_metx_results.query("Case == 'MetX'").drop("Case", axis=1).to_latex(
    index=False, float_format="{:.2g}".format))

\begin{tabular}{rrllrrr}
\toprule
Layer & Latent ID & InterPro ID & InterPro Name & Activation w/ domain & Activation w/o domain & Bonferroni adj. p-value \\
\midrule
12 & 1504 & IPR000073 & Alpha/beta hydrolase fold-1 & 1.3 & 0.026 & 8.7e-06 \\
12 & 1504 & IPR029058 & Alpha/Beta hydrolase fold & 1.3 & 0.02 & 0 \\
12 & 2175 & IPR029058 & Alpha/Beta hydrolase fold & 1.6 & 0.023 & 0 \\
12 & 2836 & IPR029058 & Alpha/Beta hydrolase fold & 1.3 & 0.045 & 3e-35 \\
12 & 3765 & IPR003824 & Undecaprenyl-diphosphatase UppP & 1.1 & 0.033 & 2e-06 \\
16 & 142 & IPR000873 & AMP-dependent synthetase/ligase domain & 2.9 & 0.044 & 0.015 \\
16 & 142 & IPR015590 & Aldehyde dehydrogenase domain & 2.5 & 0.043 & 1.4e-07 \\
16 & 142 & IPR016161 & Aldehyde/histidinol dehydrogenase & 2.5 & 0.042 & 1.7e-10 \\
16 & 142 & IPR016162 & Aldehyde dehydrogenase, N-terminal & 2.5 & 0.043 & 1.4e-07 \\
16 & 142 & IPR016163 & Aldehyde dehydrogenase, C-terminal & 2.5 & 0.043 & 1.4e-07 \\
16 & 142 & IPR045851 & AMP-binding e