# COMPARTMENTS Experimental Protein Localization Evidence Scores 2025
This notebook contains the scripts used to process the COMPARTMENTS Experimental Protein Localization Evidence Scores 2025 dataset for Harmonizome.

The full experimental cellular component associations were downloaded from [Jensen COMPARTMENTS](https://compartments.jensenlab.org/Downloads) on 4/8/25.

In [None]:
import pandas as pd
import datetime
import goenrich
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
from tqdm import tqdm
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Load the NCBI gene_info table and keep only human protein-coding genes.
gene_info = pd.read_csv('../../../tables/Homo_sapiens.gene_info.gz', sep='\t', compression='gzip')
is_human = gene_info['#tax_id'] == 9606
is_protein_coding = gene_info['type_of_gene'] == 'protein-coding'
# .copy() detaches the filtered rows from the full table so that the column
# assignment below does not raise pandas' SettingWithCopyWarning.
gene_info = gene_info[is_human & is_protein_coding].copy()
# Symbols are upper-cased so every later lookup can be case-insensitive.
gene_info['Symbol'] = gene_info['Symbol'].str.upper()
symbols = set(gene_info['Symbol'])
# Upper-cased symbol -> GeneID lookup.
geneids = gene_info.set_index('Symbol')['GeneID'].to_dict()
gene_info

In [None]:
# Build an upper-cased lookup from every synonym (and every official symbol)
# to the official symbol it belongs to.
synonyms = gene_info[['Symbol', 'Synonyms']].copy()
synonyms['Synonyms'] = synonyms['Synonyms'].str.upper().str.split('|')
synonyms = synonyms.explode('Synonyms').dropna(subset=['Synonyms'])
# gene_info uses '-' as the placeholder for "no synonyms".
synonyms = synonyms[synonyms['Synonyms'] != '-']
synonyms = synonyms.set_index('Synonyms')['Symbol'].to_dict()
# An official symbol must always resolve to itself. Applying the identity
# mapping last stops another gene's synonym that happens to equal an official
# symbol from redirecting that symbol to the wrong gene.
synonyms.update({symbol: symbol for symbol in gene_info['Symbol'].dropna()})

In [None]:
# Load the full COMPARTMENTS experiments channel (the file has no header row).
experimental = pd.read_csv('human_compartment_experiments_full.tsv', sep='\t', header=None)
experimental.columns = ['Gene ID', 'Gene', 'Component ID', 'Component',  'Source', 'Source_Score', 'Confidence']
# The synonym lookup is keyed by upper-cased names, so upper-case before mapping;
# names that cannot be resolved become NaN and are dropped further down.
experimental['Gene'] = experimental['Gene'].str.upper().map(synonyms)
# Discard rows whose component name contains 'GO:'.
experimental = experimental[~experimental['Component'].str.contains('GO:')]
# Keep the HIGHEST confidence per gene-component pair: sort descending so that
# drop_duplicates (keep='first') retains the strongest evidence, matching how
# the association table is de-duplicated later in this notebook.
experimental = (
    experimental.sort_values('Confidence', ascending=False)
    .drop_duplicates(subset=['Gene', 'Component'], keep='first')
    .reset_index(drop=True)
)
experimental = experimental[['Gene', 'Gene ID', 'Component', 'Component ID', 'Confidence']].copy()
# Replace the identifier shipped in the file with the GeneID from gene_info.
experimental['Gene ID'] = experimental['Gene'].map(geneids)
experimental = experimental.dropna().reset_index(drop=True)
experimental['Gene ID'] = experimental['Gene ID'].astype(int)
# Cap every component at its 1000 highest-confidence genes.
filtered = experimental.sort_values('Confidence', ascending=False).groupby(['Component', 'Component ID']).head(1000).reset_index(drop=True)
experimental

## Process Data for SQL Ingestion

### Dataset

In [None]:
# Dataset row for SQL ingestion. The first value (180) is the dataset id; it is
# the same value written as the dataset foreign key in the gene set rows below.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(180, 'COMPARTMENTS Experimental Protein Localization Evidence Scores 2025', 'Experimental Protein Localization Evidence Scores 2025', 'Protein subcellular localization evidence scores by integrating experimental data', 'protein-cellular component associations by integrating evidence from experimental studies', 'proteins localized to the {0} cellular component in low- or high-throughput protein localization assays from the COMPARTMENTS Experimental Protein Localization Evidence Scores 2025 dataset.', 'sets of proteins localized to cellular components in low- or high-throughput protein localization assays from the COMPARTMENTS Experimental Protein Localization Evidence Scores 2025 dataset.', 'cellular components containing {0} protein in low- or high-throughput protein localization assays from the COMPARTMENTS Experimental Protein Localization Evidence Scores 2025 dataset.', 0, 1, '2025-04-28', 'jensencompartmentexpts25', 0, 13, 3, 5, 19, 4, 'association by data aggregation', 'curated experimental data', 'mixed', 'cellular components')

### Gene

In [None]:
# Existing gene table: upper-cased symbol -> primary key.
genes = pd.read_csv('../../../tables/gene.csv')
genes['symbol'] = genes['symbol'].str.upper()
genelist = set(genes['symbol'])
genefks = genes.set_index('symbol')['id'].to_dict()
# De-duplicate on the Symbol KEY, not on the values. Calling drop_duplicates()
# on the value Series removed every gene whose description (e.g. a generic
# one shared by several genes) had already been seen, so looking that gene up
# afterwards raised KeyError.
unique_symbols = gene_info.drop_duplicates(subset='Symbol').set_index('Symbol')
geneids = unique_symbols['GeneID']
genedescs = unique_symbols['description']

In [None]:
# Print one row per gene that is not yet in the gene table, as
# (index, symbol, GeneID, description, NCBI gene URL), and record the key
# assigned to it in genefks.
index = 58820
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

missing_symbols = [
    symbol
    for symbol in filtered['Gene'].str.upper().unique()
    if symbol not in genelist
]
for gene in missing_symbols:
    geneid = geneids[gene]
    gene_row = (index, gene, geneid, genedescs[gene], geneurl+str(geneid))
    print(gene_row, end=',\n')
    genefks[gene] = index
    index += 1

### Attribute

In [None]:
# Existing attribute table: lower-cased name -> primary key.
attributes = pd.read_csv('../../../tables/attribute.tsv', sep='\t')
attributes['name_from_naming_authority'] = attributes['name_from_naming_authority'].str.lower()
attributefks = attributes.set_index('name_from_naming_authority')['id'].to_dict()
# Component name -> component ID. Built straight from the name/ID pairs: the
# previous drop_duplicates() acted on the ID values, so a component whose ID
# had already been seen under another name was silently left out of the lookup.
componentids = dict(zip(filtered['Component'], filtered['Component ID']))

In [None]:
# Print one row per component that is not yet in the attribute table and
# record the key assigned to it (keyed by lower-cased name) in attributefks.
#(id, name_from_naming_authority, id_from_naming_authority, url_from_naming_authority, naming_authority_fk)
index = 501781
gourl = 'http://purl.obolibrary.org/obo/GO_'


for component in filtered['Component'].unique():
    componentid = componentids[component].replace(':','_')
    attribute_key = component.lower()
    if attribute_key in attributefks:
        continue
    # gourl already ends in 'GO_', so strip that prefix before appending.
    attribute_url = gourl+componentid.replace('GO_','')
    print((index, component, componentid, attribute_url, 1), end=',\n')
    attributefks[attribute_key] = index
    index += 1

### Gene Set

In [None]:
# Print one gene set row per component and remember its key in genesetfks.
#(id, name_from_dataset, id_from_dataset, url_from_dataset, dataset_fk, attribute_type, attribute_fk)
index = 138100000
genesetfks = {}

for component in filtered['Component'].unique():
    componentid = componentids[component].replace(':','_')
    gene_set_url = gourl+componentid.replace('GO_','')
    attribute_fk = attributefks[component.lower()]
    gene_set_row = (index, component, componentid, gene_set_url, 180, 19, attribute_fk)
    print(gene_set_row, end=',\n')
    genesetfks[component] = index
    index += 1

### Association

In [None]:
# Translate the filtered edge list into the association table
# (gene_fk, gene_set_fk, standardized_value, threshold_value), with ids
# starting at `index`.
index = 74000000

associations = filtered.copy()
associations['Gene'] = [genefks[symbol.upper()] for symbol in associations['Gene']]
associations['Component'] = [genesetfks[name] for name in associations['Component']]

associations = associations[['Gene', 'Component', 'Confidence']].rename(
    columns={
        'Gene': 'gene_fk',
        'Component': 'gene_set_fk',
        'Confidence': 'standardized_value',
    }
)
associations = associations.assign(threshold_value=1)
# Highest value first, so the surviving row of each duplicate pair is the strongest one.
associations = associations.sort_values('standardized_value', ascending=False)
associations = associations.drop_duplicates(subset=['gene_fk', 'gene_set_fk'], keep='first')
associations = associations.reset_index(drop=True)
associations.index = associations.index + index
associations = associations.rename_axis('id')
associations.to_csv('../../../harmonizome-update/jensencompartmentexpts25.csv')
associations

## Create Downloads

In [None]:
# Prefix for every download file written below; the trailing slash is required
# because file names are appended by plain string concatenation.
output_path = 'exptsdownloads/'

### Binary Matrix

In [None]:
# Gene x component count matrix built from the filtered edge list, plus its
# transpose (component x gene) for the attribute-centric outputs.
binarymatrix = pd.crosstab(index=filtered['Gene'], columns=filtered['Component'])
binarymatrixT = binarymatrix.transpose()
binarymatrix.to_csv(
    output_path+'gene_attribute_matrix.txt.gz',
    sep='\t',
    compression='gzip',
)
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Edge list download: the filtered associations with the confidence column
# exposed as 'Standardized Value' and a constant threshold of 1.
edgelist = filtered.reset_index(drop=True).rename(columns={'Confidence': 'Standardized Value'})
edgelist['Threshold Value'] = 1
edgelist.to_csv(
    output_path+'gene_attribute_edges.txt.gz',
    sep='\t',
    compression='gzip',
)
edgelist

### Gene List

In [None]:
# Unique (Gene, Gene ID) pairs present in the filtered associations.
gene_columns = ['Gene', 'Gene ID']
geneslist = filtered.loc[:, gene_columns].drop_duplicates()
geneslist = geneslist.reset_index(drop=True)
geneslist.to_csv(
    output_path+'gene_list_terms.txt.gz',
    sep='\t',
    compression='gzip',
)
geneslist

### Attribute List

In [None]:
# Unique (Component, Component ID) pairs present in the filtered associations.
component_columns = ['Component', 'Component ID']
attributeslist = filtered.loc[:, component_columns].drop_duplicates()
attributeslist = attributeslist.reset_index(drop=True)
attributeslist.to_csv(
    output_path+'attribute_list_entries.txt.gz',
    sep='\t',
    compression='gzip',
)
attributeslist

### Gene Set Library

In [None]:
# Write one GMT line per component: name, empty description field, then the
# genes flagged 1 in that component's column. Components with fewer than 5
# genes are left out.
# Iterating the columns directly computes each membership once and avoids
# rebinding the global `attributes`, which holds the attribute table loaded
# earlier in the notebook.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for term, membership in tqdm(binarymatrix.items(), total=binarymatrix.shape[1]):
        members = membership.index[membership.to_numpy(dtype=np.int_) == 1]
        if len(members) >= 5:
            print(term, '', *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# Write one GMT line per gene: symbol, empty description field, then the
# components flagged 1 in that gene's column of the transposed matrix. Genes
# with fewer than 5 components are left out.
# Iterating the columns directly computes each membership once and avoids
# rebinding the global `genes`, which holds the gene table loaded earlier in
# the notebook.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    for symbol, membership in tqdm(binarymatrixT.items(), total=binarymatrixT.shape[1]):
        members = membership.index[membership.to_numpy(dtype=np.int_) == 1]
        if len(members) >= 5:
            print(symbol, '', *members, sep='\t', end='\n', file=f)

### Gene-Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows of the binary matrix
# (similarity = 1 - cosine distance), labelled by gene on both axes.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_cosine_distance = dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_cosine_distance,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so they are not written to the output file.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(
    output_path+'gene_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
gene_similarity_matrix

### Attribute-Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between component rows of the transposed binary
# matrix (similarity = 1 - cosine distance), labelled by component on both axes.
attribute_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_cosine_distance = dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - attribute_cosine_distance,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so they are not written to the output file.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(
    output_path+'attribute_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
attribute_similarity_matrix

### Standardized Matrix

In [None]:
# Gene x component matrix of confidence scores, restricted to the genes and
# components that survived filtering; pairs without a score are filled with 0.
in_filtered_genes = experimental['Gene'].isin(filtered['Gene'])
in_filtered_components = experimental['Component'].isin(filtered['Component'])
experimental = experimental[in_filtered_genes & in_filtered_components]
standardizedmatrix = pd.crosstab(
    index=experimental['Gene'],
    columns=experimental['Component'],
    values=experimental['Confidence'],
    aggfunc='mean',
).fillna(0)
standardizedmatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# In-memory knowledge graph: `nodes` maps a node id to its type/properties and
# `edges` lists one "contained in" relation per gene-component association.
nodes = {}
edges = []

# Gene nodes are keyed by the integer gene ID.
for row_label, gene in geneslist.iterrows():
    gene_id = int(gene['Gene ID'])
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene['Gene']
        }}

# Cellular component nodes are keyed by the component ID string.
for row_label, component in attributeslist.iterrows():
    component_id = component['Component ID']
    nodes[component_id] = {
        "type":"cellular component",
        "properties": {
            "id":component_id,
            "label":component['Component']
        }}

for row_label, edge in edgelist.iterrows():
    source_id = int(edge['Gene ID'])
    target_id = edge['Component ID']
    edges.append({
        "source": source_id,
        "relation": "contained in",
        "target": target_id,
        "properties":{
            "id":str(edge['Gene ID'])+":"+target_id,
            "source_id":source_id,
            "source_label":edge['Gene'],
            "target_label":edge['Component'],
            "target_id":target_id,
            "directed":True,
            "standardized":edge['Standardized Value'],
            "threshold":1
        }})

#### RDF

In [None]:
# Serialize the edges as prefixed triples: gene:<id> RO:0001018 <component id> .
# Fixes: the file name said 'compoartment' (the TSV directory and dataset
# directory use 'compartment'), and the RO prefix was missing the '/obo/' path
# segment of the OBO PURL, so RO:0001018 expanded to a non-existent IRI.
with open(output_path+'kg_serializations/jensencompartmentexpts25.rdf', 'w') as f:
    print('@prefix gene: <https://www.ncbi.nlm.nih.gov/gene/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('@prefix GO: <https://amigo.geneontology.org/amigo/term/GO:> .', file=f)
    print('', file=f)
    for edge in edges:
        subject = 'gene:'+str(edge['properties']['source_id'])
        # target_id is written as-is (e.g. a 'GO:'-prefixed ID) so it resolves
        # through the GO prefix declared above.
        print(subject, 'RO:0001018', edge['properties']['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Dump the node and edge collections as one JSON document.
# Fixes: the file name said 'compoartment' (the TSV directory and dataset
# directory use 'compartment'); json.dump returns None, so the unused
# `serial` binding was dropped.
with open(output_path+'kg_serializations/jensencompartmentexpts25.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table.
namespace_by_type = {'gene':'NCBI Entrez', 'cellular component':'Gene Ontology'}

nodeframe = pd.DataFrame(nodes).T
node_properties = nodeframe['properties']
nodeframe['id'] = node_properties.apply(lambda props: props['id'])
nodeframe['label'] = node_properties.apply(lambda props: props['label'])
nodeframe['namespace'] = nodeframe['type'].apply(lambda node_type: namespace_by_type[node_type])
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(
    output_path+'kg_serializations/jensencompartmentexpts25_tsv/nodes.tsv',
    sep='\t',
)
nodeframe

In [None]:
# Flatten the edge records into a (source, relation, target, standardized,
# threshold) table.
edgeframe = pd.DataFrame(edges)
edge_properties = edgeframe['properties']
edgeframe['standardized'] = edge_properties.apply(lambda props: props['standardized'])
edgeframe['threshold'] = edge_properties.apply(lambda props: props['threshold'])
edgeframe = edgeframe[['source', 'relation', 'target', 'standardized', 'threshold']]
edgeframe.to_csv(
    output_path+'kg_serializations/jensencompartmentexpts25_tsv/edges.tsv',
    sep='\t',
)
edgeframe

## Create Visualizations

### Gene-Attribute Clustermap

In [None]:
# Clustered heatmap of the gene x component matrix; tick labels are
# switched off on both axes.
sns.clustermap(
    binarymatrix,
    cmap='seismic',
    center=0,
    figsize=(10,10),
    xticklabels=False,
    yticklabels=False,
)

### Gene-Gene Similarity Clustermap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix.
sns.clustermap(
    gene_similarity_matrix,
    cmap='seismic',
    center=0,
)

### Attribute-Attribute Similarity Clustermap

In [None]:
# Clustered heatmap of the component-component cosine similarity matrix;
# tick labels are switched off on both axes.
sns.clustermap(
    attribute_similarity_matrix,
    cmap='seismic',
    center=0,
    xticklabels=False,
    yticklabels=False,
)

### UMAP

In [None]:
def load_gmt(file):
    """Parse a GMT gene set library into an ordered term -> genes mapping.

    Each line is expected to be tab-separated: term, description, then the
    member genes. The description field is ignored.

    Args:
        file: Iterable of text lines (an open file handle or a list of
            strings).

    Returns:
        OrderedDict mapping each term to a single space-separated string of
        its unique genes, in first-seen order. Blank lines and lines that
        carry no genes are skipped instead of raising ValueError.
    """
    gmt = OrderedDict()
    for line in file:
        fields = line.rstrip('\r\n').split('\t')
        term = fields[0].strip()
        # dict.fromkeys de-duplicates while preserving order, so the output
        # is reproducible from run to run (a set is not, for strings).
        members = dict.fromkeys(gene.strip() for gene in fields[2:])
        members.pop('', None)
        if not term or not members:
            continue
        gmt[term] = ' '.join(members)
    return gmt

In [None]:
# Read the crisp gene set library back in. The context manager closes the
# file handle, which the previous bare open() call left open.
with open('exptsdownloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
# Directory prefix for the saved visualizations.
scatterdir = 'exptsimages/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed a gene set library in 2D and assign each term a Leiden cluster.

    The gene strings are TF-IDF vectorized, a neighbor graph is built on the
    vectors, and Leiden clustering and UMAP are run on that graph.

    Args:
        libdict: Mapping of term -> space-separated gene string.
        nneighbors: ``n_neighbors`` passed to ``sc.pp.neighbors``.
        mindist: ``min_dist`` passed to ``sc.tl.umap``.
        spread: ``spread`` passed to ``sc.tl.umap``.
        maxdf: ``max_df`` passed to ``TfidfVectorizer``.
        mindf: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        DataFrame with columns ``x``, ``y``, ``cluster``, ``term`` and
        ``genes``, with rows ordered by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the parameters to tune for a different layout
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations so that terms of the same cluster are adjacent.
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[term] for term in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 color.

    The palette is reordered so that its even-indexed entries come before its
    odd-indexed ones; clusters beyond the 20th wrap around and reuse colors.

    Args:
        df: DataFrame with a ``cluster`` column.

    Returns:
        dict of cluster label -> color string, in order of first appearance.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    color_mapper = {}
    for position, cluster in enumerate(clusters):
        color_mapper[cluster] = colors[position % 20]
    return color_mapper

def get_scatterplot(scatterdf):
    """Build the Bokeh UMAP scatter plot of gene sets, colored by cluster.

    Args:
        scatterdf: DataFrame with ``x``, ``y``, ``cluster`` and ``term``
            columns; it is copied, not modified.

    Returns:
        The configured Bokeh figure (SVG backend) with a hover tooltip showing
        the gene set name, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every row of the tooltip is its own closed <div>; the previous markup
    # left three of them unclosed and used a unitless margin value.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10px">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide the axis ticks and tick labels (UMAP coordinates carry no units)
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in COMPARTMENTS Experimental Protein Localization Evidence Scores 2025 Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The renderer is attached to the figure; its return value is not needed.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
# Compute the UMAP/cluster table for the gene set library and show the plot.
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
scatter_df = process_scatterplot(
    libdict,
    nneighbors=4,
    mindist=0.1,
    spread=0.5,
    # maxdf=0.5,
    mindf=2,
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# scatterdir already ends with '/', so joining it with '/umap.html' produced
# 'exptsimages//umap.html'; strip the trailing slash first so the path is
# well-formed whether or not scatterdir carries one.
umap_title = 'Gene Sets in COMPARTMENTS Experimental Protein Localization Evidence Scores 2025 Library'
output_file(filename=f"{scatterdir.rstrip('/')}/umap.html", title=umap_title)
save(plot)