# HuBMAP ASCT+B Processing

This notebook contains the scripts used to process the HuBMAP ASCT+B tables for Harmonizome.
This code is based on the [ASCTB_to_GMT](https://github.com/MaayanLab/asct-b-converter/blob/main/Version%203/ASCTB_to_GMT.ipynb) and [ASCT+bDataExtraction_checked](https://github.com/MaayanLab/harmonizome3/blob/97e5728c777c3d4fbaf74c64b9f28aecbc30089b/ASCT%2BbDataExtraction_checked.ipynb) notebooks previously developed by members of the Ma'ayan Lab.

Version 2.2 of the ASCT+B tables was downloaded from [HuBMAP](https://humanatlas.io/asctb-tables) on December 19, 2024.

In [None]:
import pandas as pd
import datetime
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

### Fetch Tables

In [None]:
# Download step, disabled by being wrapped in a string literal (presumably so
# that "Run All" does not re-fetch the tables -- confirm before re-enabling).
# When enabled it reads the index file 'asctb_release8.csv', maps each 'Organ'
# to the location in its 'csv' column, loads that CSV with header=10 and saves
# it as ASCTB_Tables/<Organ> (no file extension is appended).
'''tables = pd.read_csv('asctb_release8.csv').set_index('Organ')['csv'].to_dict()
for table in tqdm(tables):
    pd.read_csv(tables[table], header=10).to_csv(f'ASCTB_Tables/{table}')'''

### Combine Tables and Extract Marker Genes

In [None]:
# Read every per-organ table saved under ASCTB_Tables/ and stack them into one frame.
# The listing is sorted so the row order (and therefore which row survives the
# later drop_duplicates calls) is the same on every machine, and hidden entries
# such as .DS_Store or .ipynb_checkpoints are skipped because they are not tables.
table_files = sorted(name for name in os.listdir('ASCTB_Tables') if not name.startswith('.'))

# Load first, concatenate once: growing a DataFrame inside the loop re-copies
# every previously loaded row on each iteration.
tissueframes = [pd.read_csv(f'ASCTB_Tables/{table}', index_col=0) for table in tqdm(table_files)]
asctb = pd.concat(tissueframes, ignore_index=True)
asctb

In [None]:
def get_highest_resolution_cell_type(entry):
    """Return the most specific cell type recorded on one ASCT+B row.

    Parameters
    ----------
    entry : pandas.Series
        A table row indexed by column name; must contain 'CT/1' .. 'CT/4'.

    Returns
    -------
    tuple or float
        ``(column, name)`` for the last non-missing column among
        'CT/1' .. 'CT/4', where a single trailing "s" is stripped from the end
        of the name; ``nan`` when all four columns are missing.
    """
    cts = entry[['CT/1', 'CT/2', 'CT/3', 'CT/4']].dropna()
    if len(cts) == 0:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    # Strip one trailing "s" so that e.g. "goblet cells" and "goblet cell" share a label.
    return cts.index[-1], re.sub(r'(\w+)s$', r'\1', cts.iloc[-1])

def get_highest_resolution_cell_type_id(entry):
    """Return the most specific cell type identifier recorded on one ASCT+B row.

    Parameters
    ----------
    entry : pandas.Series
        A table row indexed by column name; must contain 'CT/1/ID' .. 'CT/4/ID'.

    Returns
    -------
    object or float
        The value of the last non-missing column among 'CT/1/ID' .. 'CT/4/ID',
        or ``nan`` when all four are missing. The ID columns are inspected on
        their own, so the level chosen here is not tied to the level of the
        name columns.
    """
    ctids = entry[['CT/1/ID', 'CT/2/ID', 'CT/3/ID', 'CT/4/ID']].dropna()
    if len(ctids) == 0:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return ctids.iloc[-1]

In [None]:
# Per row: the most specific (column, name) pair and the most specific cell type ID.
asctb['Label'] = asctb.apply(get_highest_resolution_cell_type, axis=1)
asctb['CTID'] = asctb.apply(get_highest_resolution_cell_type_id, axis=1)

# Rows without any cell type cannot be labelled. Copy so the assignments below
# write to an independent frame instead of a slice of the previous one.
asctb = asctb.dropna(subset='Label').copy()

# Label layout: <Organ>_<CT level without the slash>_<cell type name>.
asctb['Label'] = (
    asctb['AS/1'].apply(str.capitalize) + '_'
    + asctb['Label'].apply(lambda x: x[0].replace('/', '')) + '_'
    + asctb['Label'].apply(lambda x: x[1].replace('_', ' '))
)

# Drop parenthesised qualifiers such as " (abbr)". Raw string: '\s' and '\(' in a
# plain literal are invalid escape sequences and warn on current Python versions.
asctb['Label'] = asctb['Label'].apply(lambda x: re.sub(r'\s\([^)]+\)', '', x))
asctb

In [None]:
# Column-name fragments that mark companion columns (labels, IDs, abbreviations,
# notes) rather than the marker value itself.
_NON_VALUE_TOKENS = ('LABEL', 'ID', 'ABBR', 'NOTE')


def _marker_columns(kinds):
    """Return the asctb columns whose name contains one of `kinds` and none of the companion tokens."""
    keep = [
        any(kind in col for kind in kinds) and not any(token in col for token in _NON_VALUE_TOKENS)
        for col in asctb.columns
    ]
    return asctb.columns[keep]


gene_cols = _marker_columns(('Gene',))
gene_protein_cols = _marker_columns(('Gene', 'Protein'))

In [None]:
def _collect_markers(entry, columns):
    """Gather the distinct ', '-separated tokens found in `columns` of one row.

    Returns a set of strings, or ``nan`` when every selected cell is missing.
    """
    markers = set()
    for cell in entry[columns].dropna():
        markers.update(cell.split(', '))
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return markers if markers else np.nan


def get_all_genes(entry):
    """Return the set of markers listed in the gene columns (`gene_cols`) of a row.

    Parameters
    ----------
    entry : pandas.Series
        A table row indexed by column name.

    Returns
    -------
    set of str or float
        All distinct entries of the row's gene columns, ``nan`` if there are none.
    """
    return _collect_markers(entry, gene_cols)


def get_all_genes_and_proteins(entry):
    """Return the set of markers listed in the gene and protein columns
    (`gene_protein_cols`) of a row.

    Parameters
    ----------
    entry : pandas.Series
        A table row indexed by column name.

    Returns
    -------
    set of str or float
        All distinct entries of the row's gene/protein columns, ``nan`` if
        there are none.
    """
    return _collect_markers(entry, gene_protein_cols)



In [None]:
# Attach each row's set of gene/protein markers, then keep only the rows that
# list at least one marker.
marker_sets = asctb.apply(get_all_genes_and_proteins, axis=1)
asctb = asctb.assign(Genes=marker_sets).dropna(subset='Genes')
asctb

In [None]:
# Gene table used to translate synonyms into official symbols.
geneinfo = pd.read_csv('../../../mapping/source_files/human_gene_info', sep='\t')

# Build one mask over the full frame. Chaining two boolean selections applies a
# mask computed on the unfiltered frame to an already-filtered one, which pandas
# has to re-align (and warns about).
is_human_protein_coding = (geneinfo['#tax_id'] == 9606) & (geneinfo['type_of_gene'] == 'protein-coding')
# Copy so the column assignment below does not write into a slice.
geneinfo = geneinfo.loc[is_human_protein_coding].copy()

# 'Synonyms' is a '|'-separated list; give every synonym its own row.
geneinfo['Synonyms'] = geneinfo['Synonyms'].apply(str.split, sep='|')
geneinfo = geneinfo.explode('Synonyms')[['GeneID', 'Symbol', 'Synonyms', 'description']]
geneinfo

In [None]:
# Lookup from any accepted name (synonym or official symbol) to the official symbol.
symbols = set(geneinfo['Symbol'].tolist())
genedict = geneinfo.set_index('Synonyms')['Symbol'].to_dict()

# '-' is the placeholder for "no synonym", not a gene name. Use a default so the
# cell does not raise KeyError when the placeholder is absent from the table.
genedict.pop('-', None)

# Official symbols always map to themselves, overriding any synonym collision.
genedict.update({symbol: symbol for symbol in symbols})

In [None]:
def clean(gene_label):
    """Normalise one raw marker string before symbol lookup.

    Parameters
    ----------
    gene_label : str
        Marker text as found in the table.

    Returns
    -------
    str
        The text before the first comma, stripped of surrounding whitespace,
        with one trailing '+' or '-' removed and every " (...)" qualifier deleted.
    """
    gene_label = gene_label.split(',')[0].strip()
    # Raw strings throughout: '\s' and '\(' inside a plain literal are invalid
    # escape sequences and trigger a warning on current Python versions.
    gene_label = re.sub(r'[+-]$', '', gene_label)
    gene_label = re.sub(r'\s\([^)]+\)', '', gene_label)
    return gene_label

In [None]:
# One row per (cell type, marker); ignore_index gives every exploded row its own
# index value so the column assignment below aligns unambiguously.
asctb = asctb.explode('Genes', ignore_index=True)

# Normalise the marker text and translate it to an official symbol
# (names not present in genedict become NaN).
asctb['Genes'] = asctb['Genes'].map(clean).map(genedict)

# Drop incomplete rows BEFORE de-duplicating. De-duplicating first keeps the
# first (Label, Genes) row even when its CTID is missing, and the later dropna
# then removes it, losing the pair although a complete duplicate existed.
asctb = (
    asctb[['Label', 'CTID', 'Genes']]
    .dropna()
    .drop_duplicates(subset=['Label', 'Genes'])
    .reset_index(drop=True)
)
print(asctb['Genes'].nunique(), 'genes,', asctb['Label'].nunique(), 'cell types')
asctb

In [None]:
# Average number of marker genes per cell-type label.
asctb.groupby('Label')['Genes'].count().mean()

## Prepare Data for SQL Ingestion

### Dataset

In [None]:
# Dataset record, written as a tuple whose fields follow the column order listed
# below. The cell only evaluates and displays the tuple; nothing is inserted here.
# NOTE(review): the column list names 23 fields but the tuple holds 22 values --
# there is no value for is_archived; confirm whether the column has a default.
# NOTE(review): the association text mentions "curated genetic association
# studies"; confirm this wording is intended for marker annotations.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
(164, 'HuBMAP ASCT+B Annotations', 'ASCT+B Annotations', 'Anatomical structure and cell type biomarker annotations from the HuBMAP ASCT+B tables', 'gene-cell type associations from curated genetic association studies', 'biomarker genes for the {0} cell type from the HuBMAP ASCT+B dataset.', 'sets of biomarker genes for cell types from the HuBMAP ASCT+B dataset.', 'cell types associated with {0} gene from the HuBMAP ASCT+B dataset.', 0, 0, '2024-12-26', 'asctb', 0, 111, 16, 7, 2, 1, 'association by literature curation', 'curated literature', 'low throughput, hypothesis driven', 'cell types')

### Publication

In [None]:
# Publication record; fields follow the column order listed below. The short
# citation year is 2023, matching the long citation and the `year` field.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(160, 'Jain, S et al. (2023) Advances and prospects for the Human BioMolecular Atlas Program (HuBMAP). Nat Cell Biol. 25(8):1089-100', 'Jain, Nat Cell Biol, 2023', 'dx.doi.org/10.1038/s41556-023-01194-w', 37468756, 'https://ncbi.nlm.nih.gov/pubmed/37468756', 'Jain', 'S', 'Nat Cell Biol', 2023, 'Advances and prospects for the Human BioMolecular Atlas Program (HuBMAP)', '25', '1089-100')
# Link row: 164 is the dataset id and 160 the publication id used above;
# presumably (id, dataset_fk, publication_fk) -- confirm against the schema.
(249, 164, 160)

### Attributes

In [None]:
# Preview of the cell types that remain after case-insensitive de-duplication.
# Built directly from `asctb` so the cell runs on a fresh kernel: the `celltypes`
# frame itself is only created in the following cell, so referencing it here
# raised NameError under Restart & Run All.
celltypes_preview = asctb[['Label', 'CTID']].drop_duplicates()
celltypes_preview['Upper'] = celltypes_preview['Label'].map(str.upper)
celltypes_preview.drop_duplicates('Upper')

In [None]:
# Distinct (label, ID) pairs, collapsed so that labels differing only by
# letter case are represented once (the first spelling wins).
celltypes = (
    asctb[['Label', 'CTID']]
    .drop_duplicates()
    .assign(Upper=lambda frame: frame['Label'].map(str.upper))
    .drop_duplicates('Upper')
    .drop(columns=['Upper'])
)

# Print one attribute record per cell type and remember the key given to each label.
attributefks = {}
index = 438991
for label, ctid in zip(celltypes['Label'], celltypes['CTID']):
    print((index, label, ctid, 105), end=',\n')
    attributefks[label] = index
    index += 1

### Gene Sets

In [None]:
# Print one gene-set record per cell type; keys are handed out consecutively
# and stored under the upper-cased label for the association lookup.
url = 'http://purl.obolibrary.org/obo/'
first_geneset_id = 136500000
genesetfks = {}

for offset, (label, ctid) in enumerate(zip(celltypes['Label'], celltypes['CTID'])):
    geneset_id = first_geneset_id + offset
    # Ontology links use '_' where the identifier uses ':'.
    term_url = url + ctid.replace(":", "_")
    print((geneset_id, label, ctid, term_url, 164, 2, attributefks[label]), end=',\n')
    genesetfks[label.upper()] = geneset_id

# Next unused key, as left behind by a manually incremented counter.
index = first_geneset_id + len(celltypes)

### Associations

In [None]:
# Gene table, looked up by upper-cased symbol.
genes = pd.read_csv('../../../tables/gene.csv')
genes['symbol'] = genes['symbol'].map(str.upper)

genes_by_symbol = genes.set_index('symbol')
genefks = genes_by_symbol['id']                                # symbol -> table key (Series)
geneids = genes_by_symbol['ncbi_entrez_gene_id'].to_dict()     # symbol -> Entrez ID (dict)

In [None]:
# Translate every (gene, cell type) pair into its pair of keys.
gene_fk = asctb['Genes'].map(str.upper).map(genefks).astype(int)
gene_set_fk = asctb['Label'].map(str.upper).map(genesetfks)

associations = (
    pd.DataFrame({'gene_fk': gene_fk, 'gene_set_fk': gene_set_fk, 'threshold_value': 1})
    .drop_duplicates()
    .reset_index(drop=True)
)

# Row ids start at 164000000 and are exported as the 'id' column.
associations.index = associations.index + 164000000
associations = associations.rename_axis('id')
associations.to_csv('../../../harmonizome-update/asctb.csv')
associations

## Create Downloads

In [None]:
# Root directory for every download file written below.
output_path = 'downloads/'
# Create the whole output tree up front (downloads/, kg_serializations/ and
# asctb_tsv/) so the writes further down do not fail on a clean checkout.
os.makedirs(output_path + 'kg_serializations/asctb_tsv', exist_ok=True)

In [None]:
# Collapse pairs whose labels differ only by letter case, keeping the first one.
upper_labels = asctb['Label'].map(str.upper)
asctb = (
    asctb.assign(Upper=upper_labels)
    .drop_duplicates(['Upper', 'Genes'])
    .drop(columns='Upper')
    .reset_index(drop=True)
)

# Add the Entrez ID of every gene and switch to the published column names.
asctb['Gene ID'] = asctb['Genes'].map(str.upper).map(geneids)
asctb = (
    asctb[['Genes', 'Gene ID', 'Label', 'CTID']]
    .rename(columns={'Genes': 'Gene', 'Label': 'Cell Type', 'CTID': 'Cell Type ID'})
    .assign(Threshold=1)
)
asctb

### Gene Attribute Binary Matrix

In [None]:
# Gene x cell-type indicator matrix: 1 where the pair is in the edge list, else 0.
# aggfunc is given by name; passing the builtin `max` is deprecated in recent
# pandas releases, which ask for the string 'max' instead.
binarymatrix = (
    pd.crosstab(asctb['Gene'], asctb['Cell Type'], values=asctb['Threshold'], aggfunc='max')
    .fillna(0)       # pairs that never occur come out of crosstab as NaN
    .astype(int)
)
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene Attribute Edge List

In [None]:
# The edge list is the download frame as-is; work on a copy so later cells
# cannot alter `asctb` through it.
edgelist = asctb.copy()
# Tab-separated, gzip-compressed; the row index is written as the first column.
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Every distinct gene in the edge list with its Entrez ID, renumbered from 0.
gene_columns = ['Gene', 'Gene ID']
geneslist = edgelist[gene_columns].drop_duplicates().reset_index(drop=True)
geneslist.to_csv(output_path + 'gene_list_terms.txt.gz', compression='gzip', sep='\t')
geneslist

### Attribute List

In [None]:
# Every distinct cell type in the edge list with its identifier, renumbered from 0.
attribute_columns = ['Cell Type', 'Cell Type ID']
attributeslist = edgelist[attribute_columns].drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(output_path + 'attribute_list_entries.txt.gz', compression='gzip', sep='\t')
attributeslist

### Gene Set Library

In [None]:
# One line per cell type: the cell type followed by its genes, tab-separated.
# Cell types with fewer than 5 genes are left out of the library.
arr = binarymatrix.to_numpy(dtype=np.int_)
attributes = binarymatrix.columns

with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for column, attribute in enumerate(tqdm(attributes)):
        members = binarymatrix.index[arr[:, column] == 1]
        if len(members) < 5:
            continue
        print(attribute, *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# One line per gene: the gene followed by its cell types, tab-separated.
# Genes linked to fewer than 5 cell types are left out of the library.
arr = binarymatrixT.to_numpy(dtype=np.int_)
genes = binarymatrixT.columns

with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    for column, gene_symbol in enumerate(tqdm(genes)):
        members = binarymatrixT.index[arr[:, column] == 1]
        if len(members) < 5:
            continue
        print(gene_symbol, *members, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between genes, computed on their cell-type profiles.
# pdist yields condensed cosine *distances*; squareform expands them to a full
# matrix and 1 - distance converts to similarity.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(gene_vectors, 'cosine')),
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so the exported header has no axis label.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between cell types, computed on their gene profiles.
# pdist yields condensed cosine *distances*; squareform expands them to a full
# matrix and 1 - distance converts to similarity.
attribute_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(attribute_vectors, 'cosine')),
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so the exported header has no axis label.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# In-memory knowledge graph: `nodes` maps a node id to its record, `edges` is a
# list of gene -> cell type records. Columns are iterated directly instead of
# fetching one row at a time with .loc, which is far slower on large frames.
nodes = {}
edges = []

# Gene nodes, keyed by integer Entrez ID.
for symbol, raw_gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    gene_id = int(raw_gene_id)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": symbol
        }}

# Cell type nodes, keyed by their identifier string.
for label, ctid in zip(attributeslist['Cell Type'], attributeslist['Cell Type ID']):
    nodes[ctid] = {
        "type": "cell type",
        "properties": {
            "id": ctid,
            "label": label
        }}

# One directed edge per row of the edge list.
for symbol, raw_gene_id, label, ctid in zip(
        edgelist['Gene'], edgelist['Gene ID'], edgelist['Cell Type'], edgelist['Cell Type ID']):
    gene_id = int(raw_gene_id)
    edges.append({
        "source": gene_id,
        "relation": "is marker for",
        "target": ctid,
        "properties": {
            # Built from the integer id so it agrees with source/source_id even
            # when the 'Gene ID' column is stored as float ("1234", not "1234.0").
            "id": str(gene_id)+":"+ctid,
            "source_id": gene_id,
            "source_label": symbol,
            "target_label": label,
            "target_id": ctid,
            "directed": True,
            "threshold": 1
        }})

#### RDF

In [None]:
# Prefix declarations followed by a blank separator line.
# NOTE(review): these lines are not standard Turtle prefix declarations
# (`@prefix ns: <IRI> .`), and the first two carry no URL scheme -- confirm the
# format expected by whatever consumes this file before changing them.
rdf_header = [
    '@prefix gene: ncbi.nlm.nih.gov/gene/',
    '@prefix RO: purl.obolibrary.org/RO_',
    '@prefix CL: http://purl.obolibrary.org/obo/CL_',
    '@prefix PCL: http://purl.obolibrary.org/obo/PCL_',
    '',
]

with open(output_path+'kg_serializations/asctb.rdf', 'w') as f:
    for header_line in rdf_header:
        print(header_line, file=f)
    # One statement per edge: gene, relation RO:0002607, cell type.
    for edge in edges:
        properties = edge['properties']
        print('gene:'+str(properties['source_id']), 'RO:0002607', properties['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Write the whole graph as a single JSON document.
with open(output_path+'kg_serializations/asctb.json', 'w') as f:
    # json.dump writes straight to the file and returns None, so its result is
    # not captured (the former `serial` variable was always None).
    json.dump(
        {
            "Version": "1",
            "nodes": nodes,
            "edges": edges
        }, fp=f, indent=4)

#### TSV

In [None]:
def namespace(nodeid):
    """Name the identifier namespace a node id belongs to.

    Parameters
    ----------
    nodeid : str
        Node identifier as text.

    Returns
    -------
    str
        'Provisional Cell Ontology' if the id contains 'PCL', 'Cell Ontology'
        if it contains 'CL', otherwise 'NCBI Entrez'.
    """
    # Order matters: every id containing 'PCL' also contains 'CL'.
    substring_to_namespace = (
        ('PCL', 'Provisional Cell Ontology'),
        ('CL', 'Cell Ontology'),
    )
    for substring, name in substring_to_namespace:
        if substring in nodeid:
            return name
    return 'NCBI Entrez'

# Flatten the node records into a three-column table, in insertion order.
node_properties = [node['properties'] for node in nodes.values()]
nodeframe = pd.DataFrame({
    'namespace': [namespace(str(properties['id'])) for properties in node_properties],
    'id': [properties['id'] for properties in node_properties],
    'label': [properties['label'] for properties in node_properties],
})
nodeframe.to_csv(output_path+'kg_serializations/asctb_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records: keep the top-level fields and lift `threshold`
# out of the nested properties.
edgeframe = pd.DataFrame(edges)
thresholds = [properties['threshold'] for properties in edgeframe['properties']]
edgeframe = edgeframe.assign(threshold=thresholds)[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/asctb_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

In [None]:
# Clustered heatmap of the gene x cell-type indicator matrix; tick labels are
# switched off on both axes.
sns.clustermap(binarymatrix, cmap='seismic', center=0, xticklabels=False, yticklabels=False)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, colour scale
# centred on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the cell type-cell type cosine similarity matrix; tick
# labels are switched off on both axes.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, xticklabels=False, yticklabels=False)

### UMAP

In [None]:
def load_gmt(file):
    """Read a tab-separated gene set library into an ordered mapping.

    Each non-blank line is ``term<TAB>gene<TAB>gene...``, the layout this
    notebook writes to gene_set_library_crisp.gmt (no description column).
    Empty fields are ignored, so a file with a blank description column after
    the term is read correctly as well.

    Parameters
    ----------
    file : iterable of str
        Lines of the library, e.g. an open text file.

    Returns
    -------
    collections.OrderedDict
        term -> the term's distinct genes joined by single spaces (sorted so
        the result is reproducible), in file order.
    """
    gmt = OrderedDict()
    for line in file:
        fields = [field.strip() for field in line.rstrip('\n').split('\t')]
        term = fields[0]
        if not term:
            # Blank line (or missing term): nothing to record.
            continue
        # Every field after the term is a gene. Unpacking a second "blank"
        # field here would silently discard the first gene of each set.
        members = {field for field in fields[1:] if field}
        gmt[term] = ' '.join(sorted(members))
    return gmt
# Load the gene set library written earlier; the context manager closes the
# file handle once it has been read.
with open('downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)

# Directory for the saved scatter plot; created here so saving does not fail
# when the directory is missing.
scatterdir = 'images/'
os.makedirs(scatterdir, exist_ok=True)

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed and cluster the gene sets of a library for plotting.

    The values of `libdict` are TF-IDF vectorized, a neighbour graph is built
    on that matrix, Leiden clusters and a UMAP layout are computed, and the
    result is returned as one row per term.

    Parameters
    ----------
    libdict : mapping
        term -> text; presumably the space-joined gene string produced by
        load_gmt -- confirm against the caller.
    nneighbors : int
        Passed to sc.pp.neighbors as n_neighbors.
    mindist, spread : float
        Passed to sc.tl.umap as min_dist and spread.
    maxdf, mindf : float or int
        Passed to TfidfVectorizer as max_df and min_df.

    Returns
    -------
    pandas.DataFrame
        Columns 'x', 'y' (UMAP coordinates), 'cluster' ('Cluster <n>'),
        'term' and 'genes' (the libdict value of the term), ordered by cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    # NOTE(review): the vectorizer runs with its default tokenisation and
    # lower-casing; confirm gene symbols are split into tokens as intended.
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    # Fixed random_state so the layout is repeatable.
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster, then prefix the cluster ids.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a colour to every cluster found in ``df['cluster']``.

    Clusters are taken in order of first appearance. Colours come from the
    20-colour Category20 palette, reordered so that the entries at even
    positions are used before those at odd positions, and are reused
    cyclically when there are more than 20 clusters.

    Returns
    -------
    dict
        cluster -> colour.
    """
    palette = Category20[20]
    colors = list(palette[::2]) + list(palette[1::2])
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of the embedded gene sets.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Must provide the columns 'x', 'y', 'term' and 'cluster'. The frame is
        copied and not modified.

    Returns
    -------
    bokeh figure
        A 1000x700 figure with one point per row, coloured by cluster, and a
        hover tooltip showing the gene set, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every row of the tooltip is its own closed <div>; the rows were previously
    # left unclosed, which nested each one inside the row before it.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000,
        height=700,
        tools=tools_emb
    )

    # Column names here are the ones referenced by the tooltip (@gene_set,
    # @x, @y, @cluster) and by the scatter call below ('x', 'y', 'colors').
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # Hide the axis tick marks and tick labels.
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt'

    plot_emb.output_backend = "svg"

    plot_emb.title = 'Gene Sets in the HuBMAP ASCT+B Annotations Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'

    # The renderer handle is not needed afterwards, so it is not kept.
    plot_emb.scatter(
        'x',
        'y',
        size = 4,
        source = source,
        color = 'colors'
    )

    return plot_emb

In [None]:
# Settings that differ from the process_scatterplot defaults
# (nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1);
# mindist and maxdf are left at their defaults.
scatter_settings = dict(nneighbors=24, spread=1.5, mindf=2)
scatter_df = process_scatterplot(libdict, **scatter_settings)

# Build the plot and display it in the notebook.
plot = get_scatterplot(scatter_df)
show(plot)

In [None]:
# Save the plot as a standalone HTML page under scatterdir.
# NOTE(review): scatterdir is set to 'images/' (with a trailing slash), so this
# filename contains a double slash ('images//asctb.html'); harmless on POSIX
# paths, but worth tidying.
output_file(filename=f"{scatterdir}/asctb.html", title = 'Gene Sets in the HuBMAP ASCT+B Annotations Library')
save(plot)