# HuBMAP Azimuth Cell Type Annotations
This notebook contains the processing scripts for the HuBMAP Azimuth Cell Type Annotations dataset. Cell type annotations were gathered from the [Azimuth reference tables](https://azimuth.hubmapconsortium.org/references/) and converted into a list of 14,221 edges between 1,426 cell types and 4,305 genes.

In [None]:
import pandas as pd
import datetime
import math
import numpy as np
import os
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

In [None]:
# Read the tab-separated Azimuth reference table.
azimuth = pd.read_csv('AzimuthReferences.tsv', sep='\t')

# Label every cell type as "<Tissue> - <Level> - <Expanded Label>".
separator = ' - '
azimuth['cell'] = azimuth['Tissue'] + separator + azimuth['Level'] + separator + azimuth['Expanded Label']

# Turn each comma-separated marker string into a list, then emit one
# (cell, marker) row per list element.
azimuth['Markers'] = azimuth['Markers'].map(lambda markers: markers.split(','))
exploded = azimuth.explode('Markers')
azimuth = exploded[['cell', 'Markers']].reset_index(drop=True)

# Remove the whitespace left around each symbol by the comma split.
azimuth['Markers'] = azimuth['Markers'].map(str.strip)
azimuth

In [None]:
# Summary counts: (distinct cell types, distinct markers, total rows).
azimuth['cell'].nunique(dropna=False), azimuth['Markers'].nunique(dropna=False), azimuth.shape[0]

## Map Genes to Up-to-Date and Approved Gene Symbols

In [None]:
# Symbol lookup built from the header-less mapping file: column 1 is the key,
# column 2 is the value.
mapping_table = pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None, index_col=1)
genemapping = mapping_table[2].to_dict()

# Gene metadata (GeneID, description) indexed by upper-cased symbol.
geneinfo = pd.read_csv('../../tables/gene_info', sep='\t', index_col='Symbol')
geneinfo = geneinfo[['GeneID', 'description']]
geneinfo.index = geneinfo.index.map(str.upper)

In [None]:
# Upper-case each marker and translate it through `genemapping`; symbols that
# are missing from the mapping become NaN.
azimuth['Markers'] = azimuth['Markers'].apply(str.upper).map(genemapping)

# Keep only the rows whose mapped symbol is present in `geneinfo` (this also
# drops the NaN rows produced above). The explicit .copy() turns the filtered
# result into an independent frame, so columns added to `azimuth` in later
# cells cannot trigger pandas' SettingWithCopyWarning.
azimuth = azimuth[azimuth['Markers'].isin(geneinfo.index)].copy()
azimuth

In [None]:
# Summary counts after symbol mapping: (distinct cell types, distinct genes, total edges).
azimuth['cell'].nunique(dropna=False), azimuth['Markers'].nunique(dropna=False), azimuth.shape[0]

## Process Data for SQL

### Resource

In [None]:
# Resource record for HuBMAP. This cell is a bare tuple expression: it is only
# displayed, nothing is written to disk here.
# Field order of the tuple below:
#(id, name, acronym, long_description, short_description, url, num_attributes, num_datasets)
(111, 'The Human BioMolecular Atlas Program', 'HuBMAP', 'The Human BioMolecular Atlas Program is a consortium composed of diverse research teams funded by the Common Fund at the National Institutes of Health to accelerate understanding of the relationships between cell and tissue organization and function and human health.', 'A consortium developing the tools to create an open, global atlas of the human body at the cellular level.', 'https://hubmapconsortium.org/', 1426, 1)

### Dataset

In [None]:
# Dataset record. This cell is a bare tuple expression: it is only displayed,
# nothing is written to disk here. resource_fk (111) matches the id of the
# resource tuple in the Resource section. The '{0}' in the description strings
# is left as a literal placeholder; it is not formatted in this notebook.
# Field order of the tuple below:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
(153, 'HuBMAP Azimuth Cell Type Annotations', 'Azimuth Cell Type Annotations', 'gene-cell type annotations from integrated reference scRNA-seq gene expression profiles', 'gene-cell type associations by differential expression of gene across cell types', 'genes with high or low expression in {0} relative to other cell types from the HuBMAP Azimuth Cell Type Annotations dataset.', 'sets of genes with high or low expression in each cell type relative to other cell types from the HuBMAP Azimuth Cell Type Annotations dataset.', 'cell types with high or low expression of {0} gene relative to other cell types from the HuBMAP Azimuth Cell Type Annotations dataset.', 0, 0, '2023-11-28', 'azimuth', 0, 111, 16, 7, 2, 1, 'gene expression by RNA-seq', 'curated experimental data', 'high throughput, data driven', 'cell types', 0)

### Publication

In [None]:
# Publication record. This cell is a bare tuple expression: it is only
# displayed, nothing is written to disk here. Its id (151) is the
# publication_fk used by the naming authority tuple further down.
# NOTE(review): unlike the other URLs in this notebook, the `url` field has no
# scheme ("https://") — confirm whether that is intended.
# Field order of the tuple below:
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(151, 'Stuart, T et al. (2019) Comprehensive Integration of Single-Cell Data. Cell. 177:1888-902', 'Stuart, Cell, 2019', 'dx.doi.org/10.1016/j.cell.2019.05.031', '31178118', 'https://ncbi.nlm.nih.gov/pubmed/31178118/', 'Stuart', 'T', 'Cell', 2019, 'Comprehensive Integration of Single-Cell Data', 177, '1888-902')

### Genes

In [None]:
# Existing gene table, plus a symbol -> id lookup that is extended below with
# every gene that is not in the table yet.
genes = pd.read_csv('../../tables/gene.csv')
geneslist = genes['symbol'].to_list()
genefks = genes.set_index('symbol')['id']

# Set of known symbols for constant-time membership tests; scanning the list
# once per marker gene would be quadratic.
known_symbols = set(geneslist)

# First id handed out to a gene that is missing from the gene table.
index=58321

for gene in azimuth['Markers'].unique():
    if gene in known_symbols:
        continue
    # Look the GeneID up once and reuse it for both the row and the URL.
    gene_id = geneinfo.loc[gene, 'GeneID']
    # Printed as "(id, symbol, GeneID, description, url)," — one tuple per new gene.
    print((index, gene, gene_id, geneinfo.loc[gene, 'description'], 'https://ncbi.nlm.nih.gov/gene/'+str(gene_id)), end=',\n')
    genefks[gene] = index
    index += 1

### Naming Authority

In [None]:
# Naming authority record. This cell is a bare tuple expression: it is only
# displayed, nothing is written to disk here. Its id (105) is the value printed
# as the last field of every attribute row in the Attributes section.
# Field order of the tuple below:
#(id, name, acronym, description, url, publication_fk)
(105, 'The Human BioMolecular Atlas Program', 'HuBMAP','A consortium developing the tools to create an open, global atlas of the human body at the cellular level.', 'https://hubmapconsortium.org/', 151)

### Attributes

In [None]:
# Assign consecutive ids, starting at 392747, to the distinct cell types in
# order of first appearance, and print one "(id, name, 105)," row per cell type.
first_attribute_id = 392747
cell_types = azimuth['cell'].unique()
attributefks = {
    cell_type: first_attribute_id + offset
    for offset, cell_type in enumerate(cell_types)
}

for cell, attribute_id in attributefks.items():
    print((attribute_id, cell, 105), end=',\n')

# Next unused id, as left behind by a manual counter.
index = first_attribute_id + len(attributefks)

### Gene Sets

In [None]:
# First id for the gene set rows printed below.
index = 135400000
genesetfks = {}

# One gene set per cell type. Each printed row is
# "(id, name, 153, 2, attribute id),", where 153 is the id of the dataset tuple
# in the Dataset section. `attributefks` is only read here: the previous
# version also overwrote attributefks[cell] with the gene set id, so running
# this cell a second time printed gene set ids in the attribute column.
for cell in azimuth['cell'].unique():
    print((index, cell, 153, 2, attributefks[cell]), end=',\n')
    genesetfks[cell] = index
    index += 1

### Associations

In [None]:
# Translate every (gene, cell type) edge into foreign keys, with a constant
# threshold of 1.
associations = pd.DataFrame({
    'gene_fk': azimuth['Markers'].map(genefks),
    'gene_set_fk': azimuth['cell'].map(genesetfks),
    'threshold_value': 1,
})

# Shift the row labels by 40000000 before export; the index is written as the
# first CSV column.
associations.index = associations.index + 40000000
associations.to_csv('../../harmonizome-update/azimuth.csv')
associations

## Create Downloads

In [None]:
output_path = 'downloads/'
# Create the download directory up front so the export cells that write into
# it do not fail when it does not exist yet.
os.makedirs(output_path, exist_ok=True)

# symbol -> GeneID lookup taken from the gene info table.
geneids = geneinfo['GeneID'].to_dict()

# Build the edge table as a new frame instead of adding columns to the
# filtered `azimuth` in place, then give the columns their export names.
# NOTE: this cell renames the columns of `azimuth`, so it cannot be re-run
# without first re-running the cells above it.
azimuth = azimuth.assign(**{
    'Gene ID': azimuth['Markers'].map(geneids),
    'threshold': 1,
})
azimuth = azimuth[['Markers', 'Gene ID', 'cell', 'threshold']]
azimuth.columns = ['Gene', 'Gene ID', 'Cell Type', 'Threshold']
azimuth

### Gene Attribute Ternary Matrix

In [None]:
# Gene x cell type matrix: 1 where an edge exists, 0 elsewhere.
# aggfunc is given as the string 'max' rather than the builtin `max`: pandas
# 2.1+ warns that builtin callables will stop being translated to the
# vectorized group reduction.
binarymatrix = pd.crosstab(
    azimuth['Gene'],
    azimuth['Cell Type'],
    azimuth['Threshold'],
    aggfunc='max',
).fillna(0)

# Transposed copy (cell type x gene) used by the attribute-centric exports.
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene Attribute Edge List

In [None]:
# Copy of the edge table with the last column labelled 'Threshold Value';
# columns are relabelled by position.
edgelist = azimuth.set_axis(['Gene', 'Gene ID', 'Cell Type', 'Threshold Value'], axis=1)
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (Gene, Gene ID) pairs, renumbered from 0.
gene_columns = edgelist.loc[:, ['Gene', 'Gene ID']]
geneslist = gene_columns.drop_duplicates().reset_index(drop=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct cell types as a one-column frame, renumbered from 0.
cell_type_column = edgelist.loc[:, ['Cell Type']]
attributeslist = cell_type_column.drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# Write one GMT line per cell type: the cell type name followed by its member
# genes, tab-separated. Cell types with fewer than 5 genes are left out.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    membership = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binarymatrix.columns

    n_genes, n_attributes = membership.shape
    for column in tqdm(range(n_attributes)):
        members = binarymatrix.index[membership[:, column] == 1]
        if len(members) < 5:
            continue
        print(attributes[column], *members, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
# Write one GMT line per gene: the gene symbol followed by the cell types it
# marks, tab-separated. Genes found in fewer than 5 cell types are left out.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    membership = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    gene_symbols = binarymatrixT.columns

    n_attributes, n_genes = membership.shape
    for column in tqdm(range(n_genes)):
        members = binarymatrixT.index[membership[:, column] == 1]
        if len(members) < 5:
            continue
        print(gene_symbols[column], *members, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine distance between gene rows, expanded to a square matrix and
# turned into a similarity (1 - distance).
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_cosine_distances = dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_cosine_distances,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so they are not written to the file.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine distance between cell type rows, expanded to a square matrix
# and turned into a similarity (1 - distance).
attribute_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_cosine_distances = dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - attribute_cosine_distances,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so they are not written to the file.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
nodes = {}
edges = []

# Gene nodes, keyed by the integer Gene ID.
for gene_symbol, raw_gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    gene_id = int(raw_gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene_symbol
        }}

# Cell type nodes; the cell type name serves as key, id and label.
for cell_type in attributeslist['Cell Type']:
    nodes[cell_type] = {
        "type":"cell type",
        "properties": {
            "label":cell_type,
            "id":cell_type
        }}

# One directed gene -> cell type edge per row of the edge list.
edge_columns = zip(edgelist['Gene'], edgelist['Gene ID'], edgelist['Cell Type'])
for gene_symbol, raw_gene_id, cell_type in edge_columns:
    gene_id = int(raw_gene_id)
    edges.append({
        "source": gene_id,
        "relation": "over-expressed in",
        "target": cell_type,
        "properties":{
            "id":str(raw_gene_id)+":"+cell_type,
            "source_id":gene_id,
            "source_label":gene_symbol,
            "target_id":cell_type,
            "target_label":cell_type,
            "directed":True,
            "threshold":1
        }})

#### RDF

In [None]:
# Make sure the serialization directory exists before writing into it;
# open(..., 'w') does not create missing parent directories.
os.makedirs(output_path+'kg_serializations', exist_ok=True)

with open(output_path+'kg_serializations/azimuth.rdf', 'w') as f:
    # NOTE(review): the prefix lines are written without angle-bracketed IRIs
    # or a terminating '.', and the targets are written as bare text; standard
    # Turtle would require both. Confirm what the downstream consumer expects
    # before changing the format.
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: purl.obolibrary.org/RO_', file=f)

    print('', file=f)
    # One "gene:<id> RO:0002245 <cell type> ." statement per edge.
    for edge in edges:
        print('gene:'+str(edge['properties']['source_id']), 'RO:0002245', edge['properties']['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the graph as indented JSON. json.dump writes straight to `f` and
# returns None, so its result is not bound to a name.
with open(output_path+'kg_serializations/azimuth.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Make sure the TSV output directory exists before writing into it; to_csv
# does not create missing parent directories.
os.makedirs(output_path+'kg_serializations/azimuth_tsv', exist_ok=True)

# One row per node; 'type' and 'properties' become columns after transposing.
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = nodeframe['properties'].apply(lambda properties: properties['id'])
nodeframe['label'] = nodeframe['properties'].apply(lambda properties: properties['label'])
# Namespace label derived from the node type.
nodeframe['namespace'] = nodeframe['type'].map({'gene':'NCBI Entrez', 'cell type':'HuBMAP'})
nodeframe = nodeframe.get(['namespace', 'id', 'label']).reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/azimuth_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records into source / relation / target / threshold columns.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [edge_properties['threshold'] for edge_properties in edgeframe['properties']]
edgeframe = edgeframe.loc[:, ['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/azimuth_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene Attribute Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of the gene x cell type matrix.
sns.clustermap(
    binarymatrix,
    cmap='seismic',
    center=0,
    figsize=(25, 25),
)

### Gene Similarity Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of gene-gene cosine similarity.
sns.clustermap(
    gene_similarity_matrix,
    cmap='seismic',
    center=0,
)

### Attribute Similarity Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of cell type-cell type cosine similarity.
sns.clustermap(
    attribute_similarity_matrix,
    cmap='seismic',
    center=0,
    figsize=(25, 25),
)

### UMAP

In [None]:
def load_gmt(file):
    """Parse GMT-style lines into an ordered term -> gene-string mapping.

    Each non-blank line is split on tabs: the first field is the term, the
    remaining fields are its genes.

    Args:
        file: iterable of text lines (an open file or a list of strings).

    Returns:
        OrderedDict mapping each term to its distinct genes joined by single
        spaces. Genes keep their first-seen order, so the result is the same
        on every run. If a term occurs on several lines, the last one wins.
    """
    gmt = OrderedDict()
    for line in file:
        stripped = line.strip()
        # Skip blank lines; they would otherwise add an empty '' term.
        if not stripped:
            continue
        term, *geneset = stripped.split('\t')
        # dict.fromkeys removes duplicates while preserving order, unlike set().
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt
# Read the crisp gene set library; the context manager closes the file handle
# once parsing is done.
with open('downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)

# Directory that receives the saved scatter plot.
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D and assign each one to a Leiden cluster.

    Every gene set string is TF-IDF vectorized, a neighbor graph is built on
    the vectors, Leiden clustering is run on it, and UMAP coordinates are
    computed.

    Args:
        libdict: mapping of term -> string of genes (one document per term).
        nneighbors: `n_neighbors` passed to `sc.pp.neighbors`.
        mindist: `min_dist` passed to `sc.tl.umap`.
        spread: `spread` passed to `sc.tl.umap`.
        maxdf: `max_df` passed to `TfidfVectorizer`.
        mindf: `min_df` passed to `TfidfVectorizer`.

    Returns:
        DataFrame with one row per term, ordered by Leiden cluster, and the
        columns 'x', 'y' (UMAP coordinates), 'cluster' ('Cluster <label>'),
        'term' and 'genes'.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = list(libdict.keys())

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster label. Slicing an AnnData returns a
    # view; copy it so the assignment to .obs below modifies a real object
    # instead of triggering an implicit copy (and its warning).
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :].copy()
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map every distinct value of df['cluster'] to a Category20 color.

    The 20 palette entries are reordered so that the even-indexed colors come
    first, followed by the odd-indexed ones. Clusters are taken in order of
    first appearance and the colors repeat after 20 clusters.
    """
    palette = list(Category20[20])
    ordered_colors = palette[::2] + palette[1::2]
    cluster_labels = pd.unique(df['cluster']).tolist()
    return {
        label: ordered_colors[position % 20]
        for position, label in enumerate(cluster_labels)
    }

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh scatter plot of the gene set embedding.

    Args:
        scatterdf: DataFrame with the columns 'x', 'y', 'cluster' and 'term'
            (as returned by process_scatterplot). It is copied, not modified.

    Returns:
        The Bokeh figure: one point per gene set, colored by cluster, with a
        hover tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Tooltip template. Each of the three rows is wrapped in its own <div>;
    # the previous markup left the first two row <div>s unclosed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones referenced by the tooltip (@gene_set,
    # @x, @y, @cluster) and by the scatter glyph below ('x', 'y', 'colors').
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis ticks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the HuBMAP Azimuth Cell Type Annotations Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# UMAP settings for this dataset; `mindf` is not passed and keeps its default of 1.
scatter_df = process_scatterplot(
    libdict,
    nneighbors=30,
    mindist=0.1,
    spread=1.5,
    maxdf=.8,
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
show(plot)

In [None]:
# Create the image directory if needed, and build the path with os.path.join:
# `scatterdir` already ends in '/', so the f-string f"{scatterdir}/azimuth.html"
# produced a doubled separator ('images//azimuth.html').
os.makedirs(scatterdir, exist_ok=True)
output_file(filename=os.path.join(scatterdir, 'azimuth.html'), title = 'Gene Sets in the HuBMAP Azimuth Cell Type Annotations Library')
save(plot)