# Tabula Sapiens Gene-Cell Associations

[Tabula Sapiens](https://tabula-sapiens-portal.ds.czbiohub.org/) provides human transcriptomics data at single-cell resolution. The resource is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects. The raw single-cell RNA-seq download was processed into pseudo-bulk RNA-seq data by using metadata alignment to aggregate synonymous samples.

The Tabula Sapiens Consortium (2022). "The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans." Science 376(6594).

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn
import sys
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from bokeh.io import output_notebook, export_svg, output_file, save
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
# Raise the interpreter's recursion limit far above the default.
# NOTE(review): presumably needed by the hierarchical clustering behind the
# seaborn clustermaps further down — confirm before removing.
sys.setrecursionlimit(100000)

In [None]:
# Direct Bokeh output to the notebook so show() renders inline.
output_notebook()

## Load Data

In [None]:
# Directory holding the Tabula Sapiens inputs. Named `data_dir` rather than
# `dir` so the builtin dir() stays usable in later cells.
data_dir = 'newdata/TabulaSapiens/'
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load a file produced by a trusted pipeline.
# The frame's row labels are treated as gene symbols by the cells below.
tabsap = pd.read_pickle(data_dir+'picklesbydonors/ts_10x_rem-dups_cell-ontology-class_tissue_master-sumsv1')
tabsap

## Pre-Process Data

In [None]:
# Gene info table restricted to protein-coding entries.
gene_info = pd.read_csv('tables/gene_info', sep='\t')
is_protein_coding = gene_info['type_of_gene'] == 'protein-coding'
genes = gene_info[is_protein_coding]
genelist = genes['Symbol'].to_list()

# Symbols listed in the production synonym table.
synonym_table = pd.read_csv('production/gene_synonym.csv')
syngenes = synonym_table['symbol'].to_list()

In [None]:
# Rows of `tabsap` to discard: any gene that is not a protein-coding symbol,
# or that appears in the synonym table.
# Sets give O(1) membership tests; the original scanned both lists once per
# gene. Index.unique() keeps first-seen order, so `droplist` is unchanged.
_protein_coding_symbols = set(genelist)
_synonym_symbols = set(syngenes)
droplist = [
    gene
    for gene in tabsap.index.unique()
    if gene not in _protein_coding_symbols or gene in _synonym_symbols
]

In [None]:
# Remove the rejected genes (row labels) from the expression matrix.
tabsap = tabsap.drop(index=droplist)
tabsap

## Process Data

In [None]:
# Reshape the matrix into long form: one row per (row label, column label) pair.
edgelist = pd.DataFrame(tabsap.stack(), dtype=int)
# Collapse repeated (gene, column label) pairs to their median and truncate to int.
# NOTE(review): the .sparse accessor only exists on sparse-typed columns —
# presumably the pickled matrix is stored sparse; confirm.
edgelist = edgelist.sparse.to_dense().groupby(level=[0,1]).median().astype(int).reset_index()
# Keep only pairs with a non-zero median count.
edgelist = edgelist[edgelist[0] != 0].reset_index(drop=True)
edgelist.columns = ['gene', 'cell', 'count']
print(len(edgelist.gene.unique()), ' genes, ', len(edgelist.cell.unique()), ' cell types', sep='')
edgelist

In [None]:
def z(gene):
    """Return the z-score of one edge-list row's count within its gene.

    ``gene`` is a row exposing ``'gene'`` and ``'count'``. The per-gene mean
    and standard deviation are read from the module-level ``genemeans`` and
    ``genedevs`` lookups. A standard deviation of zero yields 0.
    """
    symbol = gene['gene']
    center = genemeans[symbol]
    spread = genedevs[symbol]
    return 0 if spread == 0 else (gene['count'] - center) / spread

In [None]:
# Per-gene summary statistics of the counts; NaN entries (e.g. the std of a
# gene with a single row) become 0, which z() maps to a z-score of 0.
genestats = edgelist.groupby('gene').describe().replace(np.nan, 0)
count_stats = genestats['count']
genemeans = count_stats['mean'].to_dict()
genedevs = count_stats['std'].to_dict()

# Score every (gene, cell) row against its own gene's distribution.
edgelist['z'] = edgelist.apply(z, axis=1)
edgelist

In [None]:
# Keep, for every cell type, the 100 rows with the highest z-score.
edgelist = edgelist.sort_values(['cell', 'z'], ascending=[True, False])
edgelist = edgelist.set_index('cell')
# groupby(...).head(100) preserves the sorted row order and returns a frame
# indexed by cell. The previous loop used `edgelist.loc[cell][:100]`, which
# yields a Series (not a one-row frame) for a cell type with a single row and
# corrupted the concatenation; it also re-copied `top` on every iteration.
top = edgelist.groupby(level='cell', sort=False).head(100)
edgelist = edgelist.reset_index()
top

In [None]:
# Map each cell type (index of `top`) to a space-separated string of its genes.
# Grouping on the index handles a cell type with exactly one row, where
# `top.loc[cell]['gene']` is a plain string and `.tolist()` raised.
# sort=False keeps the cell types in first-seen order, as before.
genesets = top.groupby(level=0, sort=False)['gene'].agg(' '.join).to_dict()

In [None]:
# From here on the edge list holds only the per-cell top rows.
edgelist = top.reset_index()
edgelist.columns = ['cell','gene','count','z']
n_genes = len(edgelist.gene.unique())
n_cells = len(edgelist.cell.unique())
print(f'{n_genes} genes, {n_cells} cell types')
edgelist

## Harmonizome Additions

### Resource

In [None]:
# Resource record for Tabula Sapiens. The leading 104 is the value the dataset
# record below carries in its resource_fk position.
# NOTE(review): no column header is given for this tuple; the meaning of the
# trailing '306', '1' and None fields should be confirmed against the schema.
(104,
'Tabula Sapiens',
None,
'Tabula Sapiens is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects.',
'Human transcriptome reference at single cell resolution',
'https://tabula-sapiens-portal.ds.czbiohub.org/',
'306',
'1',
None)

### Dataset

In [None]:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
# Dataset record; the per-field labels below follow the column order above.
(134,  # id
'Tabula Sapiens Gene-Cell Associations',  # name
'Gene-Cell Associations',  # name_without_resource
'Gene expression counts for single human cells',  # description
'gene-cell type associations by differential expression of gene across cell types ',  # association (NOTE(review): trailing space inside the string)
'genes with high or low expression in {0} relative to other cell types from the Tabula Sapiens Gene-Cell Associations dataset.',  # gene_set_description
# NOTE(review): the next description says "other tissue samples" while its
# siblings say "other cell types" — confirm which wording is intended.
'sets of genes with high or low expression in each cell type relative to other tissue samples from the Tabula Sapiens Gene-Cell Associations dataset.',  # gene_sets_description
'cell types with high or low expression of {0} gene relative to other cell types from the Tabula Sapiens Gene-Cell Associations dataset.',  # attribute_set_description
0,  # is_signed
1,  # is_continuous_valued
'2022-12-1',  # last_updated
'tabulasapiens',  # directory
0,  # num_page_views
104,  # resource_fk (id of the resource record above)
16,  # measurement_fk
7,  # dataset_group_fk
2,  # attribute_type_fk
1,  # attribute_group_fk
'gene expression by RNA-seq',  # evidence_type
'primary experimental data',  # evidence_group
'measurement_bias',  # measurement_bias (NOTE(review): value equals the column name — looks like an unfilled placeholder)
'cell types')  # attribute_type_plural

### Publication

In [None]:
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
# Publication record; id 137 is the value the naming-authority record ends with.
# NOTE(review): the url field has no scheme ('dx.doi.org/...') while pubmed_url
# does; the last field (pages) holds 6594, which the long citation uses as the
# issue number — confirm both.
(137, 'Consortium, Tabula Sapiens et al. (2022). "The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans." Science 376(6594).', 'Tabula Sapiens, Science, 2022', 'dx.doi.org/10.1126/science.abl4896', 35549404, 'https://pubmed.ncbi.nlm.nih.gov/35549404/', 'Consortium', 'Tabula Sapiens', 'Science', 2022, 'The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans', 376, 6594)

### Genes To Add

In [None]:
#(id,symbol,ncbi_entrez_gene_id,name,ncbi_entrez_gene_url)
# Build gene records for every gene in the edge list whose upper-cased symbol
# is not already in the production gene table, then print them as tuples.
geneinfo = genes.get(['GeneID', 'Symbol', 'description']).set_index('Symbol')
url = 'https://ncbi.nlm.nih.gov/gene/'
index = 57195  # first id handed out to a new gene
prodgenes = pd.read_csv('production/gene_updated.csv')['symbol'].to_list()
_known_symbols = set(prodgenes)  # O(1) membership instead of a list scan per gene

# Collect rows in a list and build the frame once; concatenating inside the
# loop re-copied the whole frame on every new gene.
_rows = []
for symbol in edgelist['gene'].unique().tolist():
    if symbol.upper() in _known_symbols:
        continue
    info = geneinfo.loc[symbol]
    _rows.append((index, symbol.upper(), info['GeneID'], info['description']))
    index += 1

newgenes = pd.DataFrame(_rows, columns=['id', 'symbol', 'ncbi_entrez_gene_id', 'name'])
newgenes['url'] = newgenes['ncbi_entrez_gene_id'].apply(lambda x: url+str(x))

# One tuple per line, in the column order of the header comment above.
for record in newgenes.itertuples(index=False):
    print(tuple(record), end=',\n')

### Naming Authority

In [None]:
# Naming-authority record. The trailing 137 equals the id of the publication
# record above — presumably a publication foreign key; confirm against the schema.
(100, 'Tabula Sapiens', 'TS', 'The Tabula Sapiens Consortium used single-cell transcriptomics to measure the messenger RNA molecules in each of nearly 500,000 cells from 24 tissues and organs.', 'https://tabula-sapiens-portal.ds.czbiohub.org/', 137)

### Attributes To Add

In [None]:
# Build a {preferred label: class id} lookup from the Cell Ontology table,
# then overlay the manually curated annotations.
celltypes = pd.read_csv('tables/CL.csv')
# .copy() so the column assignments below act on an independent frame rather
# than a filtered slice (the source of SettingWithCopyWarning).
celltypes = celltypes[celltypes['Obsolete']==False].copy()
# Reduce the class IRI to its last path segment and read the namespace as the
# part before the first underscore.
celltypes['Class ID'] = celltypes['Class ID'].str.split('/').str[-1]
celltypes['Namespace'] = celltypes['Class ID'].str.split('_').str[0]
celltypes = celltypes[celltypes['Namespace']=='CL']
celltypes = celltypes.set_index('Preferred Label')['Class ID'].to_dict()

# Manual annotations; rows whose CLID is the string '0' are discarded.
manualannotation = pd.read_csv('newdata/TabulaSapiens/tabsapannotation.csv', index_col='Unnamed: 0')
manualannotation = manualannotation[manualannotation['CLID']!='0']
manualannotation = manualannotation.set_index('Cell Type')['CLID'].to_dict()

# Manual entries override ontology labels with the same key.
celltypes.update(manualannotation)

In [None]:
# One row per distinct cell label in the edge list.
attributes = pd.DataFrame(edgelist['cell'].unique())

# Labels are split on '-': first piece is the tissue, second piece the cell type.
# NOTE(review): only the second piece is kept, so any further '-'-separated
# pieces of the label are dropped before the lookup — confirm this is intended.
label_parts = attributes[0].apply(str.split, sep='-')
attributes['tissue'] = label_parts.str[0]
attributes['cell'] = label_parts.str[1]

# Flag which cell types have an id in the lookup and attach it, else 'unmapped'.
attributes['map'] = attributes['cell'].apply(lambda x: x in celltypes)
attributes['id'] = [
    celltypes[cell_name] if is_mapped else 'unmapped'
    for cell_name, is_mapped in zip(attributes['cell'], attributes['map'])
]

# Sequential ids: attribute ids start at 297886, gene set ids at 133300000.
attributes = attributes.rename(columns={0: 'name'})
attributes.insert(0, 'attribute_id', attributes.index + 297886)
attributes.insert(0, 'gene_set_id', attributes.index + 133300000)
attributes

In [None]:
# Walks every attribute row; the print that emitted the attribute tuples is
# commented out, so this loop currently produces no output.
for attribute in attributes.index:
    attribute = attributes.loc[attribute]
    #print((attribute['attribute_id'], attribute['name'], '-'.join(attribute['name'].split(sep='-')[1:])+' from '+' '.join(attribute['tissue'].split(sep='_')).lower()+' associated with '+attribute['id'], 100), end=',\n')

### Gene Sets to Add

In [None]:
# Walks every attribute row; the print that emitted the gene set tuples is
# commented out, so this loop currently produces no output.
for attribute in attributes.index:
    attribute = attributes.loc[attribute]
    #print((attribute['gene_set_id'], attribute['name'], '-'.join(attribute['name'].split(sep='-')[1:])+' from '+' '.join(attribute['tissue'].split(sep='_')).lower()+' associated with '+attribute['id'], 134, 2, attribute['attribute_id']), end=',\n')

### Associations to Add

In [None]:
# {column: {symbol: value}} lookups for the production genes...
production_genes = pd.read_csv('production/gene_updated.csv').set_index('symbol')
genedict = production_genes.get(['id', 'ncbi_entrez_gene_id']).to_dict()
# ...extended with the newly added genes. `newgenes` is rebound from a frame
# to a dict here, so this cell cannot simply be re-run on its own.
newgenes = newgenes.set_index('symbol').get(['id', 'ncbi_entrez_gene_id']).to_dict()
for field, lookup in genedict.items():
    lookup.update(newgenes[field])

# Cell label -> gene set id.
genesetdict = attributes.set_index('name')['gene_set_id'].to_dict()

In [None]:
# Resolve every edge to its gene ids and gene set id; lookups use the
# upper-cased symbol, matching the keys of `genedict`.
symbols_upper = edgelist['gene'].apply(str.upper)
edgelist['gene_fk'] = symbols_upper.apply(lambda sym: genedict['id'][sym])
edgelist['gene_id'] = symbols_upper.apply(lambda sym: genedict['ncbi_entrez_gene_id'][sym])
edgelist['gene_set_fk'] = edgelist['cell'].apply(lambda label: genesetdict[label])
edgelist

In [None]:
# Association rows: foreign keys plus count, z-score and a constant threshold of 1.
# assign() returns a new frame, so the threshold column is never written into
# a column-subset copy of `edgelist` (which triggered SettingWithCopyWarning).
associations = edgelist.get(['gene_fk', 'gene_set_fk', 'count', 'z']).assign(threshold=1)
# Offset the row index; it is written out as the first CSV column.
associations.index += 13000000
associations.to_csv('harmonizome-update/tabulasapiens.csv')
associations

## Download Files

In [None]:
# Directory prefix for every download file written by the cells below.
output_path = 'newdata/TabulaSapiens/downloads/'

### Gene-Attribute Matrix

In [None]:
# Gene x cell indicator matrix: 1 where the edge list has the pair, else 0.
# aggfunc is passed as the string 'max': pandas 2.1+ warns that the builtin
# `max` will stop being mapped to the groupby max in a future version.
binarymatrix = pd.crosstab(edgelist['gene'], edgelist['cell'], values=1, aggfunc='max').replace(np.nan, 0).astype(int)
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Cell label -> mapped id (or 'unmapped'); a repeated label keeps its last id.
attributesdict = dict(zip(attributes['name'], attributes['id']))

In [None]:
# Attach the cell id and the constant threshold, then keep the export columns.
edgelist['cell_id'] = [attributesdict[label] for label in edgelist['cell']]
edgelist['threshold'] = 1
export_columns = ['gene', 'gene_id', 'cell', 'cell_id', 'count', 'z', 'threshold']
edgelist = edgelist.get(export_columns)
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (gene, gene_id) pairs, renumbered from 0. Rebinds `genelist`,
# which held the protein-coding symbol list earlier in the notebook.
gene_pairs = edgelist.get(['gene', 'gene_id'])
genelist = gene_pairs.drop_duplicates(ignore_index=True)
genelist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
genelist

### Attribute List

In [None]:
# Distinct (cell, cell_id) pairs, renumbered from 0.
cell_pairs = edgelist.get(['cell', 'cell_id'])
attributelist = cell_pairs.drop_duplicates(ignore_index=True)
attributelist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributelist

### Gene Set Library

In [None]:
# One tab-separated line per column of `binarymatrix`: the column label
# followed by every row label whose entry in that column is 1.
arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for col, term in enumerate(binarymatrix.columns):
        members = binarymatrix.index[arr[:, col] == 1]
        f.write('\t'.join(map(str, [term, *members])) + '\n')

### Attribute Set Library

In [None]:
# One tab-separated line per column of `binarymatrixT`: the column label
# followed by every row label whose entry in that column is 1.
arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    for col, term in enumerate(binarymatrixT.columns):
        members = binarymatrixT.index[arr[:, col] == 1]
        f.write('\t'.join(map(str, [term, *members])) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between the rows of `binarymatrix`.
# pdist returns condensed distances; squareform expands them to a square
# matrix, and 1 - distance converts distance to similarity.
similarity_matrix = dist.pdist(binarymatrix.to_numpy(dtype=np.int_), 'cosine')
similarity_matrix = dist.squareform(similarity_matrix)
similarity_matrix = 1 - similarity_matrix

gene_similarity_matrix = pd.DataFrame(similarity_matrix, index=binarymatrix.index, columns=binarymatrix.index)
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None

# File name fixed from 'genee_similarity_matrix_cosine' so it matches the
# naming of the attribute similarity file.
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between the rows of `binarymatrixT`:
# condensed distances -> square matrix -> similarity (1 - distance).
row_vectors = binarymatrixT.to_numpy(dtype=np.int_)
similarity_matrix = 1 - dist.squareform(dist.pdist(row_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    similarity_matrix,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None

attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Gene-Attribute Cleaned Matrix

In [None]:
# Gene x cell matrix of counts (0 where the edge list has no such pair).
# aggfunc is passed as the string 'max': pandas 2.1+ warns that the builtin
# `max` will stop being mapped to the groupby max in a future version.
cleanbinarymatrix = pd.crosstab(edgelist['gene'], edgelist['cell'], edgelist['count'], aggfunc='max').replace(np.nan, 0).astype(int)
cleanbinarymatrix.to_csv(output_path+'gene_attribute_matrix_cleaned.txt.gz', sep='\t', compression='gzip')
cleanbinarymatrix

### Gene-Attribute Standardized Matrix

In [None]:
# Gene x cell matrix of z-scores (0 where the edge list has no such pair).
# aggfunc is passed as the string 'max': pandas 2.1+ warns that the builtin
# `max` will stop being mapped to the groupby max in a future version.
standardbinarymatrix = pd.crosstab(edgelist['gene'], edgelist['cell'], edgelist['z'], aggfunc='max').replace(np.nan, 0)
standardbinarymatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardbinarymatrix

### Knowledge Graph Serializations

In [None]:
# In-memory knowledge graph: `nodes` is keyed by label, `edges` is a list of
# gene -> cell "expressed in" relations.
nodes = {}
edges = []

# Gene nodes.
for symbol, entrez_id in zip(genelist['gene'], genelist['gene_id']):
    nodes[symbol] = {
        "type":"gene",
        "properties": {
            "id":int(entrez_id),
            "label":symbol
        }}

# Cell nodes.
for cell_label, cell_id in zip(attributelist['cell'], attributelist['cell_id']):
    nodes[cell_label] = {
        "type":"cell",
        "properties": {
            "id":cell_id,
            "label":cell_label
        }}

# One edge per edge-list row.
edge_columns = zip(
    edgelist['gene'], edgelist['gene_id'],
    edgelist['cell'], edgelist['cell_id'],
    edgelist['count'], edgelist['z'],
)
for symbol, entrez_id, cell_label, cell_id, count, zscore in edge_columns:
    edges.append({
        "source": symbol,
        "relation": "expressed in",
        "target": cell_label,
        "properties":{
            "id":symbol+":"+cell_label,
            "source_id":int(entrez_id),
            "source_label":symbol,
            "target_label":cell_label,
            "target_id":cell_id,
            "directed":True,
            "count":int(count),
            "z":zscore,
            "threshold":1
        }})

RDF

In [None]:
from urllib.parse import quote

# Write one triple per edge: gene -- RO:0002206 --> cell label.
# Turtle requires prefix IRIs to be absolute, wrapped in <...> and terminated
# by " ."; the previous header lines were bare host/path strings, which an
# RDF parser rejects. The RO prefix now also includes the '/obo/' segment of
# the OBO PURLs.
with open(output_path+'serializations/tabulasapiens.rdf', 'w') as f:
    print('@prefix gene: <https://ncbi.nlm.nih.gov/gene/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('', file=f)
    for edge in edges:
        # Percent-encode the label: spaces and similar characters are not
        # allowed inside an <...> IRI reference.
        target_iri = '<'+quote(edge['target'])+'>'
        print('gene:'+str(edge['properties']['source_id']), 'RO:0002206', target_iri, end=' .\n', file=f)

JSON

In [None]:
# Serialize the node and edge collections to a single JSON document.
# json.dump writes to the file and returns None, so its result is no longer
# bound to a name (the former `serial` was always None).
with open(output_path+'serializations/tabulasapiens.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

TSV

In [None]:
# Flatten the node dict into a (namespace, id, label) table.
NODE_NAMESPACES = {'gene':'NCBI Entrez', 'cell':'Cell Ontology'}

nodeframe = pd.DataFrame(nodes).T
node_properties = nodeframe['properties']
nodeframe['id'] = [props['id'] for props in node_properties]
nodeframe['label'] = [props['label'] for props in node_properties]
nodeframe['namespace'] = [NODE_NAMESPACES[kind] for kind in nodeframe['type']]
nodeframe = nodeframe.get(['namespace', 'id', 'label']).reset_index(drop=True)
nodeframe.to_csv(output_path+'serializations/tabulasapiens_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records into a table, lifting selected properties to columns.
edgeframe = pd.DataFrame(edges)
edge_properties = edgeframe['properties']
for field in ('count', 'z', 'threshold'):
    edgeframe[field] = [props[field] for props in edge_properties]
edgeframe = edgeframe.get(['source', 'relation', 'target', 'count', 'z', 'threshold'])
edgeframe.to_csv(output_path+'serializations/tabulasapiens_tsv/edges.tsv', sep='\t')
edgeframe

## Visualizations

In [None]:
# Clustered heatmap of the gene x cell indicator matrix, colormap centred on 0.
seaborn.clustermap(binarymatrix, cmap='seismic', center=0)

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix, colormap centred on 0.
seaborn.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

In [None]:
# Clustered heatmap of the cell-cell cosine similarity matrix, colormap centred on 0.
seaborn.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
# Embed the gene sets: TF-IDF over each set's gene tokens, neighbour graph,
# Leiden clustering and a 2-D UMAP layout.
vec = TfidfVectorizer(max_df=0.5, min_df=10)
X = vec.fit_transform(genesets.values())
adata = anndata.AnnData(X, dtype='float32')
adata.obs.index = genesets.keys()

sc.pp.neighbors(adata, n_neighbors=25, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.18, spread=3)

# Order observations by cluster label and make the labels human-readable.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

mapped_df = pd.DataFrame(adata.obsm['X_umap'])
mapped_df.columns = ['x', 'y']

mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# One colour per cluster: the even-indexed palette entries first, then the
# odd-indexed ones. The palette is reused cyclically when there are more
# clusters than colours (modulus taken from the palette length, not a literal 20).
clusters = pd.unique(mapped_df['cluster']).tolist()
colors = list(Category20[20])[::2] + list(Category20[20])[1::2]
color_mapper = {cluster: colors[i % len(colors)] for i, cluster in enumerate(clusters)}

mapped_df['color'] = mapped_df['cluster'].apply(lambda x: color_mapper[x])

xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Tooltip markup: every row <div> is now closed (two were left open) and the
# outer margin carries a unit, so the HTML/CSS is well-formed.
# NOTE(review): `names=` on HoverTool and `plot_width`/`plot_height` on
# figure() belong to the Bokeh 2.x API and were removed in Bokeh 3 — confirm
# the pinned Bokeh version before upgrading.
hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10px">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in Tabula Sapiens Library'
plot_emb = figure(plot_width=1000, plot_height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
# The glyph is named "df" so the hover tool above attaches to it.
plot_emb.circle( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)