# Tabula Sapiens Gene-Cell Associations

[Tabula Sapiens](https://tabula-sapiens-portal.ds.czbiohub.org/) provides human transcriptomics data at single-cell resolution. The resource is a benchmark, first-draft human cell atlas of nearly 500,000 cells from 24 organs of 15 normal human subjects. The raw download, which is single-cell RNA-seq data, was processed into pseudo-bulk RNA-seq data by using metadata alignment to aggregate synonymous samples.

The Tabula Sapiens Consortium (2022). "The Tabula Sapiens: A multiple-organ, single-cell transcriptomic atlas of humans." Science 376(6594).

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import scanpy as sc
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from bokeh.io import output_notebook, export_svg, output_file, save
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20

In [None]:
# Configure Bokeh to render plots inline in the notebook (used by show() further down).
output_notebook()

## Load Data

In [None]:
# Base directory of the Tabula Sapiens inputs, relative to the working directory.
# NOTE(review): `dir` shadows the Python builtin of the same name for the rest of the session.
dir = 'newdata/TabulaSapiens/'
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
# Presumably a gene x (tissue-cell type) count matrix: later cells treat the index as gene
# symbols and the stacked column labels as cell types. Confirm against the pickle.
tabsap = pd.read_pickle(dir+'picklesbydonors/ts_10x_rem-dups_cell-ontology-class_tissue_master-sumsv1')
tabsap

## Pre-Process Data

In [None]:
# Reference tables used to filter the matrix rows:
#   genelist - symbols whose `type_of_gene` is 'protein-coding' in the gene_info table
#   syngenes - entries of the `symbol` column of the synonym table
genes = pd.read_csv('tables/gene_info', sep='\t')
is_protein_coding = genes['type_of_gene'] == 'protein-coding'
genes = genes.loc[is_protein_coding]
genelist = genes['Symbol'].to_list()

synonym_table = pd.read_csv('production/gene_synonym.csv')
syngenes = synonym_table['symbol'].to_list()

In [None]:
# Collect the row labels to discard: any gene that is not in `genelist`
# (protein-coding symbols) or that appears in `syngenes` (synonym table).
# The lookups are turned into sets once so each membership test is O(1)
# instead of a linear scan of a list per gene.
protein_coding_symbols = set(genelist)
synonym_symbols = set(syngenes)
droplist = [
    gene
    for gene in tabsap.index.unique()
    if gene not in protein_coding_symbols or gene in synonym_symbols
]

In [None]:
# Remove every row whose label was collected in `droplist`.
tabsap = tabsap.drop(index=droplist)
tabsap

## Process Data

In [None]:
# Reshape the matrix into long form: one row per (row label, column label) pair.
edgelist = pd.DataFrame(tabsap.stack(), dtype=int)
# Densify, then collapse repeated (level 0, level 1) pairs to their median.
# NOTE(review): the `.sparse` accessor only exists for sparse-typed columns, so this presumably
# relies on the pickle holding sparse data - confirm.
# NOTE: `.astype(int)` truncates any fractional median (e.g. x.5 from an even number of duplicates).
edgelist = edgelist.sparse.to_dense().groupby(level=[0,1]).median().astype(int).reset_index()
# Keep only non-zero associations; column 0 is the stacked value column at this point.
edgelist = edgelist[edgelist[0] != 0].reset_index(drop=True)
edgelist.columns = ['gene', 'cell', 'count']
print(len(edgelist.gene.unique()), ' genes, ', len(edgelist.cell.unique()), ' cell types', sep='')
edgelist

In [None]:
def z(gene):
    """Return the z-score of one edge-list row's count within its gene.

    ``gene`` is a row-like mapping with ``'gene'`` and ``'count'`` entries.
    The gene's mean and standard deviation are looked up in the module-level
    ``genemeans`` and ``genedevs`` dictionaries. A standard deviation of zero
    yields 0 instead of dividing by zero.
    """
    symbol = gene['gene']
    center, spread = genemeans[symbol], genedevs[symbol]
    if spread != 0:
        return (gene['count'] - center) / spread
    return 0

In [None]:
# Per-gene summary statistics of the edge list. NaN entries (e.g. the std of a
# gene with a single row) are replaced by 0, which z() maps to a z-score of 0.
genestats = edgelist.groupby('gene').describe().replace(np.nan, 0)

# Lookup tables read by z(): gene symbol -> mean / standard deviation of `count`.
count_summary = genestats['count']
genemeans = count_summary['mean'].to_dict()
genedevs = count_summary['std'].to_dict()

# Score every (gene, cell) row against its gene's distribution.
edgelist['z'] = edgelist.apply(z, axis=1)
edgelist

In [None]:
# For every cell type keep the 100 rows with the highest raw count.
#
# groupby(...).head(n) takes the first n rows of each group in their current
# order, so sorting first makes it a per-cell top-n. This replaces growing a
# frame with pd.concat inside a loop (quadratic) and also handles a cell type
# with a single row, where `edgelist.loc[cell]` returns a Series, not a frame.
# `topcount` is indexed by cell with columns gene / count / z; `edgelist` is
# left sorted, with 'cell' restored as a regular column.
edgelist = edgelist.sort_values(['cell', 'count'], ascending=[True, False])
edgelist = edgelist.set_index('cell')
topcount = edgelist.groupby(level='cell', sort=False).head(100)
edgelist = edgelist.reset_index()
topcount

In [None]:
# For every cell type keep the 100 rows with the highest z-score.
#
# groupby(...).head(n) takes the first n rows of each group in their current
# order, so sorting first makes it a per-cell top-n. This replaces growing a
# frame with pd.concat inside a loop (quadratic) and also handles a cell type
# with a single row, where `edgelist.loc[cell]` returns a Series, not a frame.
# `top100` is indexed by cell with columns gene / count / z; `edgelist` is
# left sorted, with 'cell' restored as a regular column.
edgelist = edgelist.sort_values(['cell', 'z'], ascending=[True, False])
edgelist = edgelist.set_index('cell')
top100 = edgelist.groupby(level='cell', sort=False).head(100)
edgelist = edgelist.reset_index()
top100

In [None]:
# For every cell type keep the 250 rows with the highest z-score.
#
# groupby(...).head(n) takes the first n rows of each group in their current
# order, so sorting first makes it a per-cell top-n. This replaces growing a
# frame with pd.concat inside a loop (quadratic) and also handles a cell type
# with a single row, where `edgelist.loc[cell]` returns a Series, not a frame.
# `top250` is indexed by cell with columns gene / count / z; `edgelist` is
# left sorted, with 'cell' restored as a regular column.
edgelist = edgelist.sort_values(['cell', 'z'], ascending=[True, False])
edgelist = edgelist.set_index('cell')
top250 = edgelist.groupby(level='cell', sort=False).head(250)
edgelist = edgelist.reset_index()
top250

In [None]:
# For every cell type keep the 500 rows with the highest z-score.
#
# groupby(...).head(n) takes the first n rows of each group in their current
# order, so sorting first makes it a per-cell top-n. This replaces growing a
# frame with pd.concat inside a loop (quadratic) and also handles a cell type
# with a single row, where `edgelist.loc[cell]` returns a Series, not a frame.
# `top500` is indexed by cell with columns gene / count / z; `edgelist` is
# left sorted, with 'cell' restored as a regular column.
edgelist = edgelist.sort_values(['cell', 'z'], ascending=[True, False])
edgelist = edgelist.set_index('cell')
top500 = edgelist.groupby(level='cell', sort=False).head(500)
edgelist = edgelist.reset_index()
top500

In [None]:
# Map each cell type to its top-100 genes joined into one space-separated
# string (the "document" format consumed by the TF-IDF vectorizer).
# Grouping on the index keeps the first-appearance order of the cell types and
# also works for a cell type with a single row, where `top100.loc[cell]['gene']`
# is a plain string with no `.tolist()`.
genesets = (
    top100.groupby(level=0, sort=False)['gene']
    .agg(' '.join)
    .to_dict()
)

In [None]:
# (number of distinct genes, number of distinct cell types) in the top-100 table
len(top100.gene.unique()), len(top100.index.unique())

In [None]:
# Vectorize each cell type's gene set (a space-joined string of symbols) with TF-IDF.
# max_df=0.5 / min_df=10 ignore tokens that occur in more than half of the gene sets
# or in fewer than 10 of them.
# NOTE(review): with scikit-learn's default analyzer, text is lowercased and split on
# non-word characters, so hyphenated symbols (e.g. HLA-DRA) would be broken into separate
# tokens - confirm this is acceptable for gene symbols.
vec = TfidfVectorizer(max_df=0.5, min_df=10)
X = vec.fit_transform(genesets.values())
# NOTE(review): the `dtype` argument of AnnData appears to be deprecated in newer anndata
# releases - verify against the installed version.
adata = anndata.AnnData(X, dtype='float32')
# Label the observations with the gene-set (cell type) names, in insertion order.
adata.obs.index = genesets.keys()

# Build the neighbor graph directly on the TF-IDF matrix, cluster it with Leiden,
# and compute a 2-D UMAP layout for plotting.
sc.pp.neighbors(adata, n_neighbors=25, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, min_dist=0.18, spread=3)

# Reorder the observations so members of the same Leiden cluster are contiguous,
# then turn the cluster ids into display labels ("Cluster <id>").
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order, :]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

# One row per gene set: UMAP coordinates, cluster label and gene-set name.
mapped_df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Assign colors per cluster: the even-position entries of the 20-color palette
# first, then the odd-position ones, wrapping around after 20 clusters.
clusters = pd.unique(mapped_df['cluster']).tolist()
palette = list(Category20[20])
colors = palette[::2] + palette[1::2]
color_mapper = {
    cluster_name: colors[position % 20]
    for position, cluster_name in enumerate(clusters)
}
mapped_df['color'] = [color_mapper[name] for name in mapped_df['cluster']]

xlabel, ylabel = 'UMAP 1', 'UMAP 2'

# Column data backing the scatter glyph: one entry per gene set, with constant
# alpha and size and a per-point fill color taken from the cluster mapping.
point_count = mapped_df.shape[0]
source2 = ColumnDataSource(
    data={
        'x': mapped_df.x,
        'y': mapped_df.y,
        'alpha': [0.7] * point_count,
        'colors': mapped_df['color'],
        'size': [6] * point_count,
        'gene_set': mapped_df['term'],
        'cluster': mapped_df['cluster'],
    }
)

# Hover tooltip showing the gene-set name, UMAP coordinates and cluster of a point.
# Each field lives in its own <div>; the original template left the first two
# field <div>s unclosed, which produced malformed (implicitly nested) HTML.
hover_emb = HoverTool(name="df", tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

# Assemble and display the interactive UMAP scatter plot of the gene sets.
tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in Tabula Sapiens Library'
plot_emb = figure(width=1000, height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
# scatter() draws circle markers by default with `size` in screen units; it is
# used instead of circle(size=...), which is deprecated in Bokeh 3.4 and later.
plot_emb.scatter('x', 'y', source=source2, size='size',
                 alpha='alpha', line_alpha=0, line_width=0.01, name="df",
                 fill_color='colors',
                 line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)

## Test Against Descartes Dataset

In [None]:
# Load the Descartes gene-set library into a frame indexed by term ('cell') with
# the gene list in 'genes' and its length in 'len'.
#
# Each line is tab-separated: the first field is the term, the second field is
# skipped, the remaining fields are genes, and the final field of the line is
# discarded (as in the original parsing). Rows are collected in a list and the
# frame is built once, and the file is closed by the `with` block.
descartes_rows = []
with open('newdata/TabulaSapiens/Descartes_Cell_Types_and_Tissue_2021.txt') as f:
    for line in f:
        fields = line.split('\n')[0].split('\t')[:-1]
        if not fields:
            # Blank line: nothing to parse.
            continue
        descartes_rows.append((fields[0], fields[2:]))

descartes = pd.DataFrame(descartes_rows, columns=['cell', 'genes'])
descartes = descartes.set_index('cell')
descartes['len'] = descartes['genes'].apply(len)
descartes

In [None]:
# Tabula Sapiens "<Tissue>-<cell type>" labels selected for the Descartes comparison.
# Order matters: the benchmark table pairs these positionally with the Descartes
# terms selected further down, so entry k here is compared with entry k there.
tscells = ['Eye-microglial cell',
'Pancreas-pancreatic ductal cell',
'Eye-retinal ganglion cell',
'Thymus-medullary thymic epithelial cell',
'Eye-corneal epithelial cell',
'Liver-hepatocyte',
'Liver-erythrocyte',
'Muscle-erythrocyte',
'Spleen-erythrocyte',
'Spleen-innate lymphoid cell',
'Muscle-skeletal muscle satellite stem cell',
'Heart-smooth muscle cell',
'Muscle-smooth muscle cell',
'Eye-limbal stromal cell',
'Lung-lung microvascular endothelial cell',
'Thymus-endothelial cell of artery',
'Eye-retinal bipolar neuron',
'Lung-mesothelial cell',
'Spleen-cd141-positive myeloid dendritic cell',
'Thymus-thymocyte']

In [None]:
# Restrict the top-by-count table to the benchmark cell types (in `tscells`
# order) and collapse it to one row per cell type holding its gene list.
# `.loc` selects the rows directly (raising KeyError on a missing label) instead
# of a double transpose through `.get`, which coerces every column to object
# dtype and returns None on a missing label.
topcount = topcount.loc[tscells]
tscount = (
    topcount.groupby(level=0, sort=False)['gene']
    .agg(list)
    .rename_axis(None)  # keep the index unnamed, as later cells expect
    .to_frame('gene')
)
tscount

In [None]:
# Restrict the top-100 (by z-score) table to the benchmark cell types (in
# `tscells` order) and collapse it to one row per cell type holding its gene list.
# `.loc` selects the rows directly (raising KeyError on a missing label) instead
# of a double transpose through `.get`, which coerces every column to object
# dtype and returns None on a missing label.
top100 = top100.loc[tscells]
ts100 = (
    top100.groupby(level=0, sort=False)['gene']
    .agg(list)
    .rename_axis(None)  # keep the index unnamed, as later cells expect
    .to_frame('gene')
)
ts100

In [None]:
# Restrict the top-250 (by z-score) table to the benchmark cell types (in
# `tscells` order) and collapse it to one row per cell type holding its gene list.
# `.loc` selects the rows directly (raising KeyError on a missing label) instead
# of a double transpose through `.get`, which coerces every column to object
# dtype and returns None on a missing label.
top250 = top250.loc[tscells]
ts250 = (
    top250.groupby(level=0, sort=False)['gene']
    .agg(list)
    .rename_axis(None)  # keep the index unnamed, as later cells expect
    .to_frame('gene')
)
ts250

In [None]:
# Restrict the top-500 (by z-score) table to the benchmark cell types (in
# `tscells` order) and collapse it to one row per cell type holding its gene list.
# `.loc` selects the rows directly (raising KeyError on a missing label) instead
# of a double transpose through `.get`, which coerces every column to object
# dtype and returns None on a missing label.
top500 = top500.loc[tscells]
ts500 = (
    top500.groupby(level=0, sort=False)['gene']
    .agg(list)
    .rename_axis(None)  # keep the index unnamed, as later cells expect
    .to_frame('gene')
)
ts500

In [None]:
# Put the four gene-list variants side by side (one column per variant) and
# lowercase the cell-type labels used as the index.
ts = pd.concat([tscount, ts100, ts250, ts500], axis=1)
ts.columns = ['tscount', 'ts100', 'ts250', 'ts500']
ts = (
    ts.reset_index()
    .assign(index=lambda frame: frame['index'].apply(str.lower))
    .set_index('index')
)
ts

In [None]:
# Load the Tabula Sapiens gene-set library and append it to `ts` as the
# 'tsenrichr' column, matched on the lowercased term.
#
# Each line is tab-separated: the first field is the term, the second field is
# skipped, the remaining fields are genes, and the final field of the line is
# discarded (as in the original parsing). Rows are collected in a list and the
# frame is built once, and the file is closed by the `with` block.
tsenrichr_rows = []
with open('newdata/TabulaSapiens/Tabula_Sapiens.txt') as f:
    for line in f:
        fields = line.split('\n')[0].split('\t')[:-1]
        if not fields:
            # Blank line: nothing to parse.
            continue
        tsenrichr_rows.append((fields[0].lower(), fields[2:]))

tsenrichr = pd.DataFrame(tsenrichr_rows, columns=['cell', 'tsenrichr'])
tsenrichr = tsenrichr.set_index('cell')
# Keep only the benchmark cell types, in the same order as `ts`.
tsenrichr = tsenrichr.loc[ts.index.to_list()]
ts = pd.concat([ts, tsenrichr], axis=1)
ts

In [None]:
# Keep only the Descartes terms used in the benchmark. Order matters: the
# benchmark table pairs these positionally with the entries of `tscells`.
# `.loc` selects the rows directly (raising KeyError on a missing term) instead
# of a double transpose through `.get`, which coerces every column to object
# dtype and returns None on a missing term.
descartes = descartes.loc[[
'Microglia in Eye',
'Ductal cells in Pancreas',
'Ganglion cells in Eye',
'Thymic epithelial cells in Thymus',
'Corneal and conjunctival epithelial cells in Eye',
'Hepatoblasts in Liver',
'Erythroblasts in Liver',
'Erythroblasts in Muscle',
'Erythroblasts in Spleen',
'Lymphoid cells in Spleen',
'Skeletal muscle cells in Muscle',
'Smooth muscle cells in Heart',
'Smooth muscle cells in Muscle',
'Stromal cells in Eye',
'Vascular endothelial cells in Lung',
'Vascular endothelial cells in Thymus',
'Bipolar cells in Eye',
'Mesothelial cells in Lung',
'Myeloid cells in Spleen',
'Thymocytes in Thymus'
]]
descartes

In [None]:
def jaccard(frame):
    """Return the row-wise Jaccard index between the first two columns of *frame*.

    Each cell of the two columns is expected to hold a collection of hashable
    items (e.g. a list of gene symbols). Duplicates inside a collection are
    ignored. For every row the result is ``|A & B| / |A | B|``; a row in which
    both collections are empty scores 0.0 instead of raising ZeroDivisionError.

    Columns are addressed by position, not by label, and rows are processed in
    their existing order. The returned float Series has one value per row and a
    default integer index; callers are expected to attach their own index.
    """
    scores = []
    for left, right in zip(frame.iloc[:, 0], frame.iloc[:, 1]):
        left_items, right_items = set(left), set(right)
        union_size = len(left_items | right_items)
        if union_size == 0:
            scores.append(0.0)
        else:
            scores.append(len(left_items & right_items) / union_size)
    return pd.Series(scores, dtype=float)

In [None]:
# Benchmark table: one row per (Descartes term, Tabula Sapiens cell type) pair,
# paired by position, with the gene list of every variant in its own column.
benchmark = pd.DataFrame(columns=['descartes cell', 'ts cell', 'descartes', 'tsenrichr', 'ts100','ts250','ts500'])
benchmark['descartes cell'] = descartes.index
benchmark['ts cell'] = ts.index
benchmark['descartes'] = benchmark['descartes cell'].apply(lambda term: descartes.loc[term, 'genes'])
# 'tscount' is not among the initial columns, so it is appended after 'ts500'.
for variant in ('tsenrichr', 'tscount', 'ts100', 'ts250', 'ts500'):
    benchmark[variant] = benchmark['ts cell'].apply(
        lambda cell, variant=variant: ts.loc[cell, variant]
    )
benchmark

In [None]:
# Jaccard index between the Descartes gene list and each Tabula Sapiens variant
# (benchmark columns 3 through 7), indexed by Tabula Sapiens cell type and
# sorted by the Descartes/TSEnrichr score.
jaccards = benchmark.copy().set_index('ts cell')
for variant in benchmark.columns[3:8]:
    scores = jaccard(benchmark.get(['descartes', variant]))
    jaccards['descartes/' + variant] = scores.to_frame().set_index(jaccards.index)
score_columns = ['descartes/tsenrichr','descartes/tscount','descartes/ts100','descartes/ts250','descartes/ts500']
jaccards = jaccards.get(score_columns).sort_values('descartes/tsenrichr')
jaccards

In [None]:
# Min / mean / max of the Jaccard indices per comparison (the tscount variant is
# left out), drawn as one point per statistic and comparison.
stats = pd.DataFrame(columns=jaccards.columns).get(['descartes/tsenrichr','descartes/ts100','descartes/ts250','descartes/ts500'])
for row_label, reducer in (('min', min), ('mean', np.mean), ('max', max)):
    stats.loc[row_label] = jaccards.apply(reducer, axis=0)
stats.columns=['Descartes/TSEnrichr','Descartes/TS100','Descartes/TS250','Descartes/TS500']
# From here on `stats` refers to the matplotlib Axes returned by seaborn, not the table.
stats = sns.scatterplot(data=stats.T, markers=['o','o','o'])
stats.set_title('Descartes-Tabula Sapiens Gene Set Similarity Statistics')
stats.set_ylabel('Jaccard Index')
plt.xticks(rotation=45, ha='right')
stats

In [None]:
# Plot the per-cell-type Jaccard indices of four comparisons, with a dashed
# horizontal line at each comparison's mean.
#
# The plot works on a renamed copy, so `jaccards` itself is left untouched and
# the cell runs whether or not its columns were renamed earlier in the session.
# The original read 'Descartes/...' columns that only exist after a rename that
# was commented out, which raises KeyError on a fresh top-to-bottom run.
display_names = {
    'descartes/tsenrichr': 'Descartes/TSEnrichr',
    'descartes/ts100': 'Descartes/TS100',
    'descartes/ts250': 'Descartes/TS250',
    'descartes/ts500': 'Descartes/TS500',
}
jaccard_plot_data = (
    jaccards.rename(columns=display_names)[list(display_names.values())]
    .astype(float)
)
# The Axes is bound to `ax` so that the jaccard() function is not overwritten.
ax = sns.scatterplot(data=jaccard_plot_data, markers=['o','o','o','o'])
plt.axhline(jaccard_plot_data['Descartes/TSEnrichr'].mean(), ls='--')
plt.axhline(jaccard_plot_data['Descartes/TS100'].mean(), c='orange', ls='--')
plt.axhline(jaccard_plot_data['Descartes/TS250'].mean(), c='green', ls='--')
plt.axhline(jaccard_plot_data['Descartes/TS500'].mean(), c='red', ls='--')
ax.set_title('Descartes-Tabula Sapiens Gene Set Similarity Indices')
ax.set_ylabel('Jaccard Index')
plt.xticks(rotation=60, ha='right')
plt.ylim(bottom=0)
# Legend means are computed from the plotted data rather than hard-coded.
plt.legend(labels=[
    f'{name} (mean={jaccard_plot_data[name].mean():.3f})'
    for name in jaccard_plot_data.columns
])
plt.show()

In [None]:
# Mean Jaccard index of every comparison column in `jaccards`.
jaccards.mean()