# Tables to Describe Tabula Sapiens

In [7]:
import os
import sys
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import numpy
import pickle
from scipy.sparse import issparse, csr_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def getFiles(category=None, model=None):
    h5ad_files = [] # list of all files
    targets    = [] # list of target
    for root, dirs, files in os.walk(DATAPATH):
        for file in files:
            if file.endswith('.h5ad'):
                h5ad_files.append(os.path.join(root, file))
    for root, dirs, files in os.walk(EMBPATH):
        for file in files:
            if file.endswith('.h5ad'):
                h5ad_files.append(os.path.join(root, file))
    for file in h5ad_files:
        if category is None and model in file:
            targets.append(file)
        elif model is None and category in file:
            targets.append(file)
        elif model == 'raw' and category in file:
            if 'geneformer' not in file and 'scgpt' not in file and 'Tgpt' not in file:
                targets.append(file)
        elif model is not None and category is not None and model in file and category in file:
            targets.append(file)
    return targets

def format_number(x):
    if isinstance(x, (float, np.float64)):
        return f"{x:.2g}"
    return x


## Assemble Data

In [2]:
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"
EMBPATH  = "/nfs/turbo/umms-indikar/shared/projects/foundation_models/experiments/tabulaSapiens"

In [3]:
h5ad_files = []
for root, dirs, files in os.walk(DATAPATH):
    for file in files:
        if file.endswith('.h5ad'):
            h5ad_files.append(os.path.join(root, file))

tsCategories = []
for file in h5ad_files:
    tsCategories.append(file.split('.')[0].split('/')[-1])
tsCategories.remove('TabulaSapiens')
print(tsCategories)
models = ['raw', 'geneformer', 'Tgpt', 'scgpt']
print(models)

['TS_germ line', 'TS_stromal', 'TS_Large_Intestine', 'TS_Lymph_Node', 'TS_Skin', 'TS_epithelial', 'TS_Spleen', 'TS_Vasculature', 'TS_Kidney', 'TS_Salivary_Gland', 'TS_Thymus', 'TS_Pancreas', 'TS_Fat', 'TS_Bladder', 'TS_endothelial', 'TS_Blood', 'TS_Trachea', 'TS_Bone_Marrow', 'TS_Tongue', 'TS_Lung', 'TS_Liver', 'TS_Small_Intestine', 'TS_immune', 'TS_Uterus', 'TS_Eye', 'TS_Mammary', 'TS_Prostate', 'TS_Heart', 'TS_Muscle']
['raw', 'geneformer', 'Tgpt', 'scgpt']


In [4]:
dfData = pd.DataFrame(index = tsCategories)
dfData.index = dfData.index.map(lambda x: x[3:])
dfRep = pd.DataFrame(index = models)

In [5]:
data = []

for d in dfData.index:
    f = getFiles(category = 'TS_' + d, model='raw') #, model='geneformer') #_geneformer.h5ad
    print(f)
    adata = sc.read_h5ad(f[0])

    # Number of cells
    num_cells  = adata.shape[0]

    # female to mail ratio
    gender_distribution = adata.obs['gender'].value_counts()
    genders = list(gender_distribution.keys())
    if len(genders) == 2:
        fmr = gender_distribution.female / gender_distribution.male
    elif 'male' in genders:
        fmr = 'M'
    else:
        fmr = 'F'
    
    # unique molecular identifiers
    umi_mean = adata.obs['n_counts_UMIs'].mean()
    umi_median = adata.obs['n_counts_UMIs'].median()
    umi_std = adata.obs['n_counts_UMIs'].std()

    # Number of genes
    n_genes_mean = adata.obs['n_genes'].mean()
    n_genes_median = adata.obs['n_genes'].median()
    n_genes_std = adata.obs['n_genes'].std()
    
    # Anatomical information
    anatomy = ', '.join(adata.obs['anatomical_information'].unique())

    # Compartment
    compartment = ', '.join(adata.obs['compartment'].unique())

    # Methods
    methods = ', '.join(adata.obs['method'].unique())

    # Dataset
    ds = ', '.join(adata.obs['donor'].unique())

    # Append the collected data to the list
    data.append({
        'dataset': d,
        'num_cells': num_cells,
        'female_to_male_ratio': fmr,
        'umi_mean': umi_mean,
        'umi_median': umi_median,
        'umi_std': umi_std,
        'n_genes_mean': n_genes_mean,
        'n_genes_median': n_genes_median,
        'n_genes_std': n_genes_std,
        'anatomy': anatomy,
        'compartment': compartment,
        'methods': methods,
        'donor': ds
    })
    print(data[-1])

# Create a DataFrame from the collected data
df_results = pd.DataFrame(data)


['/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/TS_germ line.h5ad']
{'dataset': 'germ line', 'num_cells': 11, 'female_to_male_ratio': 'M', 'umi_mean': 13773.818, 'umi_median': 11515.0, 'umi_std': 7367.68798828125, 'n_genes_mean': 2836.3636363636365, 'n_genes_median': 2775.0, 'n_genes_std': 1062.8599411707332, 'anatomy': 'nan', 'compartment': 'germ line', 'methods': '10X', 'donor': 'TSP14'}
['/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/TS_stromal.h5ad']
{'dataset': 'stromal', 'num_cells': 82478, 'female_to_male_ratio': 0.8408213369043633, 'umi_mean': 69257.82, 'umi_median': 10464.0, 'umi_std': 351686.34375, 'n_genes_mean': 3095.35902907442, 'n_genes_median': 2803.0, 'n_genes_std': 1462.1518160320982, 'anatomy': 'nan, Supradiaphagmatic, Parotid, Posterior, Endometrium, Myometrium, MAT, SCAT, Atria, Ventricle, exocrine, abdomen, chest, atria, ventricle, noCornea, lacrimalgland, diaphragm, rectusabdominus, whole, anterior, posterior, Su

## Dataset summary table

In [11]:
df_results = pd.DataFrame(data)
df_results['anatomy list'] = df_results['anatomy'].apply(lambda x: x.split(','))
df_results['num. anatomy'] = df_results['anatomy list'].apply(lambda x: 0 if x[0] == 'nan' else len(x))
df_results = df_results.drop(['anatomy list', 'anatomy'], axis=1)
df_results['donor list'] = df_results['donor'].apply(lambda x: x.split(','))
df_results['num. donor'] = df_results['donor list'].apply(lambda x: 0 if x[0] == 'nan' else len(x))
df_results = df_results.drop(['donor list', 'donor'], axis=1)
df_results['compartment list'] = df_results['compartment'].apply(lambda x: x.split(','))
df_results['num. compartment'] = df_results['compartment list'].apply(lambda x: 0 if x[0] == 'nan' else len(x))
df_results = df_results.drop(['compartment list', 'compartment'], axis=1)
df_results['10X'] = df_results['methods'].apply(lambda x: '\checkmark' if '10X' in x else '')
df_results['smartseq2'] = df_results['methods'].apply(lambda x: '\checkmark' if 'smartseq2' in x else '')
df_results = df_results.drop(['methods'], axis=1)
df_results.columns = ['Dataset', '\# Cells', 'F/M', 'avg. UMI', 'med. UMI', 'std. UMI', 'avg. \# genes', 'med. \# genes', 'std. \# genes', '\# anatomy', '\# donors', '\# compartments', '10X', 'smartseq2']
df_results = df_results.applymap(format_number)


latex_table = df_results.to_latex(index=False, escape=False, float_format="%.3f")

custom_table = latex_table.replace(
    r'\begin{tabular}{lrlllllllllll}',
    r'\begin{tabular}{l|rrrrrrrrrrrr}'
).replace(
    r'\toprule',
    r'\hline'
).replace(
    r'\midrule',
    r'\hline\hline'
).replace(
    '_', ' '
).replace(
    r"""\begin{tabular}{lrlllllllrrrll}
\hline
Dataset & \# Cells & F/M & avg. UMI & med. UMI & std. UMI & avg. \# genes & med. \# genes & std. \# genes & \# anatomy & \# donors & \# compartments & 10X & smartseq2 \\
""",
    r"""\begin{tabular}{l|rrrrrrrrrrrrrr}
\multicolumn{2}{c}{}& & \multicolumn{3}{c}{UMI} & \multicolumn{3}{c}{Num. Genes} &\multicolumn{3}{c}{Categories}&\multicolumn{2}{c}{Assays} \\
\cmidrule(lr){4-6} \cmidrule(lr){7-9}\cmidrule(lr){10-12}\cmidrule(lr){13-14}
Dataset & \# Cells & F/M &avg.&med.&std.&avg.&med.&std.&\# Anatomy &\# Donors&\# Compartments&10X&smartseq2\\
    """
)

# Print the LaTeX table
print(custom_table)



\begin{tabular}{l|rrrrrrrrrrrrrr}
\multicolumn{2}{c}{}& & \multicolumn{3}{c}{UMI} & \multicolumn{3}{c}{Num. Genes} &\multicolumn{3}{c}{Categories}&\multicolumn{2}{c}{Assays} \\
\cmidrule(lr){4-6} \cmidrule(lr){7-9}\cmidrule(lr){10-12}\cmidrule(lr){13-14}
Dataset & \# Cells & F/M &avg.&med.&std.&avg.&med.&std.&\# Anatomy &\# Donors&\# Compartments&10X&smartseq2\\
    \hline\hline
germ line & 11 & M & 1.4e+04 & 1.2e+04 & 7.4e+03 & 2.8e+03 & 2.8e+03 & 1.1e+03 & 0 & 1 & 1 & \checkmark &  \\
stromal & 82478 & 0.84 & 6.9e+04 & 1e+04 & 3.5e+05 & 3.1e+03 & 2.8e+03 & 1.5e+03 & 0 & 13 & 1 & \checkmark & \checkmark \\
Large Intestine & 13680 & 0.2 & 2.4e+04 & 6e+03 & 1.1e+05 & 1.8e+03 & 1.5e+03 & 9.8e+02 & 5 & 2 & 4 & \checkmark & \checkmark \\
Lymph Node & 53275 & 0.75 & 2.5e+04 & 5.9e+03 & 1.2e+05 & 2e+03 & 1.8e+03 & 8.1e+02 & 5 & 3 & 3 & \checkmark & \checkmark \\
Skin & 9424 & M & 1.2e+05 & 7.4e+03 & 5.3e+05 & 2.2e+03 & 2e+03 & 9.7e+02 & 0 & 2 & 4 & \checkmark & \checkmark \\
epithelial & 104

  df_results = df_results.applymap(format_number)


## Categories Tables

In [33]:
# Anatomy table
df_results = pd.DataFrame(data)
df_results = df_results.drop(['num_cells', 'female_to_male_ratio', 'num_cells', 'female_to_male_ratio', 'umi_mean',
       'umi_median', 'umi_std', 'n_genes_mean', 'n_genes_median',
       'n_genes_std', 'compartment', 'methods', 'donor'], axis=1)
df_results.columns = ['Dataset', 'Anatomy']
df_results['anatomy list'] = df_results['Anatomy'].apply(lambda x: x.split(','))
df_results['Anatomy'] = df_results['anatomy list'].apply(lambda x: ', '.join(x))
df_results = df_results.drop(['anatomy list'], axis=1)
latex = df_results.to_latex(index=False)
latex = latex.replace('nan, ', '').replace('nan,', '').replace('nan', '').replace('_', '').replace('\midrule', '\hline\hline').replace(r'\begin{tabular}{ll}', r'\begin{tabular}{l|p{0.8\linewidth}}')

print(latex)


\begin{tabular}{l|p{0.8\linewidth}}
\toprule
Dataset & Anatomy \\
\hline\hline
germ line &  \\
stromal &  Supradiaphagmatic,  Parotid,  Posterior,  Endometrium,  Myometrium,  MAT,  SCAT,  Atria,  Ventricle,  exocrine,  abdomen,  chest,  atria,  ventricle,  noCornea,  lacrimalgland,  diaphragm,  rectusabdominus,  whole,  anterior,  posterior,  Sublingual,  Cornea-etc,  Neuroretina-etc,  Sclera-etc,  Distal,  Proximal,  Abdomen,  Chest,  Anterior,  Diaphragm,  AortaVeneCava,  CoronaryArteries,  distal,  proximal,  MedialDistal ,  proxmedialdistal,  Neuron,  Endocrine,  Exocrine,  Aorta,  aorta \\
LargeIntestine & Distal,  Proximal,  distal,  proximal,   \\
LymphNode & Inguinal,  Supradiaphagmatic,  inguinal,  supradiaphragmatic,   \\
Skin &  abdomen,  chest,  Abdomen,  Chest \\
epithelial &  Parotid,  Anterior,  Posterior,  Endometrium,  Myometrium,  Conjunctiva,  Atria,  Ventricle,  exocrine,  abdomen,  chest,  atria,  ventricle,  noCornea,  lacrimalgland,  whole,  anterior,  posterior,

In [32]:
# Compartment table
df_results = pd.DataFrame(data)
df_results = df_results.drop(['num_cells', 'female_to_male_ratio', 'num_cells', 'female_to_male_ratio', 'umi_mean',
       'umi_median', 'umi_std', 'n_genes_mean', 'n_genes_median',
       'n_genes_std', 'anatomy', 'methods', 'donor'], axis=1)
df_results.columns = ['Dataset', 'Compartment']
df_results['Compartment list'] = df_results['Compartment'].apply(lambda x: x.split(','))
df_results['Compartment'] = df_results['Compartment list'].apply(lambda x: ', '.join(x))
df_results = df_results.drop(['Compartment list'], axis=1)
latex = df_results.to_latex(index=False)
latex = latex.replace('nan, ', '').replace('nan,', '').replace('nan', '').replace('_', '').replace('\midrule', '\hline\hline').replace(r'\begin{tabular}{ll}', r'\begin{tabular}{l|l}')

print(latex)


\begin{tabular}{l|p{0.8\linewidth}
\toprule
Dataset & Compartment \\
\hline\hline
germ line & germ line \\
stromal & stromal \\
LargeIntestine & immune,  epithelial,  stromal,  endothelial \\
LymphNode & immune,  endothelial,  stromal \\
Skin & immune,  stromal,  endothelial,  epithelial \\
epithelial & epithelial \\
Spleen & immune,  endothelial \\
Vasculature & stromal,  immune,  endothelial,  epithelial \\
Kidney & epithelial,  immune,  endothelial \\
SalivaryGland & epithelial,  immune,  stromal,  endothelial \\
Thymus & immune,  epithelial,  stromal,  endothelial \\
Pancreas & epithelial,  immune,  endothelial,  stromal \\
Fat & stromal,  endothelial,  immune \\
Bladder & immune,  stromal,  epithelial,  endothelial \\
endothelial & endothelial \\
Blood & immune \\
Trachea & immune,  epithelial,  stromal,  endothelial \\
BoneMarrow & immune \\
Tongue & epithelial,  immune,  stromal,  endothelial \\
Lung & epithelial,  immune,  endothelial,  stromal \\
Liver & immune,  endothelial, 

In [35]:
# Donor table
df_results = pd.DataFrame(data)
df_results = df_results.drop(['num_cells', 'female_to_male_ratio', 'num_cells', 'female_to_male_ratio', 'umi_mean',
       'umi_median', 'umi_std', 'n_genes_mean', 'n_genes_median',
       'n_genes_std', 'anatomy', 'methods', 'compartment'], axis=1)
df_results.columns = ['Dataset', 'Donor']
df_results['Donor list'] = df_results['Donor'].apply(lambda x: x.split(','))
df_results['Donor'] = df_results['Donor list'].apply(lambda x: ', '.join(x))
df_results = df_results.drop(['Donor list'], axis=1)
latex = df_results.to_latex(index=False)
latex = latex.replace('nan, ', '').replace('nan,', '').replace('nan', '').replace('_', '').replace('\midrule', '\hline\hline').replace(r'\begin{tabular}{ll}', r'\begin{tabular}{l|l}')

print(latex)


\begin{tabular}{l|l}
\toprule
Dataset & Donor \\
\hline\hline
germ line & TSP14 \\
stromal & TSP6,  TSP7,  TSP4,  TSP5,  TSP3,  TSP10,  TSP12,  TSP9,  TSP8,  TSP14,  TSP15,  TSP1,  TSP2 \\
LargeIntestine & TSP14,  TSP2 \\
LymphNode & TSP7,  TSP14,  TSP2 \\
Skin & TSP10,  TSP14 \\
epithelial & TSP6,  TSP7,  TSP4,  TSP5,  TSP3,  TSP10,  TSP12,  TSP9,  TSP8,  TSP14,  TSP15,  TSP1,  TSP2 \\
Spleen & TSP7,  TSP14,  TSP2 \\
Vasculature & TSP14,  TSP2 \\
Kidney & TSP2 \\
SalivaryGland & TSP7,  TSP14 \\
Thymus & TSP14,  TSP2 \\
Pancreas & TSP9,  TSP1 \\
Fat & TSP10,  TSP14 \\
Bladder & TSP14,  TSP1,  TSP2 \\
endothelial & TSP6,  TSP7,  TSP4,  TSP5,  TSP3,  TSP10,  TSP12,  TSP9,  TSP8,  TSP14,  TSP15,  TSP1,  TSP2 \\
Blood & TSP7,  TSP10,  TSP8,  TSP14,  TSP1,  TSP2 \\
Trachea & TSP6,  TSP2 \\
BoneMarrow & TSP11,  TSP13,  TSP14,  TSP2 \\
Tongue & TSP7,  TSP4,  TSP14 \\
Lung & TSP14,  TSP1,  TSP2 \\
Liver & TSP6,  TSP14 \\
SmallIntestine & TSP14,  TSP2 \\
immune & TSP6,  TSP7,  TSP4,  TSP5,  TSP

In [None]:
df_results = pd.DataFrame(data)
df_results['anatomy list'] = df_results['anatomy'].apply(lambda x: x.split(','))
df_results['num. anatomy'] = df_results['anatomy list'].apply(lambda x: 0 if x[0] == 'nan' else len(x))
df_results = df_results.drop(['anatomy list', 'anatomy'], axis=1)
df_results['donor list'] = df_results['donor'].apply(lambda x: x.split(','))
df_results['num. donor'] = df_results['donor list'].apply(lambda x: 0 if x[0] == 'nan' else len(x))
df_results = df_results.drop(['donor list', 'donor'], axis=1)
df_results['compartment list'] = df_results['compartment'].apply(lambda x: x.split(','))
df_results['num. compartment'] = df_results['compartment list'].apply(lambda x: 0 if x[0] == 'nan' else len(x))
df_results = df_results.drop(['compartment list', 'compartment'], axis=1)
df_results['10X'] = df_results['methods'].apply(lambda x: '\checkmark' if '10X' in x else '')
df_results['smartseq2'] = df_results['methods'].apply(lambda x: '\checkmark' if 'smartseq2' in x else '')
df_results = df_results.drop(['methods'], axis=1)
df_results.columns = ['Dataset', '\# Cells', 'F/M', 'avg. UMI', 'med. UMI', 'std. UMI', 'avg. \# genes', 'med. \# genes', 'std. \# genes', '\# anatomy', '\# donors', '\# compartments', '10X', 'smartseq2']
df_results = df_results.applymap(format_number)

In [17]:
def format_number(x):
    if isinstance(x, (float, np.float64)):
        return f"{x:.2g}"
    return x

df_results = pd.DataFrame(data)
df_results.columns = ['Dataset', 'Number of Cells', 'F/M', 'avg. UMI', 'med. UMI', 'std. UMI', 'avg. num. genes', 'med. num. genes', 'std. num. genes', 'anatomy', 'compartment', 'assay', 'donor']
# df_results.columns = df_results.columns.str.replace('_', ' ')

# Apply number formatting
df_results = df_results.applymap(format_number)

# Convert DataFrame to LaTeX
latex_table = df_results.to_latex(index=False, escape=False, float_format="%.3f")

custom_table = latex_table.replace(
    r'\begin{tabular}{lrlllllllllll}',
    r'\begin{tabular}{l|rrrrrrrrrrrr}'
).replace(
    r'\toprule',
    r'\hline'
).replace(
    r'\midrule',
    r'\hline\hline'
).replace(
    '_', ' '
)

# Print the LaTeX table
print(custom_table)


\begin{tabular}{l|rrrrrrrrrrrr}
\hline
Dataset & Number of Cells & F/M & avg. UMI & med. UMI & std. UMI & avg. num. genes & med. num. genes & std. num. genes & anatomy & compartment & assay & donor \\
\hline\hline
germ line & 11 & M & 1.4e+04 & 1.2e+04 & 7.4e+03 & 2.8e+03 & 2.8e+03 & 1.1e+03 & nan & germ line & 10X & TSP14 \\
stromal & 82478 & 0.84 & 6.9e+04 & 1e+04 & 3.5e+05 & 3.1e+03 & 2.8e+03 & 1.5e+03 & nan, Supradiaphagmatic, Parotid, Posterior, Endometrium, Myometrium, MAT, SCAT, Atria, Ventricle, exocrine, abdomen, chest, atria, ventricle, noCornea, lacrimalgland, diaphragm, rectusabdominus, whole, anterior, posterior, Sublingual, Cornea-etc, Neuroretina-etc, Sclera-etc, Distal, Proximal, Abdomen, Chest, Anterior, Diaphragm, AortaVeneCava, CoronaryArteries, distal, proximal, MedialDistal , proxmedialdistal, Neuron, Endocrine, Exocrine, Aorta, aorta & stromal & 10X, smartseq2 & TSP6, TSP7, TSP4, TSP5, TSP3, TSP10, TSP12, TSP9, TSP8, TSP14, TSP15, TSP1, TSP2 \\
Large Intestine & 1

  df_results = df_results.applymap(format_number)


In [9]:
help(df_results.to_latex)

Help on method to_latex in module pandas.core.generic:

to_latex(buf: 'FilePath | WriteBuffer[str] | None' = None, columns: 'Sequence[Hashable] | None' = None, header: 'bool_t | list[str]' = True, index: 'bool_t' = True, na_rep: 'str' = 'NaN', formatters: 'FormattersType | None' = None, float_format: 'FloatFormatType | None' = None, sparsify: 'bool_t | None' = None, index_names: 'bool_t' = True, bold_rows: 'bool_t' = False, column_format: 'str | None' = None, longtable: 'bool_t | None' = None, escape: 'bool_t | None' = None, encoding: 'str | None' = None, decimal: 'str' = '.', multicolumn: 'bool_t | None' = None, multicolumn_format: 'str | None' = None, multirow: 'bool_t | None' = None, caption: 'str | tuple[str, str] | None' = None, label: 'str | None' = None, position: 'str | None' = None) -> 'str | None' method of pandas.core.frame.DataFrame instance
    Render object to a LaTeX tabular, longtable, or nested table.
    
    Requires ``\usepackage{{booktabs}}``.  The output can be co

In [52]:
list(gender_distribution.keys())

['male']

In [48]:
adata.obs

Unnamed: 0_level_0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ACTATTCAGGTCATCT_TSP14_Prostate_NA_10X_1_1,Prostate,10X,TSP14,,5104.0,1220,sperm,Sperm Cell,True,germ line,male
AGGTTGTCAACCCTAA_TSP14_Prostate_NA_10X_1_1,Prostate,10X,TSP14,,8547.0,2775,sperm,Sperm Cell,True,germ line,male
ATCCGTCGTGCTATTG_TSP14_Prostate_NA_10X_1_1,Prostate,10X,TSP14,,10919.0,2578,sperm,Sperm Cell,True,germ line,male
GAGCCTGAGCATGCAG_TSP14_Prostate_NA_10X_1_1,Prostate,10X,TSP14,,25324.0,4061,sperm,Sperm Cell,True,germ line,male
GGGACTCCAATGAGCG_TSP14_Prostate_NA_10X_1_1,Prostate,10X,TSP14,,19955.0,4062,sperm,Sperm Cell,True,germ line,male
TGACCCTTCATCGCCT_TSP14_Prostate_NA_10X_1_1,Prostate,10X,TSP14,,12453.0,2837,sperm,Sperm Cell,True,germ line,male
AACACACAGCGAACTG_TSP14_Prostate_NA_10X_1_2,Prostate,10X,TSP14,,7446.0,1819,sperm,Sperm Cell,True,germ line,male
AAGGAATGTATTCTCT_TSP14_Prostate_NA_10X_1_2,Prostate,10X,TSP14,,19495.0,3442,sperm,Sperm Cell,True,germ line,male
ACTATCTCATCCGAAT_TSP14_Prostate_NA_10X_1_2,Prostate,10X,TSP14,,11515.0,2667,sperm,Sperm Cell,True,germ line,male
CGGGTCAAGTATTAGG_TSP14_Prostate_NA_10X_1_2,Prostate,10X,TSP14,,6001.0,1424,sperm,Sperm Cell,True,germ line,male


In [17]:
dfData['Number of Cells'] = numCells
dfData

Unnamed: 0,Number of Cells
germ line,11
stromal,82478
Large_Intestine,13680
Lymph_Node,53275
Skin,9424
epithelial,104148
Spleen,34004
Vasculature,16037
Kidney,9641
Salivary_Gland,27199


In [45]:
# female to mail ratio
gender_distribution = adata.obs['gender'].value_counts()
fmr = gender_distribution.female / gender_distribution.male

# unique molecular identifiers
umiMean = adata.obs['n_counts_UMIs'].mean()
umiMedian = adata.obs['n_counts_UMIs'].median()
umiStd = adata.obs['n_counts_UMIs'].std()

# number of genes
nGenesMean = adata.obs['n_genes'].mean()
nGenesMedian = adata.obs['n_genes'].median()
nGenesStd = adata.obs['n_genes'].std()

# anatomical information
anatomy = adata.obs['anatomical_information'].unique()

# compartment
compartment = adata.obs['compartment'].unique()

# methods
methods = adata.obs['method'].unique()

# dataset
ds = adata.obs['donor'].unique()


In [44]:
adata.obs['compartment'].unique()

['endothelial', 'stromal', 'immune']
Categories (3, object): ['endothelial', 'immune', 'stromal']

In [31]:
adata.obs

Unnamed: 0_level_0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Empty_J24_L003,Muscle,smartseq2,TSP4,diaphragm,82831.0,2253,endothelial cell of vascular tree,endothelial cell of vascular tree,True,endothelial,female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B12_L003,Muscle,smartseq2,TSP4,diaphragm,3531175.0,3896,endothelial cell of vascular tree,endothelial cell of vascular tree,True,endothelial,female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B13_L003,Muscle,smartseq2,TSP4,diaphragm,1332680.0,1487,capillary endothelial cell,capillary endothelial cell,True,endothelial,female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B14_L003,Muscle,smartseq2,TSP4,diaphragm,37120.0,502,endothelial cell of lymphatic vessel,endothelial cell of lymphatic vessel,True,endothelial,female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B15_L003,Muscle,smartseq2,TSP4,diaphragm,2391365.0,3262,endothelial cell of lymphatic vessel,endothelial cell of lymphatic vessel,True,endothelial,female
...,...,...,...,...,...,...,...,...,...,...,...
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_O6_S342,Muscle,smartseq2,TSP2,rectusabdominus,410010.0,1669,mesenchymal stem cell,mesenchymal stem cell,True,stromal,female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P18_S378,Muscle,smartseq2,TSP2,rectusabdominus,1547776.0,2212,mesenchymal stem cell,mesenchymal stem cell,True,stromal,female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P21_S381,Muscle,smartseq2,TSP2,rectusabdominus,99507.0,899,skeletal muscle satellite stem cell,skeletal muscle satellite stem cell,True,stromal,female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P7_S367,Muscle,smartseq2,TSP2,rectusabdominus,200210.0,726,skeletal muscle satellite stem cell,skeletal muscle satellite stem cell,True,stromal,female


In [46]:
adata.var

Unnamed: 0,gene_symbol,feature_type,ensemblid,highly_variable,means,dispersions,dispersions_norm,mean,std
DDX11L1,DDX11L1,Gene Expression,ENSG00000223972.5,False,6.398244e-05,0.835044,-0.573947,0.000039,0.005574
WASH7P,WASH7P,Gene Expression,ENSG00000227232.5,False,2.274395e-03,2.442280,0.533203,0.001080,0.031731
MIR6859-1,MIR6859-1,Gene Expression,ENSG00000278267.1,False,6.175251e-05,1.295335,-0.256874,0.000033,0.005634
MIR1302-2HG,MIR1302-2HG,Gene Expression,ENSG00000243485.5,False,1.372886e-04,2.656352,0.680668,0.000048,0.008041
MIR1302-2,MIR1302-2,Gene Expression,ENSG00000284332.1,False,1.000000e-12,,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...
MT-ND6,MT-ND6,Gene Expression,ENSG00000198695.2,False,9.634841e-01,2.466404,0.154140,0.590065,0.741395
MT-TE,MT-TE,Gene Expression,ENSG00000210194.1,False,1.600667e-01,1.603787,-0.044396,0.083929,0.301820
MT-CYB,MT-CYB,Gene Expression,ENSG00000198727.2,False,4.367693e+00,4.765751,-0.499747,3.874830,1.104192
MT-TT,MT-TT,Gene Expression,ENSG00000210195.2,False,6.573967e-02,0.624316,-0.719108,0.040580,0.186848


In [24]:
adata.obs['gender']  # ['compartment']

cell_id
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Empty_J24_L003                   female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B12_L003             female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B13_L003             female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B14_L003             female
TSP4_Muscle_diaphragm_SS2_B112813_B134043_Endothelial_B15_L003             female
                                                                            ...  
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_O6_S342     female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P18_S378    female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P21_S381    female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P7_S367     female
TSP2_Muscle_rectusabdominus_SS2_B114661_B133087_MuscleStemCell_P8_S368     female
Name: gender, Length: 30746, dtype: category
Categories (2, object): ['female', 'male']

In [15]:
dfRep['Reference'] = ['','','','']
dfRep['Dimension'] = ['', 512, '', '']
dfRep['Input']     = ['NA', 'Rank Order', 'Rank Order', 'Real Value']
dfRep