# TS Fibroblast Data Engineering

**This notebook builds one `.h5ad` file containing all of the Fibroblast data found in the TS database.**

Author: [Joshua Pickard](mailto:jpic@umich.edu)

Date: August 29, 2024

In [1]:
import scanpy as sp
import numpy as np
import pandas as pd
import scanpy as sp
import os

In [2]:
# Input location: the full-atlas file name and the directory that contains it.
# The two are joined with os.path.join when the atlas is read.
FILE = "TabulaSapiens.h5ad"
DATAPATH = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/extract/"

In [3]:
# Read the complete atlas into memory; every subset below is sliced from `adata`.
atlas_path = os.path.join(DATAPATH, FILE)
adata = sp.read_h5ad(atlas_path)

In [4]:
# Sanity check: dimensions of the loaded atlas as (n_obs, n_vars)
adata.shape

(483152, 58870)

# Fibroblasts

## Build the new File

In [16]:
# The two annotation columns that may label a cell as a fibroblast
ontology_labels = adata.obs['cell_ontology_class']
free_labels = adata.obs['free_annotation']

# Every distinct label containing 'fib' (case-insensitive), per column
fib_entries_1 = [label for label in ontology_labels.unique() if 'fib' in label.lower()]
fib_entries_2 = [label for label in free_labels.unique() if 'fib' in label.lower()]

# Keep a cell when EITHER column carries one of its matched labels
is_fibroblast = ontology_labels.isin(fib_entries_1) | free_labels.isin(fib_entries_2)
fibroblast_adata = adata[is_fibroblast].copy()

print(fibroblast_adata)


AnnData object with n_obs × n_vars = 38151 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'


In [17]:
# Destination of the fibroblast-only subset
output_dir = "/nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic"
output_file = "fibroblast.h5ad"
output_path = os.path.join(output_dir, output_file)

# Persist the filtered AnnData object, then report where it went
fibroblast_adata.write(output_path)

print(f"File saved to: {output_path}")


File saved to: /nfs/turbo/umms-indikar/shared/projects/DGC/data/tabula_sapiens/jpic/fibroblast.h5ad


# Neurons

## Build New File

In [18]:
# The two annotation columns that may label a cell as a neuron
neur_ontology = adata.obs['cell_ontology_class']
neur_free = adata.obs['free_annotation']

# Every distinct label containing 'neur' (case-insensitive), per column
neur_entries_1 = [label for label in neur_ontology.unique() if 'neur' in label.lower()]
neur_entries_2 = [label for label in neur_free.unique() if 'neur' in label.lower()]

# Keep a cell when EITHER column carries one of its matched 'neur' labels
is_neuron = neur_ontology.isin(neur_entries_1) | neur_free.isin(neur_entries_2)
neuron_adata = adata[is_neuron].copy()

print(neuron_adata)


AnnData object with n_obs × n_vars = 34 × 58870
    obs: 'organ_tissue', 'method', 'donor', 'anatomical_information', 'n_counts_UMIs', 'n_genes', 'cell_ontology_class', 'free_annotation', 'manually_annotated', 'compartment', 'gender'
    var: 'gene_symbol', 'feature_type', 'ensemblid', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: '_scvi', '_training_mode', 'dendrogram_cell_type_tissue', 'dendrogram_computational_compartment_assignment', 'dendrogram_consensus_prediction', 'dendrogram_tissue_cell_type', 'donor_colors', 'donor_method_colors', 'hvg', 'method_colors', 'neighbors', 'organ_tissue_colors', 'sex_colors', 'tissue_colors', 'umap'
    obsm: 'X_pca', 'X_scvi', 'X_scvi_umap', 'X_umap'
    layers: 'decontXcounts', 'raw_counts'
    obsp: 'connectivities', 'distances'


In [19]:
# 'cell_ontology_class' labels that matched 'neur'
neur_entries_1

['retinal bipolar neuron']

In [20]:
# 'free_annotation' labels that matched 'neur'
neur_entries_2

[]

# Osteoblasts

In [None]:
# Osteoblast subset.
#
# This cell uses its own variable names (osteo_* / osteoblast_adata) so that it
# neither overwrites the neuron results nor depends on lists left behind by the
# neuron cell.
# The search stem is the singular 'osteoblast': it matches both singular and
# plural labels, whereas 'osteoblasts' would miss labels such as 'osteoblast'.

# Every distinct label containing 'osteoblast' (case-insensitive) in 'cell_ontology_class'
osteo_entries_1 = [entry for entry in adata.obs['cell_ontology_class'].unique() if 'osteoblast' in entry.lower()]

# Every distinct label containing 'osteoblast' (case-insensitive) in 'free_annotation'
osteo_entries_2 = [entry for entry in adata.obs['free_annotation'].unique() if 'osteoblast' in entry.lower()]

# Keep a cell when EITHER column carries one of its matched osteoblast labels
osteoblast_adata = adata[
    adata.obs['cell_ontology_class'].isin(osteo_entries_1) |
    adata.obs['free_annotation'].isin(osteo_entries_2)
].copy()

print(osteoblast_adata)


In [25]:
# Distinct values of the 'organ_tissue' column, as a plain Python list
list(adata.obs['organ_tissue'].unique())

['Liver',
 'Trachea',
 'Blood',
 'Lymph_Node',
 'Salivary_Gland',
 'Spleen',
 'Tongue',
 'Mammary',
 'Uterus',
 'Eye',
 'Fat',
 'Skin',
 'Bone_Marrow',
 'Heart',
 'Pancreas',
 'Prostate',
 'Muscle',
 'Thymus',
 'Bladder',
 'Large_Intestine',
 'Lung',
 'Small_Intestine',
 'Vasculature',
 'Kidney']

## Scratch

In [7]:
# Scratch: distinct values of the 'cell_ontology_class' column
adata.obs['cell_ontology_class'].unique()

['macrophage', 'monocyte', 'endothelial cell of hepatic sinusoid', 'liver dendritic cell', 'nk cell', ..., 'pancreatic delta cell', 'duodenum glandular cell', 'immature natural killer cell', 'thymocyte', 'connective tissue cell']
Length: 177
Categories (177, object): ['acinar cell of salivary gland', 'adipocyte', 'adventitial cell', 'alveolar fibroblast', ..., 'type i pneumocyte', 'type ii pneumocyte', 'vascular associated smooth muscle cell', 'vein endothelial cell']

In [12]:
# Scratch: which 'cell_ontology_class' labels mention 'fib' (case-insensitive)?
ontology_classes = adata.obs['cell_ontology_class'].unique()
fib_entries = [label for label in ontology_classes if 'fib' in label.lower()]

print(fib_entries)


['fibroblast', 'fibroblast of breast', 'myofibroblast cell', 'fibroblast of cardiac tissue', 'alveolar fibroblast']


In [13]:
# Scratch: subset using ONLY the 'cell_ontology_class' column
ontology_column = adata.obs['cell_ontology_class']

# Labels in that column that mention 'fib' (case-insensitive)
fib_entries = [label for label in ontology_column.unique() if 'fib' in label.lower()]

# Cells whose ontology class is one of the matched labels
fibroblast_adata = adata[ontology_column.isin(fib_entries)].copy()

fibroblast_adata.shape


(37709, 58870)

In [9]:
# Scratch: per-cell metadata table (one row per cell)
adata.obs

Unnamed: 0,organ_tissue,method,donor,anatomical_information,n_counts_UMIs,n_genes,cell_ontology_class,free_annotation,manually_annotated,compartment,gender
AAACCCACACTCCTGT_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,7633.0,2259,macrophage,Monocyte/Macrophage,True,immune,male
AAACGAAGTACCAGAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,2858.0,1152,monocyte,Monocyte,True,immune,male
AAACGCTCAACGGCTC_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,7787.0,2983,endothelial cell of hepatic sinusoid,Endothelial,True,endothelial,male
AAAGAACAGCCTCTTC_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,10395.0,2598,macrophage,Monocyte/Macrophage,True,immune,male
AAAGAACGTAGCACAG_TSP6_Liver_NA_10X_1_1,Liver,10X,TSP6,,6610.0,2125,liver dendritic cell,Dendritic cell,True,immune,male
...,...,...,...,...,...,...,...,...,...,...,...
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P4_S364,Vasculature,smartseq2,TSP2,aorta,13205.0,579,endothelial cell,endothelial cell,True,endothelial,female
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P5_S365,Vasculature,smartseq2,TSP2,aorta,9565.0,529,endothelial cell,endothelial cell,True,endothelial,female
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P7_S367,Vasculature,smartseq2,TSP2,aorta,195639.0,2753,endothelial cell,endothelial cell,True,endothelial,female
TSP2_Vasculature_aorta_SS2_B114577_B133059_Endothelial_P8_S368,Vasculature,smartseq2,TSP2,aorta,37260.0,984,endothelial cell,endothelial cell,True,endothelial,female


In [14]:
# Scratch: which 'free_annotation' labels mention 'fib' (case-insensitive)?
free_annotations = adata.obs['free_annotation'].unique()
fib_entries = [label for label in free_annotations if 'fib' in label.lower()]

print(fib_entries)


['Stellate/Fibroblast', 'fibroblast', 'Fibroblast', 'Adventitial fibroblast', 'Fibroblasts (Lipofibroblasts)', 'Uterine fibroblast', 'Endometrial stromal fibbroblast', 'fibroblasts', 'Fibroblasts', 'Myofibroblasts', 'Cardiac Fibroblast', 'Adventitial Fibroblast', 'myofibroblast cell', 'alveolar fibroblast']


In [15]:
# Scratch: subset using ONLY the 'free_annotation' column
free_column = adata.obs['free_annotation']

# Labels in that column that mention 'fib' (case-insensitive)
fib_entries = [label for label in free_column.unique() if 'fib' in label.lower()]

# Cells whose free annotation is one of the matched labels
fibroblast_adata = adata[free_column.isin(fib_entries)].copy()

fibroblast_adata.shape


(38151, 58870)