# Aim of the notebook

Importing a dataset to use for simulated CNVs to test technical limitations 

Import packages

In [2]:
import cellxgene_census
import seaborn as sns
import pandas as pd
import numpy as np
import scanpy as sc
import os
import matplotlib.pyplot as plt

Extract cell metadata from census [2024-07-01]

In [3]:
with cellxgene_census.open_soma() as census:
    # Reads SOMADataFrame as a slice
    cell_metadata_master = census["census_data"]["homo_sapiens"].obs.read(
        value_filter = "disease == 'normal'",
        column_names = ["assay", "cell_type", "tissue", "tissue_general", "donor_id", "disease","dataset_id"])
    # Concatenates results to pyarrow.Table
    cell_metadata_master = cell_metadata_master.concat()
    # Converts to pandas.DataFrame
    cell_metadata_master = cell_metadata_master.to_pandas()

The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.


In [5]:
# Filter for specific criteria
cell_metadata_filtered = cell_metadata_master[
    (cell_metadata_master['assay'] == "10x 3' v3") &
    (cell_metadata_master['disease'] == 'normal')
]

In [9]:
set(cell_metadata_filtered.cell_type.unique())

{'A2 amacrine cell',
 'B cell',
 'BEST4+ intestinal epithelial cell, human',
 'Bergmann glial cell',
 'CD14-low, CD16-positive monocyte',
 'CD14-positive monocyte',
 'CD14-positive, CD16-positive monocyte',
 'CD141-positive myeloid dendritic cell',
 'CD16-negative, CD56-bright natural killer cell, human',
 'CD16-positive, CD56-dim natural killer cell, human',
 'CD1c-positive myeloid dendritic cell',
 'CD4-positive helper T cell',
 'CD4-positive, alpha-beta T cell',
 'CD4-positive, alpha-beta cytotoxic T cell',
 'CD4-positive, alpha-beta memory T cell',
 'CD8-alpha-alpha-positive, alpha-beta intraepithelial T cell',
 'CD8-positive, alpha-beta T cell',
 'CD8-positive, alpha-beta cytokine secreting effector T cell',
 'CD8-positive, alpha-beta cytotoxic T cell',
 'CD8-positive, alpha-beta memory T cell',
 'CD8-positive, alpha-beta memory T cell, CD45RO-positive',
 'DN1 thymic pro-T cell',
 'DN3 thymocyte',
 'DN4 thymocyte',
 'GABAergic amacrine cell',
 'GABAergic interneuron',
 'GABAergic 

In [10]:
# Function to check one unique donor_id and at least two distinct cell types
def filter_by_donor_and_celltype(df):
    return (df['donor_id'].nunique() == 1) and (df['cell_type'].nunique() >= 2)

# Apply group filtering by dataset_id
final_datasets = cell_metadata_filtered.groupby('dataset_id').filter(filter_by_donor_and_celltype)


  final_datasets = cell_metadata_filtered.groupby('dataset_id').filter(filter_by_donor_and_celltype)


In [11]:
set(final_datasets.donor_id.unique())

{'H11',
 'H18.30.002',
 'H19.30.001',
 'H19.30.002',
 'TSP12',
 'TSP2',
 'TSP4',
 'healthy_6',
 'manaab_H71',
 'pooled'}

Extract only the cells from the latest 10x 3' v3 chromium assay

In [5]:
cell_metadata=cell_metadata[cell_metadata['assay']=="10x 3' v3"]

To avoid batch effect handlings, select cells from one dataset

Overview

In [7]:
ctb=pd.crosstab(cell_metadata['dataset_id'],cell_metadata['donor_id'])
mtd=cell_metadata.groupby(['dataset_id','donor_id']).count()
mtdsub=mtd[mtd['assay']>8000]
mtdsub=mtdsub[mtdsub['assay']<20000]
sel_dataset=[e[0] for e in mtdsub.index]
sel_donor=[e[1] for e in mtdsub.index]
cell_metadata=cell_metadata[cell_metadata['donor_id'].isin(sel_donor)]
cell_metadata=cell_metadata[cell_metadata['dataset_id'].isin(sel_dataset)]

  mtd=cell_metadata.groupby(['dataset_id','donor_id']).count()


In [17]:
len(cell_metadata['dataset_id'].unique())

237

In [23]:
cell_metadata.shape

(20172405, 7)

In [24]:
ctct=pd.crosstab(cell_metadata['dataset_id'],cell_metadata['cell_type'])

In [25]:
ctct

cell_type,B cell,Bergmann glial cell,"CD14-low, CD16-positive monocyte",CD14-positive monocyte,"CD14-positive, CD16-negative classical monocyte","CD14-positive, CD16-positive monocyte",CD141-positive myeloid dendritic cell,"CD16-negative, CD56-bright natural killer cell, human",CD1c-positive myeloid dendritic cell,CD4-positive helper T cell,...,type I pneumocyte,type II pneumocyte,type L enteroendocrine cell,unknown,unswitched memory B cell,uterine smooth muscle cell,vascular associated smooth muscle cell,vascular leptomeningeal cell,vein endothelial cell,vip GABAergic cortical interneuron
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00476f9f-ebc1-4b72-b541-32f912ce36ea,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,11,0,0,0
00e5dedd-b9b7-43be-8c28-b0e5c6414a62,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0129dbd9-a7d3-4f6b-96b9-1da155a93748,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0325478a-9b52-45b5-b40a-2e2ab0d72eb1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03c544fb-a103-4d18-9230-eae9cfee3af2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fbf173f9-f809-4d84-9b65-ae205d35b523,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0
fe4b89d5-461e-440c-a5a8-621b37b122c0,0,0,0,0,0,0,0,0,0,0,...,0,0,90,0,0,0,0,0,0,0
fe52003e-1460-4a65-a213-2bb1a508332f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2512,0,0,0,0,0,0


In [26]:
#We filter for datasets with at least two cell types
ctct=ctct[np.sum(ctct>10,axis=1)>2]
# and  maximum 30 cell types
ctct=ctct[np.sum(ctct>10,axis=1)<30]
# we get the metadata for this cells
cell_metadata=cell_metadata[cell_metadata['dataset_id'].isin(ctct.index)]

In [27]:
cell_metadata

Unnamed: 0,assay,cell_type,tissue,tissue_general,donor_id,disease,dataset_id
402735,10x 3' v3,retinal rod cell,macula lutea,eye,donor_2,normal,856c1b98-5727-49da-bf0f-151bdb8cb056
402736,10x 3' v3,retinal rod cell,macula lutea,eye,donor_2,normal,856c1b98-5727-49da-bf0f-151bdb8cb056
402737,10x 3' v3,amacrine cell,macula lutea,eye,donor_2,normal,856c1b98-5727-49da-bf0f-151bdb8cb056
402738,10x 3' v3,macroglial cell,macula lutea,eye,donor_2,normal,856c1b98-5727-49da-bf0f-151bdb8cb056
402739,10x 3' v3,retinal bipolar neuron,macula lutea,eye,donor_2,normal,856c1b98-5727-49da-bf0f-151bdb8cb056
...,...,...,...,...,...,...,...
74322505,10x 3' v3,pvalb GABAergic cortical interneuron,dorsolateral prefrontal cortex,brain,H21.33.044,dementia,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322506,10x 3' v3,vip GABAergic cortical interneuron,dorsolateral prefrontal cortex,brain,H20.33.018,dementia,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322507,10x 3' v3,L2/3-6 intratelencephalic projecting glutamate...,dorsolateral prefrontal cortex,brain,H21.33.003,normal,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322508,10x 3' v3,astrocyte of the cerebral cortex,dorsolateral prefrontal cortex,brain,H21.33.019,normal,6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3


In [29]:
cell_metadata.shape

(15818764, 7)

We now aim to select only one of the donors of the experiment with higher number of cells

In [30]:
dataset_id_used=[]
dataset_id_selected=[]
donor_id_selected=[]

ctb=pd.crosstab(cell_metadata['dataset_id'],cell_metadata['donor_id'])
mtd=cell_metadata.groupby(['dataset_id','donor_id']).count()

for ind in mtd.index:
    if ind[0] not in dataset_id_used:
        dataset_id_used.append(ind[0])
        dataset_id_selected.append(ind[0])
        donor_id_selected.append(ind[1])

datasets_to_use=pd.DataFrame([dataset_id_selected,donor_id_selected],index=['dataset_id','donor_id']).transpose()

  mtd=cell_metadata.groupby(['dataset_id','donor_id']).count()


In [31]:
datasets_to_use

Unnamed: 0,dataset_id,donor_id
0,0041b9c3-6a49-4bf7-8514-9bc7190067a7,#105
1,00476f9f-ebc1-4b72-b541-32f912ce36ea,#105
2,00e5dedd-b9b7-43be-8c28-b0e5c6414a62,#105
3,00ff600e-6e2e-4d76-846f-0eec4f0ae417,#105
4,01209dce-3575-4bed-b1df-129f57fbc031,#105
...,...,...
673,fe1a73ab-a203-45fd-84e9-0f7fd19efcbd,#105
674,fe4b89d5-461e-440c-a5a8-621b37b122c0,#105
675,fe52003e-1460-4a65-a213-2bb1a508332f,#105
676,ff45e623-7f5f-46e3-b47d-56be0341f66b,#105


In [55]:
cell_metadata['dataset_id']

402735      856c1b98-5727-49da-bf0f-151bdb8cb056
402736      856c1b98-5727-49da-bf0f-151bdb8cb056
402737      856c1b98-5727-49da-bf0f-151bdb8cb056
402738      856c1b98-5727-49da-bf0f-151bdb8cb056
402739      856c1b98-5727-49da-bf0f-151bdb8cb056
                            ...                 
74322505    6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322506    6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322507    6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322508    6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
74322509    6f7fd0f1-a2ed-4ff1-80d3-33dde731cbc3
Name: dataset_id, Length: 15818764, dtype: category
Categories (678, object): ['0041b9c3-6a49-4bf7-8514-9bc7190067a7', '00476f9f-ebc1-4b72-b541-32f912ce36ea', '00e5dedd-b9b7-43be-8c28-b0e5c6414a62', '00ff600e-6e2e-4d76-846f-0eec4f0ae417', ..., 'fe4b89d5-461e-440c-a5a8-621b37b122c0', 'fe52003e-1460-4a65-a213-2bb1a508332f', 'ff45e623-7f5f-46e3-b47d-56be0341f66b', 'ff7d15fa-f4b6-4a0e-992e-fd0c9d088ded']

In [36]:
len(datasets_to_use['dataset_id'].unique())

678

In [62]:
cell_metadata[cell_metadata['donor_id'] == '#105']

Unnamed: 0,assay,cell_type,tissue,tissue_general,donor_id,disease,dataset_id
14121878,10x 3' v3,cerebellar granule cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14121879,10x 3' v3,cerebellar granule cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14121880,10x 3' v3,cerebellar granule cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14121881,10x 3' v3,Bergmann glial cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14121882,10x 3' v3,cerebellar granule cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
...,...,...,...,...,...,...,...
14163130,10x 3' v3,oligodendrocyte,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14163131,10x 3' v3,unknown,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14163132,10x 3' v3,macroglial cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd
14163133,10x 3' v3,macroglial cell,hemisphere part of cerebellar posterior lobe,brain,#105,normal,bab7432a-5cfe-45ea-928c-422d03c45cdd


In [61]:
for cell_type in cell_metadata[cell_metadata['donor_id'] == '#105']['cell_type'].unique():
    print(cell_type)
    print(cell_metadata[cell_metadata['cell_type'] == cell_type].shape[0])

cerebellar granule cell
31221
Bergmann glial cell
9082
interneuron
903
unknown
117286
oligodendrocyte
802360
immature astrocyte
2
differentiation-committed oligodendrocyte precursor
571
meningeal macrophage
20
brain vascular cell
4490
T cell
92838
macroglial cell
5887
leukocyte
31793
microglial cell
165816


In [None]:
'oligodendrocyte'
'T cell'


In [75]:
import cellxgene_census

with cellxgene_census.open_soma() as census:
    adata = cellxgene_census.get_anndata(
        census = census,
        organism = "Homo sapiens",
        var_value_filter = "feature_id in ['ENSG00000161798', 'ENSG00000188229']",
        obs_value_filter = "donor_id == '#105' and dataset_id == 'bab7432a-5cfe-45ea-928c-422d03c45cdd' and cell_type in ['T cell', 'oligodendrocyte']",
        column_names = {"obs": ["assay", "cell_type", "tissue", "tissue_general", "disease"]},
    )

    print(adata)

The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.
  adata = cellxgene_census.get_anndata(


AnnData object with n_obs × n_vars = 357 × 2
    obs: 'assay', 'cell_type', 'tissue', 'tissue_general', 'disease', 'donor_id', 'dataset_id'
    var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'


In [79]:
adata.obs.cell_type.unique()

['oligodendrocyte', 'T cell']
Categories (698, object): ['A2 amacrine cell', 'B cell', 'B-1 B cell', 'B-1a B cell', ..., 'vein endothelial cell', 'ventricular cardiac muscle cell', 'vip GABAergic cortical interneuron', 'visceromotor neuron']

In [80]:
adata.var

Unnamed: 0,soma_joinid,feature_id,feature_name,feature_length,nnz,n_measured_obs
0,10507,ENSG00000161798,AQP5,1884,1226640,68915280
1,16091,ENSG00000188229,TUBB4B,2037,26463689,73806975


In [None]:
i

In [91]:
census = cellxgene_census.get_census_version("latest")

AttributeError: module 'cellxgene_census' has no attribute 'get_census_version'

In [102]:
census.metadata()  # adjust based on actual dataset fetching method

TypeError: 'MetadataWrapper' object is not callable

In [100]:
dir(census)

['_MutableMapping__marker',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__contains__',
 '__del__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_add_new_element',
 '_add_new_ndarray',
 '_check_allows_child',
 '_check_open_read',
 '_close_stack',
 '_contents',
 '_contents_lines',
 '_del_element',
 '_handle',
 '_mutated_keys',
 '_my_repr',
 '_new_child_uri',
 '_set_element',
 '_subclass_constrained_soma_types',
 '_wrapper_type',
 'add_new_collection',
 'add_ne

In [None]:
# Filter for specific criteria
filtered_datasets = datasets[
    (datasets['assay'] == 'v3') &
    (datasets['disease'] == 'normal')
]

# Function to check one unique donor_id and at least two distinct cell types
def filter_by_donor_and_celltype(df):
    return (df['donor_id'].nunique() == 1) and (df['cell_type'].nunique() >= 2)

# Apply group filtering by dataset_id
final_datasets = filtered_datasets.groupby('dataset_id').filter(filter_by_donor_and_celltype)

# Display the result
print(final_datasets[['dataset_id', 'donor_id', 'cell_type']])

In [98]:
census = cellxgene_census.open_soma() 

The "stable" release is currently 2024-07-01. Specify 'census_version="2024-07-01"' in future calls to open_soma() to ensure data consistency.


In [96]:
dir(cellxgene_census.open_soma)

['__annotations__',
 '__builtins__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__getstate__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__type_params__']