# Pseudobulk the Retina (HRCA) cell atlas data

In [1]:
import scanpy as sc
import pandas as pd
pd.options.display.max_columns = None

  from pandas.core.computation.check import NUMEXPR_INSTALLED


## Load data

In [2]:
ad = sc.read('cellxgene-retina-snRNA-seq.h5ad') # from https://cellxgene.cziscience.com/collections/4c6eaf5c-6d57-4c76-b1e9-60df8c655f1e sn all cells
ad = ad.raw.to_adata()

AnnData object with n_obs × n_vars = 1775529 × 60042
    obs: 'mapped_reference_assembly', 'mapped_reference_annotation', 'alignment_software', 'donor_id', 'donor_age', 'self_reported_ethnicity_ontology_term_id', 'donor_cause_of_death', 'donor_living_at_sample_collection', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_derivation_process', 'sample_source', 'tissue_type', 'suspension_derivation_process', 'suspension_dissociation_reagent', 'suspension_enriched_cell_types', 'suspension_enrichment_factors', 'suspension_uuid', 'suspension_type', 'tissue_handling_interval', 'library_uuid', 'assay_ontology_term_id', 'library_starting_quantity', 'sequencing_platform', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'majorclass', 'cluster2', 'AC_group1', 'BC_marker', 'BC_group1', '

In [13]:
ident_cols = ['cell_type', 'majorclass', 'donor_id', 'sex', 'donor_age', 'sample_uuid', 'tissue', 'sample_source', 'study_name', 'development_stage']

In [14]:
ad.obs = ad.obs[ident_cols]

## Filter

In [17]:
sc.pp.filter_genes(ad, min_cells=50)

AnnData object with n_obs × n_vars = 1775529 × 39871
    obs: 'cell_type', 'majorclass', 'donor_id', 'sex', 'donor_age', 'sample_uuid', 'tissue', 'sample_source', 'study_name', 'development_stage'
    var: 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'n_cells'
    uns: 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_scVI', 'X_umap'

In [19]:
ad.var.index = ad.var.feature_name

In [20]:
ad.var.index.name = None

In [21]:
ad.var = ad.var.iloc[:, 0:0]

AnnData expects .var.index to contain strings, but got values like:
    ['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'C1orf112']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [25]:
ad.obs.donor_id = ad.obs.donor_id.astype(str) + '-' + ad.obs.sex.astype(str) + '-' + ad.obs.donor_age.astype(str)

In [26]:
ad.obs.drop(columns=['sex', 'donor_age'], inplace=True)

In [28]:
ident_cols = ['cell_type', 'donor_id', 'sample_uuid', 'tissue', 'study_name']
ad.obs = ad.obs[ident_cols]

## Pseudobulk

In [31]:
adp = sc.get.aggregate(ad, ad.obs.columns, func='sum')

In [32]:
adp.X = adp.layers['sum']
del adp.layers['sum']

## Add cell counts

In [35]:
counts = ad.obs[ident_cols].value_counts().reset_index()
counts = counts.rename(columns={'count': 'n_cells'})

In [36]:
adp.obs = adp.obs.merge(counts, how='left')

In [38]:
adp = adp[adp.obs.n_cells>=10].copy()



AnnData object with n_obs × n_vars = 1560 × 39871
    obs: 'cell_type', 'donor_id', 'sample_uuid', 'tissue', 'study_name', 'n_cells'

## Save

In [39]:
adp.write('retina-snrna-seq-atlas.h5ad')