# Pseudobulk the HLCA (Human Lung Cell Atlas) data

In [None]:
import scanpy as sc
import pandas as pd

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## Load the data

In [None]:
# Source file: https://data.humancellatlas.org/hca-bio-networks/lung/atlases/lung-v1-0
# (full atlas = core + extension; 2,282,447 cells).
# Read the atlas and immediately switch to the matrix stored in `.raw`.
ad = sc.read('hlca.h5ad').raw.to_adata()

## Filtering 

In [None]:
# Keep only 10x assays run on whole cells with 3' chemistry.
is_10x = ad.obs['assay'].str.startswith('10x')
is_cell_suspension = ad.obs['suspension_type'] == 'cell'
is_three_prime = ad.obs["3'_or_5'"] == "3'"
ad = ad[is_10x & is_cell_suspension & is_three_prime].copy()

In [None]:
# Discard cells whose level-4 annotation is the literal label 'Unknown' or 'None'.
placeholder_label = ad.obs['ann_level_4'].isin(['Unknown', 'None'])
ad = ad[~placeholder_label].copy()

In [None]:
# Discard cells with a missing (NaN) level-4 annotation.
has_label = ad.obs['ann_level_4'].notna()
ad = ad[has_label].copy()

In [None]:
# Build a combined tissue label: "<tissue>-<tissue_level_2>", or just "<tissue>"
# when tissue_level_2 stringifies to 'nan' (i.e. is missing).
tissue_level_2 = ad.obs['tissue_level_2'].astype(str)
tissue_suffix = ('-' + tissue_level_2).where(tissue_level_2 != 'nan', '')
ad.obs['tissue'] = ad.obs['tissue'].astype(str) + tissue_suffix

In [None]:
# Make the donor identifier sample-specific: "<donor_id>-<sample>".
donor_label = ad.obs['donor_id'].astype(str)
sample_label = ad.obs['sample'].astype(str)
ad.obs['donor_id'] = donor_label + '-' + sample_label

In [None]:
# Columns whose unique combinations define one pseudobulk sample.
ident_cols = [
    'donor_id',
    'ann_level_4',
    'dataset',
    'lung_condition',
    'tissue',
]

In [None]:
# Drop every cell annotation except the pseudobulk identifier columns.
ad.obs = ad.obs.loc[:, ident_cols].copy()

In [None]:
# Cast each identifier column to plain strings in one pass.
ad.obs = ad.obs.astype({col: str for col in ident_cols})

In [None]:
# Convert string-valued annotation columns to categoricals. Use the public
# AnnData method: `_sanitize` is only a private backwards-compatibility alias
# for it and should not be relied upon.
ad.strings_to_categoricals()

## Now the pseudobulking

In [None]:
# Count cells per pseudobulk group, i.e. per unique combination of ident_cols.
# The count column is named directly via reset_index(name=...): value_counts()
# labels it 'count' only on pandas >= 2.0 (older releases use 0), so a rename
# keyed on 'count' would silently leave 'n_cells' undefined there.
counts = ad.obs[ident_cols].value_counts().reset_index(name='n_cells')

In [None]:
# Look up each cell's group size (the left merge keeps the cells' row order)
# and keep only cells that belong to a group of at least 10 cells.
cells_with_group_size = ad.obs.merge(counts, how='left')
in_large_group = cells_with_group_size['n_cells'] >= 10
ad = ad[in_large_group].copy()

In [None]:
# Remove genes detected in fewer than this many cells (in-place on `ad`).
MIN_CELLS_PER_GENE = 50
sc.pp.filter_genes(ad, min_cells=MIN_CELLS_PER_GENE)

In [None]:
# Preserve the current var index in a 'gene_id' column, then index the genes by
# their 'feature_name' instead, leaving the new index unnamed.
ad.var['gene_id'] = ad.var_names
ad.var.index = ad.var['feature_name'].rename(None)

In [None]:
# Recompute the per-group cell counts on the filtered data; these are attached
# to the pseudobulk samples later. The count column is named directly via
# reset_index(name=...): value_counts() labels it 'count' only on pandas >= 2.0
# (older releases use 0), so a rename keyed on 'count' would be a silent no-op.
counts = ad.obs[ident_cols].value_counts().reset_index(name='n_cells')

In [None]:
# Sum the expression matrix over all cells sharing the same identifier
# combination; the result is stored in the 'sum' layer of the new object.
adp = sc.get.aggregate(ad, by=ident_cols, func='sum')

In [None]:
# Use the summed counts, cast to integers, as the main matrix.
summed_counts = adp.layers['sum']
adp.X = summed_counts.astype(int)

In [None]:
# adp.X was filled from this layer in the previous cell, so remove the
# now-redundant 'sum' layer from the object.
del adp.layers['sum']

### Add cell counts

In [None]:
# Attach the number of cells behind each pseudobulk sample.
# `merge` returns a frame with a fresh 0..n-1 index, so assigning its result to
# `adp.obs` would throw away the obs_names created by the aggregation (and may
# alter key column dtypes). Instead, look the counts up with a left merge (which
# keeps the row order of adp.obs) and add only the new column positionally.
# `counts` has one row per identifier combination, hence many-to-one.
n_cells_per_sample = adp.obs.merge(counts, how='left', validate='many_to_one')['n_cells']
adp.obs['n_cells'] = n_cells_per_sample.to_numpy()

In [None]:
# Give the annotation columns shorter, generic names for downstream use.
obs_column_names = {'ann_level_4': 'celltype', 'lung_condition': 'condition'}
adp.obs = adp.obs.rename(columns=obs_column_names)

## Save

In [None]:
# Rebind adp to a full copy of itself before writing.
# NOTE(review): presumably done to get a clean, self-contained object prior to
# saving — confirm whether this copy is still required.
adp = adp.copy()

In [None]:
# Write the pseudobulked object to disk in the working directory.
output_path = 'lung-pseudobulk.h5ad'
adp.write(output_path)