# Pseudobulk the Heart atlas data

In [None]:
import scanpy as sc
import pandas as pd

# Show every column when a DataFrame is displayed (None = no column limit).
display_options = pd.options.display
display_options.max_columns = None

## Load the raw data

In [None]:
# Input: the "Heart Global" raw object, downloaded from
# https://www.heartcellatlas.org/
raw_h5ad_path = 'Global_raw.h5ad'
ad = sc.read(raw_h5ad_path)

In [None]:
# Store the region labels as plain strings
# (presumably so the relabelling below can assign new values - confirm).
region_labels = ad.obs['region_finest']
ad.obs['region_finest'] = region_labels.astype(str)

In [None]:
# Exact-match relabelling of region names: old label -> new label.
region_renames = {
    'na': 'SAN_unknown',
    'IVS MID LV': 'SP IVS MID LV',
    'IVS MID RV': 'SP IVS MID RV',
}
for old_label, new_label in region_renames.items():
    ad.obs.loc[ad.obs['region_finest'] == old_label, 'region_finest'] = new_label

In [None]:
# Convert string annotation columns to categoricals. Uses the public AnnData
# method rather than the private `_sanitize` alias.
ad.strings_to_categoricals()

## Filtering 

In [None]:
# Keep only observations flagged as nuclei whose cell state is not
# 'unclassified'; copy so `ad` is a standalone object rather than a view.
is_nucleus = ad.obs['cell_or_nuclei'] == 'Nuclei'
is_classified = ad.obs['cell_state'] != 'unclassified'
ad = ad[is_nucleus & is_classified].copy()

In [None]:
# Rewrite cell_state as '<cell_type>-<cell_state>'.
type_label = ad.obs['cell_type'].astype(str)
state_label = ad.obs['cell_state'].astype(str)
ad.obs['cell_state'] = type_label + '-' + state_label

In [None]:
# Columns that together identify one pseudobulk group.
ident_cols = ['sample_ID', 'region_finest', 'cell_state', 'cell_type']

# Drop every other annotation column from obs.
ad.obs = ad.obs.loc[:, ident_cols].copy()

In [None]:
# Cast all identifier columns to plain strings in one assignment.
ad.obs[ident_cols] = ad.obs[ident_cols].astype(str)

In [None]:
# Turn the string identifier columns back into categoricals. Uses the public
# AnnData method rather than the private `_sanitize` alias.
ad.strings_to_categoricals()

## Now the pseudobulking

In [None]:
# Number of obs rows for each combination of the identifier columns.
# The count column is named directly in reset_index(): renaming a 'count'
# column only works on pandas >= 2.0 (older versions name that column 0, so
# the rename would silently do nothing and 'n_cells' would never exist).
counts = ad.obs[ident_cols].value_counts().reset_index(name='n_cells')

In [None]:
# Look up each observation's group size, then keep only observations whose
# group has at least 10 members. The left merge keeps the row order of
# ad.obs, so the mask is applied by position.
group_sizes = ad.obs.merge(counts, how='left')['n_cells']
ad = ad[(group_sizes >= 10).to_numpy()].copy()

In [None]:
# Drop genes that pass the min_cells=50 threshold in too few observations;
# `ad` is modified in place.
sc.pp.filter_genes(ad, min_cells=50, inplace=True)

In [None]:
# Keep the current var index as a 'gene_id' column, then index the genes by
# the values of the 'gene_name-new' column instead.
# NOTE(review): nothing here checks that those names are unique - confirm.
gene_names = ad.var['gene_name-new']
ad.var['gene_id'] = ad.var.index
ad.var.index = gene_names
# Clear the index name inherited from the column.
ad.var.index.name = None

In [None]:
# Recompute the per-group sizes on the filtered data; they are attached to
# the pseudobulk samples later.
# The count column is named directly in reset_index(): renaming a 'count'
# column only works on pandas >= 2.0 (older versions name that column 0, so
# the rename would silently do nothing and 'n_cells' would never exist).
counts = ad.obs[ident_cols].value_counts().reset_index(name='n_cells')

In [None]:
# Sum the expression values over all observations that share the same
# combination of identifier columns; the result is stored in layers['sum'].
adp = sc.get.aggregate(ad, by=ident_cols, func='sum')

In [None]:
# Use the summed values, cast to integers, as the main matrix X.
summed = adp.layers['sum']
adp.X = summed.astype(int)

In [None]:
# Remove the 'sum' layer from adp.layers.
# NOTE(review): presumably because its values were already copied into
# adp.X in the previous cell - confirm.
del adp.layers['sum']

### Add cell counts

In [None]:
# Attach the per-group size (n_cells) to each pseudobulk sample.
# A column-on-column merge returns a fresh 0..n-1 index, which would replace
# the existing obs names with row numbers, so the original index is put back
# afterwards. The left merge keeps the row order of adp.obs and `counts` has
# one row per key combination, so the rows line up one-to-one.
obs_index = adp.obs.index
obs_with_counts = adp.obs.merge(counts, how='left')
obs_with_counts.index = obs_index
adp.obs = obs_with_counts

In [None]:
# Expose the region annotation under the shorter column name 'region'.
adp.obs = adp.obs.rename(columns={'region_finest': 'region'})

In [None]:
# Rebind adp to a fresh copy of itself before saving.
# NOTE(review): presumably to get a clean, self-contained object after the
# obs edits above - confirm whether this is still required.
adp = adp.copy()

## Save

In [None]:
# Write the pseudobulk object to disk in h5ad format.
pseudobulk_path = 'heart-pseudobulk.h5ad'
adp.write_h5ad(pseudobulk_path)