In [1]:
import scanpy as sc
import pandas as pd
import numpy as np

In [2]:
# Root data directory; the input .h5ad path and the output CSV path are both
# built from this prefix.
data_dir = "/projects/b1038/Pulmonary/cpuritz/PASC/data"

In [3]:
# Load the integrated NEP AnnData object from its .h5ad file under data_dir
h5ad_path = f"{data_dir}/01NEP/01integrated_NEP_GEO_v2/01integrated_NEP_GEO_v2.h5ad"
adata = sc.read_h5ad(h5ad_path)

### Count number of cells per cell type

In [5]:
# Build the counts table: one row per observed (cell_type, Status, Study_ID)
# combination. n_cells is the number of non-null Library_ID entries in the group.
cols = ['cell_type', 'Status', 'Study_ID']
counts = (
    adata.obs
    .groupby(cols, observed = True)['Library_ID']
    .count()
    .reset_index()
    .rename(columns = {'Library_ID': 'n_cells'})
    .astype({'cell_type': 'category'})
)

In [6]:
# Compute cell type proportions
# Per-sample sums of the numeric columns, kept as a standalone summary table.
total_counts = counts.groupby('Study_ID').sum(numeric_only = True)
# Broadcast each sample's total cell count onto all of its rows. Selecting
# 'n_cells' by name avoids depending on the position of the column.
counts['total'] = counts.groupby('Study_ID', observed = True)['n_cells'].transform('sum')
# Fraction of the sample's cells assigned to each cell type; computed once for
# the whole table instead of once per sample.
counts['cell_proportion'] = counts['n_cells'] / counts.total

In [7]:
# Add in zero values for cell types not found in each sample
# Every (sample, cell type) pair missing from `counts` gets a row with
# n_cells = 0 and cell_proportion = 0, reusing the sample's Status and total.
cell_types = counts.cell_type.unique()
zero_rows = []
for s in counts.Study_ID.unique():
    scounts = counts.loc[counts.Study_ID == s]
    # Set membership avoids rebuilding a list for every candidate cell type.
    present = set(scounts.cell_type)
    # Any row of the sample works as a template for Status / Study_ID / total.
    template = scounts.iloc[0]
    for ct in cell_types:
        if ct not in present:
            zero_rows.append({
                'cell_type': ct,
                'Status': template.Status,
                'Study_ID': template.Study_ID,
                'n_cells': 0,
                'total': template.total,
                'cell_proportion': 0,
            })

# Append all padding rows in one step. Casting to the existing column dtypes
# first keeps categorical and numeric columns unchanged after concatenation.
if zero_rows:
    zero_counts = (
        pd.DataFrame(zero_rows, columns = counts.columns)
        .astype(counts.dtypes.to_dict())
    )
    counts = pd.concat([counts, zero_counts], ignore_index = True)

In [8]:
# Preview the first rows of the counts table
counts.head()

Unnamed: 0,cell_type,Status,Study_ID,n_cells,total,cell_proportion
0,Secretory ciliated cells,RPRA,RPRA02,1206,7584,0.159019
1,Secretory ciliated cells,RPRA,RPRA03,1,3296,0.000303
2,Secretory ciliated cells,RPRA,RPRA05,231,3427,0.067406
3,Secretory ciliated cells,RPRA,RPRA06,28,11796,0.002374
4,Secretory ciliated cells,RPRA,RPRA07,11,10096,0.00109


### Write output

In [9]:
# Write the counts table to CSV under data_dir, without the row index
out_path = f"{data_dir}/deidentified_data/deidentified_NEP_cell_counts.csv"
counts.to_csv(out_path, index = False)