# NEP Pseudobulk

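Compute pseudobulk counts for NEP samples: for each cell type, sum the raw counts over the cells of each sample, and write the per-sample count matrices, the number of expressing cells per gene, and the sample metadata to `DEG/counts`.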

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import math
from collections import Counter

In [2]:
# Root directory of the NEP data and the dataset version tag used in file names
data_dir = "/projects/b1038/Pulmonary/cpuritz/PASC/data/01NEP"
v = "GEO_v2"

# The integrated object lives in a folder that shares its base name
integrated_name = f"01integrated_NEP_{v}"
integrated_path = f"{data_dir}/{integrated_name}/{integrated_name}.h5ad"
raw_path = f"{data_dir}/raw/adata_raw_{v}.h5ad"

# adata: integrated object (source of the obs annotations used below)
# adata_raw: object whose X is summed into pseudobulk counts
adata = sc.read_h5ad(integrated_path)
adata_raw = sc.read_h5ad(raw_path)

In [3]:
# Distinct cell-type labels and sample identifiers found in the integrated object
cell_types = adata.obs["cell_type"].unique()
study_ids = adata.obs["Study_ID"].unique()

# Gene names taken from the raw object's var index; used as column labels
# for the per-sample count tables
gene_names = adata_raw.var.index.to_list()

In [4]:
# Sample inclusion thresholds, applied separately for every cell type.
# A sample is kept only when its number of cells of that type is strictly
# greater than both MIN_CELLS and floor(CUTOFF * total cells of that type).
MIN_CELLS = 40
CUTOFF = 0.01

# Skip erythrocytes
cell_types = [x for x in cell_types if x != 'Erythrocytes']

# All output tables are written to this directory (it must already exist)
out_dir = f"{data_dir}/DEG/counts"

cell_files = []
for ct in cell_types:
    obs_ct = adata.obs[adata.obs.cell_type == ct]
    cutoff = math.floor(CUTOFF * obs_ct.shape[0])
    min_required = max(MIN_CELLS, cutoff)

    # Rows are accumulated in plain lists and turned into DataFrames once per
    # cell type; growing a DataFrame with pd.concat inside the loop is
    # quadratic and warns on the initial empty frame in recent pandas.
    kept_ids = []
    meta_rows = []
    count_rows = []
    cell_rows = []
    for sid in study_ids:
        cells = obs_ct[obs_ct.Study_ID == sid].index
        data = adata_raw[adata_raw.obs.index.isin(cells)]

        nc = data.n_obs
        if nc <= min_required:
            continue

        # Status of the sample, read from the first cell listed for it.
        # .iloc[0] is explicit positional access; plain [0] on a label-indexed
        # Series relies on a deprecated positional fallback.
        st = adata.obs.loc[adata.obs.Study_ID == sid, 'Status'].iloc[0]

        kept_ids.append(sid)

        # Number of cells of specified type in sample, status of sample
        meta_rows.append([nc, st])

        # np.sum over axis 0 returns a 1 x n_genes np.matrix for a sparse
        # matrix and a 1-D array for a dense array; flatten so both work.
        X = data.X

        # Total counts for each gene
        count_rows.append(np.asarray(np.sum(X, axis=0)).ravel())

        # Number of cells with nonzero expression of each gene
        cell_rows.append(np.asarray(np.sum(X > 0, axis=0)).ravel())

    if kept_ids:
        df_meta = pd.DataFrame(meta_rows, columns=['cells', 'status'], index=kept_ids)
        df_counts = pd.DataFrame(np.vstack(count_rows), columns=gene_names, index=kept_ids)
        df_cells = pd.DataFrame(np.vstack(cell_rows), columns=gene_names, index=kept_ids)
    else:
        # No sample passed the thresholds: write empty tables, as before
        df_meta = pd.DataFrame()
        df_counts = pd.DataFrame()
        df_cells = pd.DataFrame()

    # File stem derived from the cell-type label (lower case, separators -> '_')
    fname = ct.lower().replace(' - ', '_').replace(', ', '_').replace(' ', '_')
    cell_files.append(fname)

    # Metadata is comma-separated; the transposed (genes x samples) count
    # tables are tab-separated despite the .csv extension.
    df_meta.to_csv(f"{out_dir}/{fname}-meta.csv")
    df_counts.T.to_csv(f"{out_dir}/{fname}.csv", sep='\t')
    df_cells.T.to_csv(f"{out_dir}/{fname}-n_cells.csv", sep='\t')

    # Report how many samples were kept for this cell type
    print(f"{ct}: {len(kept_ids)}/{len(study_ids)}")

# Lookup table from cell-type label to the file stem used above
cell_df = pd.DataFrame(data={'type': cell_types, 'file': cell_files})
cell_df.to_csv(f"{out_dir}/cell_names.csv", index=False)

Secretory ciliated cells: 7/11
Secretory cells: 10/11
Ciliated cells: 8/11
Mucous cells: 5/11
Ionocytes: 2/11
Suprabasal cells: 11/11
Basal cells: 9/11
Deuterosomal cells: 1/11
gdT cells: 1/11
Proliferating basal cells: 10/11
Macrophages: 5/11
Squamous cells: 10/11
CD8 T cells: 5/11
CD4 T cells: 4/11
pDC: 1/11
Mast cells: 1/11
Monocytes: 1/11
DC2: 2/11
B cells: 2/11
MMP9 basal cells: 4/11
Fibroblasts: 0/11
