# BAL Pseudobulk

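Compute pseudobulk counts for BAL samples: for each cell type, sum the raw per-gene counts over all cells of each sample and write one counts table per cell type.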

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import math
from collections import Counter

In [2]:
# Paths to the integrated BAL object and the matching raw object for this
# dataset version.
data_dir = "/projects/b1038/Pulmonary/cpuritz/PASC/data/01BAL"
v = "GEO_v2"

integrated_path = f"{data_dir}/01integrated_BAL_{v}/01integrated_BAL_{v}.h5ad"
raw_path = f"{data_dir}/raw/adata_raw_{v}.h5ad"

# `adata` supplies the per-cell annotations; `adata_raw` supplies the matrix
# that is aggregated later in the notebook.
adata = sc.read_h5ad(integrated_path)
adata_raw = sc.read_h5ad(raw_path)

In [3]:
# Distinct cell type labels and sample identifiers from the integrated
# object, plus the gene order of the raw object (used as table columns).
cell_types = adata.obs["cell_type"].unique()
study_ids = adata.obs["Study_ID"].unique()
gene_names = adata_raw.var.index.to_list()

In [4]:
# Build pseudobulk tables for every cell type.
#
# For each cell type, every sample (Study_ID) with enough cells of that type
# contributes one entry to three files written under {data_dir}/DEG/counts/:
#   <fname>-meta.csv     number of cells and status per sample (comma-separated)
#   <fname>.csv          per-gene sums of adata_raw.X (tab-separated, genes x samples)
#   <fname>-n_cells.csv  per-gene number of cells with a value > 0
#                        (tab-separated, genes x samples)
# cell_names.csv maps each cell type label to its file stem.

# A sample is kept for a cell type only if it has MORE than
# max(MIN_CELLS, floor(CUTOFF * number of cells of that type)) cells of that type.
# NOTE(review): the comparison is strict (>), so a sample with exactly
# MIN_CELLS cells is dropped; confirm that this is the intended threshold.
MIN_CELLS = 40
CUTOFF = 0.01

# Skip epithelial and SARS-CoV-2 cells
cell_types = [x for x in cell_types if x not in ["Epithelial cells", "SARS-CoV-2"]]


def _column_sums(matrix):
    """Return the column sums of a dense or scipy-sparse 2-D matrix as a 1-D array."""
    # Sparse matrices return a 1 x n np.matrix, dense arrays a 1-D array;
    # flatten both to the same shape.
    return np.asarray(matrix.sum(axis=0)).ravel()


def _stack_rows(rows, index, columns):
    """Build a samples x genes DataFrame from per-sample 1-D rows (possibly none)."""
    # np.vstack rejects an empty list, so build an explicit 0-row array instead.
    values = np.vstack(rows) if rows else np.empty((0, len(columns)))
    return pd.DataFrame(values, index=index, columns=columns)


# Sample status does not depend on the cell type, so resolve it once per
# sample from the first cell of that sample. str() lets the comparison work
# whether is_RPRA is stored as the string 'True' or as a real boolean.
first_per_sample = adata.obs[['Study_ID', 'is_RPRA']].drop_duplicates('Study_ID')
sample_status = {
    sid: 'RPRA' if str(flag) == 'True' else 'Healthy'
    for sid, flag in zip(first_per_sample['Study_ID'], first_per_sample['is_RPRA'])
}

cell_files = []
for ct in cell_types:
    obs_ct = adata.obs[adata.obs.cell_type == ct]
    min_cells = max(MIN_CELLS, math.floor(CUTOFF * obs_ct.shape[0]))

    # Collect rows in lists and build each table once, instead of growing
    # DataFrames with repeated pd.concat calls.
    kept_ids = []
    meta_rows = []
    count_rows = []
    n_cell_rows = []
    for sid in study_ids:
        cells = obs_ct[obs_ct.Study_ID == sid].index
        data = adata_raw[adata_raw.obs.index.isin(cells)]

        # Count the cells actually present in the raw object.
        nc = data.n_obs
        if nc <= min_cells:
            continue

        X = data.X
        kept_ids.append(sid)

        # Number of cells of specified type in sample, status of sample
        meta_rows.append([nc, sample_status[sid]])

        # Total counts for each gene
        count_rows.append(_column_sums(X))

        # Number of cells with nonzero expression of each gene
        n_cell_rows.append(_column_sums(X > 0))

    df_meta = pd.DataFrame(meta_rows, columns=['cells', 'status'], index=kept_ids)
    df_counts = _stack_rows(count_rows, kept_ids, gene_names)
    df_cells = _stack_rows(n_cell_rows, kept_ids, gene_names)

    fname = ct.lower().replace(' - ', '_').replace(', ', '_').replace(' ', '_')
    cell_files.append(fname)

    df_meta.to_csv(f"{data_dir}/DEG/counts/{fname}-meta.csv")
    df_counts.T.to_csv(f"{data_dir}/DEG/counts/{fname}.csv", sep = '\t')
    df_cells.T.to_csv(f"{data_dir}/DEG/counts/{fname}-n_cells.csv", sep = '\t')

    print(f"{ct}: {df_counts.shape[0]}/{len(study_ids)}")

cell_df = pd.DataFrame(data = {'type' : cell_types, 'file' : cell_files})
cell_df.to_csv(f"{data_dir}/DEG/counts/cell_names.csv", index = False)

TRAM-3: 20/30
CD4 T cells-1: 20/30
MoAM-1: 21/30
TRAM-1: 23/30
TRAM-4: 21/30
DC2: 25/30
Proliferating T cells: 7/30
CD8 T cells-1: 16/30
TRAM-2: 20/30
MoAM-4: 21/30
CD4 T cells-2: 19/30
MoAM-2: 22/30
Proliferating macrophages: 24/30
pDC: 6/30
TRAM-5: 25/30
CD8 T cells-3: 21/30
MoAM-3: 23/30
Perivascular macrophages: 24/30
TRAM-6: 23/30
TRAM-7: 24/30
Monocytes-1: 4/30
Monocytes-2: 8/30
CD8 T cells-2: 18/30
B cells: 13/30
gdT cells and NK cells: 22/30
DC1: 14/30
Migratory DC: 10/30
Tregs: 21/30
Mast cells: 4/30
Plasma cells: 1/30
