### Getting cell count data for pseudobulk samples

In this notebook we count the cells of each cell type in every sample, which provides the ground truth cell type proportions for the pseudobulks created in notebook 10.

Script written by Tessa Gillett.

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

# Read the HLCA AnnData object from disk (the path is relative to the working directory).
# All compartment-specific subsets below are taken from this object.
adata = sc.read("../HLCA_v1.1_20220725.h5ad")

#### Get airway pseudobulk subset

In [2]:
# Samples to keep for the airway pseudobulk subset
samples_airway = [
    'GRO-01_biopsy', 'GRO-02_biopsy', 'GRO-03_biopsy', 'GRO-04_biopsy', 'GRO-05_biopsy',
    'GRO-06_biopsy', 'GRO-07_biopsy', 'GRO-08_biopsy', 'GRO-09_biopsy', 'GRO-10_biopsy',
    'GRO-11_biopsy',
    'T85', 'T89', 'T90', 'T101', 'T120', 'T121', 'T126', 'T137', 'T153',
    'T154', 'T164', 'T165', 'T166', 'T167',
]

# Keep only cells that come from one of the two selected studies
# and that also belong to one of the listed samples.
adata_airway = adata[
    adata.obs['study'].isin(['Nawijn_2021', 'Seibold_2020'])
    & adata.obs['sample'].isin(samples_airway)
]

#### Get nose pseudobulk subset

In [None]:
# Samples to keep for the nose pseudobulk subset
samples_nose = [
    'SC181', 'SC182', 'SC183', 'SC184', 'SC185',
    'D322_Biop_Nas1', 'D339_Biop_Nas1', 'D344_Biop_Nas1',
    'D353_Brus_Nas1', 'D363_Brus_Nas1', 'D367_Brus_Nas1', 'D372_Brus_Nas1',
]

# Keep only cells that come from one of the two selected studies
# and that also belong to one of the listed samples.
adata_nose = adata[
    adata.obs['study'].isin(['Jain_Misharin_2021', 'Barbry_Leroy_2020'])
    & adata.obs['sample'].isin(samples_nose)
]

#### Get parenchyma pseudobulk subset

In [None]:
# Samples to keep for the parenchyma pseudobulk subset
samples_parenchyma = [
    'SC07', 'SC10', 'SC18', 'SC20', 'SC22', 'SC24', 'SC27', 'SC29',
    'VUHD101', 'VUHD68', 'VUHD67', 'VUHD66',
    'F02617', 'F02611', 'F02607', 'F02528', 'F02526', 'F02524',
]

# Keep only cells that come from one of the two selected studies
# and that also belong to one of the listed samples.
adata_parenchyma = adata[
    adata.obs['study'].isin(['Banovich_Kropski_2020', 'Misharin_Budinger_2018'])
    & adata.obs['sample'].isin(samples_parenchyma)
]

#### Add custom_label annotations

In [None]:
# Each entry is a pair (obs column, cell type):
#   - first element:  the annotation column in adata.obs (levels 1 - 5)
#   - second element: the cell type as it is written in that column;
#                     a list of cell types means those are merged into one label
# The order of the entries is the order in which rows are written to the count file.

# PARENCHYMA
include_cell_types_parenchyma = [
    # endothelial
    ("ann_level_3_clean", "EC arterial"),
    ("ann_level_3_clean", "EC capillary"),
    ("ann_level_3_clean", "EC venous"),
    ("ann_level_2_clean", "Lymphatic EC"),
    # epithelial
    ("ann_level_3_clean", ["Basal", "Secretory"]),
    ("ann_level_3_clean", "Multiciliated lineage"),
    ("ann_level_3_clean", "AT2"),
    # immune
    ("ann_level_3_clean", "B cell lineage"),
    ("ann_level_3_clean", ["Innate lymphoid cell NK", "T cell lineage"]),
    ("ann_level_3_clean", "Dendritic cells"),
    ("ann_level_4_clean", "Alveolar macrophages"),
    ("ann_level_4_clean", "Interstitial macrophages"),
    ("ann_level_3_clean", "Mast cells"),
    ("ann_level_3_clean", "Monocytes"),
    # stroma
    ("ann_level_2_clean", "Fibroblast lineage"),
    ("ann_level_2_clean", "Smooth muscle"),
]

In [None]:
# NOSE
# Pairs of (adata.obs column, cell type); a list of cell types is merged into one label.
include_cell_types_nose = [
    # epithelial
    ("ann_level_4_clean", ["Basal resting", "Suprabasal"]),
    ("ann_level_4_clean", "Hillock-like"),
    ("ann_level_3_clean", "Multiciliated lineage"),
    ("ann_level_4_clean", "Club"),
    ("ann_level_4_clean", "Goblet"),
    # immune
    ("ann_level_3_clean", "T cell lineage"),
    ("ann_level_3_clean", "Dendritic cells"),
]

In [3]:
# AIRWAY
# Pairs of (adata.obs column, cell type); a list of cell types is merged into one label.
include_cell_types_airway = [
    # endothelial
    ("ann_level_3_clean", "EC capillary"),
    ("ann_level_3_clean", "EC venous"),
    # immune
    ("ann_level_3_clean", "B cell lineage"),
    ("ann_level_4_clean", "NK cells"),
    ("ann_level_4_clean", "CD4 T cells"),
    ("ann_level_4_clean", "CD8 T cells"),
    ("ann_level_3_clean", "Dendritic cells"),
    ("ann_level_4_clean", "Alveolar macrophages"),
    ("ann_level_4_clean", "Interstitial macrophages"),
    ("ann_level_3_clean", "Mast cells"),
    ("ann_level_3_clean", "Monocytes"),
    # stroma
    ("ann_level_3_clean", "Fibroblasts"),
    ("ann_level_2_clean", "Smooth muscle"),
    # epithelial
    ("ann_level_3_clean", ["Basal", "Secretory"]),
    ("ann_level_3_clean", "Multiciliated lineage"),
]

In [4]:
def write_count_file(adata, outfile, include_cell_types, count_column="sample"):
    """Write per-sample cell counts for the requested cell types to a CSV file.

    One row is written for every (cell type entry, sample) combination, also
    when the count is zero. Rows are ordered by entry first (in the order of
    ``include_cell_types``) and by sample second (in order of first
    appearance in ``adata.obs['sample']``).

    Parameters
    ----------
    adata
        Object with an ``obs`` DataFrame that contains a ``'sample'`` column
        and every annotation column named in ``include_cell_types``.
    outfile
        Path of the CSV file to create (an existing file is overwritten).
    include_cell_types
        Iterable of ``(obs_column, cell_type)`` entries. ``cell_type`` is
        either a single label (any ``str``, including ``str`` subclasses such
        as ``numpy.str_``) or a list of labels that are counted together and
        written as one label joined with ``' & '``.
    count_column
        Header name of the third (count) column. The default ``"sample"``
        reproduces the original header ``sample,custom_label,sample``.
        NOTE(review): that header repeats the name of the first column; pass
        e.g. ``count_column="count"`` once the downstream readers of these
        files have been checked.

    Notes
    -----
    Fields are written without quoting, so sample names and labels must not
    contain commas.
    """
    # Look the table up once instead of once per (cell type, sample) pair.
    obs = adata.obs
    samples = obs['sample'].unique()

    with open(outfile, 'w') as f:
        f.write("sample,custom_label," + count_column + "\n")  # header

        for entry in include_cell_types:
            column, cell_type = entry[0], entry[1]

            # isinstance (not type() ==) so str subclasses are treated as one label
            if isinstance(cell_type, str):
                label = cell_type
                selected = obs[column] == cell_type
            else:
                label = ' & '.join(cell_type)
                selected = obs[column].isin(cell_type)

            # One pass over the selected cells gives the counts for all samples.
            counts = obs.loc[selected, 'sample'].value_counts()

            for s in samples:
                # samples without any selected cell are reported with a count of 0
                nr = int(counts.get(s, 0))
                f.write(s + ',' + label + ',' + str(nr) + '\n')
                
        

In [None]:
write_count_file(adata_parenchyma, 
                 '../pseudobulk_analysis/cell_counts_pseudobulk_samples/cell_counts_pseudobulk_parenchyma.csv',
                include_cell_types_parenchyma)

In [None]:
write_count_file(adata_nose, 
                 '../pseudobulk_analysis/cell_counts_pseudobulk_samples/cell_counts_pseudobulk_nose.csv',
                include_cell_types_nose)

In [5]:
write_count_file(adata_airway, 
                 '../pseudobulk_analysis/cell_counts_pseudobulk_samples/cell_counts_pseudobulk_airway.csv',
                include_cell_types_airway)

Note: the rest of the deconvolution analysis is performed in bash and R. See the `bash` and `R` subdirectories in the `scripts` directory of the GitHub repo.

Order of scripts:
1. [../../scripts/bash/bulk_deconvolution/convolution_pseudobulks.sh](../../scripts/bash/bulk_deconvolution/convolution_pseudobulks.sh) - commands used to do the deconvolutions on the pseudobulk data<br>
2. [../../scripts/R/bulk_deconvolution/process_pseudobulk_deconvolution.R](../../scripts/R/bulk_deconvolution/process_pseudobulk_deconvolution.R) - code to look at the pseudobulk deconvolution results, e.g. generate the plots with the correlations between deconvolution proportions and ground truth<br>
3. [../../scripts/R/preprocess_bulk_data.R](../../scripts/R/preprocess_bulk_data.R) - filter/prep true bulk data as received to feed into CIBERSORTx<br>
4. [../../scripts/bash/deconvolution_bulk_data.sh](../../scripts/bash/deconvolution_bulk_data.sh) - commands used to do the bulk deconvolutions<br>
5. [../../scripts/R/process_bulk_deconvolution.R](../../scripts/R/process_bulk_deconvolution.R) - code to look at the results: removing cell types with >60% zeros, Wilcoxon rank-sum tests, plotting results, etc.