### Script: export pseudobulk sample data for CIBERSORTx deconvolution

Here we create pseudobulk samples from HLCA data, which are used to test bulk deconvolution with HLCA-based cell type signature matrices.

Notebook written by Tessa Gillett.

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
def pseudobulk(adata, groupby):
    """Sum raw counts over groups of cells to build a pseudobulk matrix.

    Cells are grouped by the combination of their values in the ``groupby``
    columns of ``adata.obs``. For example ``groupby=['donor', 'celltype']``
    makes each pseudobulk 'sample' the aggregate of one cell type within
    one donor.

    Parameters
    ----------
    adata
        Object exposing ``obs`` (per-cell DataFrame), ``var`` (per-feature
        DataFrame) and ``layers['counts']`` (matrix indexed cells x features).
    groupby
        Name, or non-empty list of names, of the ``adata.obs`` columns that
        define the pseudobulk samples.

    Returns
    -------
    pandas.DataFrame
        One row per feature (indexed by ``adata.var.index``) and one column
        per pseudobulk sample, in order of first appearance. Column labels
        are the group values converted to strings and joined with ``'_'``.
        If ``adata`` holds no cells the frame has no columns.

    Raises
    ------
    ValueError
        If ``groupby`` is empty.

    Notes
    -----
    Side effect: the per-cell label is written to
    ``adata.obs['pseudobulk_sample']``.
    NOTE(review): when ``adata`` is an AnnData view this assignment
    presumably makes anndata materialise the view as a copy - confirm.
    """
    # Accept a single column name as well as a list of names; iterating a
    # bare string would otherwise treat every character as a column name.
    if isinstance(groupby, str):
        groupby = [groupby]
    groupby = list(groupby)
    if not groupby:
        # Without this check an empty list would silently reuse a stale
        # 'pseudobulk_sample' column from an earlier call (or raise KeyError).
        raise ValueError("groupby must name at least one adata.obs column")

    # add metadata column for pseudobulk 'sample'
    labels = adata.obs[groupby[0]].astype(str)
    for col in groupby[1:]:
        labels = labels + '_' + adata.obs[col].astype(str)
    adata.obs['pseudobulk_sample'] = labels

    counts = adata.layers['counts']
    label_values = labels.to_numpy()

    # Sum the counts of all cells of each 'sample' into one column. The
    # columns are collected first and assembled once, instead of growing a
    # DataFrame with pd.concat on every iteration.
    summed = {}
    for label in pd.unique(label_values):
        cell_mask = label_values == label  # plain numpy boolean mask
        # np.asarray(...).ravel() gives a 1-D vector whether .sum() returned
        # an ndarray (dense input) or an np.matrix (scipy sparse input).
        summed[label] = np.asarray(counts[cell_mask].sum(axis=0)).ravel()

    # add feature names
    return pd.DataFrame(summed, index=adata.var.index)

### Reading in & inspecting HLCA data

In [4]:
# Load the HLCA AnnData object from disk; the bare name on the last line
# makes the notebook display its summary.
adata = sc.read(filename="../data/HLCA_v1.1_20220725.h5ad")
adata

AnnData object with n_obs × n_vars = 584944 × 28527
    obs: 'sample', 'original_celltype_ann', 'study_long', 'study', 'last_author_PI', 'subject_ID', 'subject_ID_as_published', 'pre_or_postnatal', 'age_in_years', 'age_range', 'sex', 'ethnicity', 'mixed_ethnicity', 'smoking_status', 'smoking_history', 'BMI', 'known_lung_disease', 'condition', 'subject_type', 'cause_of_death', 'sample_type', 'anatomical_region_coarse', 'anatomical_region_detailed', 'tissue_dissociation_protocol', 'cells_or_nuclei', 'single_cell_platform', "3'_or_5'", 'enrichment', 'sequencing_platform', 'reference_genome_coarse', 'ensembl_release_reference_genome', 'cell_ranger_version', 'disease_status', 'fresh_or_frozen', 'cultured', 'cell_viability_%', 'comments', 'Processing_site', 'dataset', 'anatomical_region_level_1', 'anatomical_region_level_2', 'anatomical_region_level_3', 'anatomical_region_highest_res', 'age', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_n

In [None]:
# List the distinct values of the per-cell 'study' annotation.
adata.obs.loc[:, 'study'].unique()

### Get samples for airway

In [5]:
# Keep only the cells that belong to the Nawijn_2021 and Seibold_2020 studies.
adata_airway = adata[
    adata.obs['study'].isin(['Nawijn_2021', 'Seibold_2020']), :
]
# Uncomment to count the cells per sample:
# adata_airway.obs.groupby('sample')['sample'].count()

In [6]:
# Samples used for the airway pseudobulks:
# only the airway samples from Nawijn '21, and all samples from Seibold.
samples_airway = [
    'GRO-01_biopsy', 'GRO-02_biopsy', 'GRO-03_biopsy', 'GRO-04_biopsy',
    'GRO-05_biopsy', 'GRO-06_biopsy', 'GRO-07_biopsy', 'GRO-08_biopsy',
    'GRO-09_biopsy', 'GRO-10_biopsy', 'GRO-11_biopsy',
    'T85', 'T89', 'T90', 'T101', 'T120', 'T121', 'T126', 'T137', 'T153',
    'T154', 'T164', 'T165', 'T166', 'T167',
]
# Restrict the airway subset to cells from the listed samples.
adata_airway = adata_airway[adata_airway.obs['sample'].isin(samples_airway), :]

### Get samples for nose

In [9]:
# Keep only the cells that belong to the Jain_Misharin_2021 and
# Barbry_Leroy_2020 studies.
adata_nose = adata[
    adata.obs['study'].isin(['Jain_Misharin_2021', 'Barbry_Leroy_2020']), :
]
# Uncomment to count the cells per sample:
# adata_nose.obs.groupby('sample')['sample'].count()

In [None]:
# pd.crosstab(adata_nose[adata_nose.obs['study']=='Jain_Misharin_2021'].obs['sample'], 
#             adata_nose.obs['single_cell_platform'])

In [None]:
# pd.crosstab(adata_nose[adata_nose.obs['study']=='Barbry_Leroy_2020'].obs['sample'], 
#             adata_nose.obs['anatomical_region_coarse'])

In [10]:
# Samples used for the nose pseudobulks:
# the v2 samples from Jain_Misharin and the nasal samples from Barbry_Leroy.
samples_nose = [
    'SC181', 'SC182', 'SC183', 'SC184', 'SC185',
    'D322_Biop_Nas1', 'D339_Biop_Nas1', 'D344_Biop_Nas1',
    'D353_Brus_Nas1', 'D363_Brus_Nas1', 'D367_Brus_Nas1', 'D372_Brus_Nas1',
]
# Restrict the nose subset to cells from the listed samples.
adata_nose = adata_nose[adata_nose.obs['sample'].isin(samples_nose), :]

### Get samples for parenchyma

In [12]:
# Keep only the cells that belong to the Banovich_Kropski_2020 and
# Misharin_Budinger_2018 studies.
adata_parenchyma = adata[
    adata.obs['study'].isin(['Banovich_Kropski_2020', 'Misharin_Budinger_2018']), :
]
# Uncomment to count the cells per sample:
# adata_parenchyma.obs.groupby('sample')['sample'].count()

In [None]:
# # which samples do we have?
# pd.crosstab(adata_parenchyma.obs['sample'], adata_parenchyma.obs['anatomical_region_coarse'])
# pd.crosstab(adata_parenchyma.obs['sample'], adata_parenchyma.obs['study'])

In [13]:
# Samples used for the parenchyma pseudobulks: all samples from
# Misharin_Budinger_2018, plus 10 samples from Banovich_Kropski_2020
# with >2500 cells.
samples_parenchyma = [
    'SC07', 'SC10', 'SC18', 'SC20', 'SC22', 'SC24', 'SC27', 'SC29',
    'VUHD101', 'VUHD68', 'VUHD67', 'VUHD66',
    'F02617', 'F02611', 'F02607', 'F02528', 'F02526', 'F02524',
]

In [14]:
# Restrict the parenchyma subset to cells from the listed samples.
adata_parenchyma = adata_parenchyma[
    adata_parenchyma.obs['sample'].isin(samples_parenchyma), :
]

### Create pseudobulk data

In [None]:
# Build the airway pseudobulks (cells are grouped by the obs columns named
# in the list, here one pseudobulk per sample) and save the raw count
# matrix as CSV.
pseudobulk_airway = pseudobulk(adata_airway, groupby=['sample'])
pseudobulk_airway.to_csv(path_or_buf='../data/pseudobulk_data/pseudobulk_airway.csv')

In [None]:
# Build the nose pseudobulks (cells are grouped by the obs columns named
# in the list, here one pseudobulk per sample) and save the raw count
# matrix as CSV.
pseudobulk_nose = pseudobulk(adata_nose, groupby=['sample'])
pseudobulk_nose.to_csv(path_or_buf='../data/pseudobulk_data/pseudobulk_nose.csv')

In [None]:
# Build the parenchyma pseudobulks (cells are grouped by the obs columns
# named in the list, here one pseudobulk per sample) and save the raw count
# matrix as CSV.
pseudobulk_parenchyma = pseudobulk(adata_parenchyma, groupby=['sample'])
pseudobulk_parenchyma.to_csv(path_or_buf='../data/pseudobulk_data/pseudobulk_parenchyma.csv')