### Goal

Generate input files for CIBERSORTx bulk deconvolution: expression matrix with custom cell type composition. For each specified cell type we'll sample up to 'n_cells' cells to include in the reference, to be used to generate a signature matrix in CIBERSORTx. Cells are sampled from a specified location 'tissue' (nose, airway, or parenchyma).

Notebook written by Tessa Gillett. 

In [1]:
# import libraries
import numpy as np
import pandas as pd
import scanpy as sc

### Set up

Specify "tissue" to subset reference set to anatomical location + location of HLCA object.

In [2]:
# set up & load HLCA data
tissue = "parenchyma"
n_cells = 200
adata = sc.read("../HLCA_v1.1_20220725.h5ad")
output_path = "../pseudobulk_analysis/"

#### Configure which cell types will be included. 

In [3]:
# key = the column in adata.obs (so level 1 - 5)
# value = the cell type as specified in that column
# value can be a list of multiple cell types if we want to merge those

# PARENCHYMA
include_cell_types = [
    ("ann_level_3_clean", "EC arterial"),    
    ("ann_level_3_clean", "EC capillary"),
    ("ann_level_3_clean", "EC venous"),    
    ("ann_level_2_clean", "Lymphatic EC"),
    ("ann_level_3_clean", ["Basal", "Secretory"]),    
    ("ann_level_3_clean", "Multiciliated lineage"),
    ("ann_level_3_clean", "AT2"),
    ("ann_level_3_clean", "B cell lineage"),
    ("ann_level_3_clean", ["Innate lymphoid cell NK", "T cell lineage"]),
    ("ann_level_3_clean", "Dendritic cells"),
    ("ann_level_4_clean", 'Alveolar macrophages'),
    ("ann_level_4_clean", "Interstitial macrophages"),
    ("ann_level_3_clean", "Mast cells"),
    ("ann_level_3_clean", "Monocytes"),
    ("ann_level_2_clean", "Fibroblast lineage"),
    ("ann_level_2_clean", "Smooth muscle")
]    

In [None]:
# NOSE
include_cell_types = [
    ("ann_level_4_clean", ["Basal resting", "Suprabasal"]),
    ("ann_level_4_clean", "Hillock-like"),
    ("ann_level_3_clean", "Multiciliated lineage"),
    ("ann_level_4_clean", "Club"),
    ("ann_level_4_clean", "Goblet"),
    ("ann_level_3_clean", "T cell lineage"),
    ("ann_level_3_clean", "Dendritic cells")
]

In [None]:
# AIRWAY
include_cell_types = [
    ("ann_level_3_clean", "EC capillary"), 
    ("ann_level_3_clean", "EC venous"),
    ("ann_level_3_clean", "B cell lineage"),
    ("ann_level_4_clean", "NK cells"),
    ("ann_level_4_clean", "CD4 T cells"),
    ("ann_level_4_clean", "CD8 T cells"),
    ("ann_level_3_clean", "Dendritic cells"),
    ("ann_level_4_clean", "Alveolar macrophages"),
    ("ann_level_4_clean", "Interstitial macrophages"),
    ("ann_level_3_clean", "Mast cells"),
    ("ann_level_3_clean", "Monocytes"),
    ("ann_level_3_clean", "Fibroblasts"),
    ("ann_level_2_clean", "Smooth muscle"),
    ("ann_level_3_clean", ["Basal", "Secretory"]), 
    ("ann_level_3_clean", "Multiciliated lineage") 
]

#### Run check for typos / existence of the specified cell types

In [4]:
# run check for typing errors
warning = False
for item in include_cell_types:
    
    if type(item[1]) == str: # if just the one cell type (simple)
        if item[1] not in adata.obs[item[0]].unique():
            print("Warning: " + item[1] + " not found in column " + item[0])
            warning = True
        
    else: # if list of cell types to be combined
        for cell_type in item[1]:
            if cell_type not in adata.obs[item[0]].unique():
                print("Warning: " + cell_type + " not found in column " + item[0])
                warning = True
                
if warning == False:
    print("All good!")

All good!


#### Subsample to n_cells cells per cell type

In [5]:
adata = adata[adata.obs['anatomical_region_level_1'] == tissue]

count_subsets = 0
for item in include_cell_types:
    
    # subset adata to relevant cell types
    if type(item[1]) == str:
        adata_subset = adata[adata.obs[item[0]] == item[1]].copy()
        adata_subset.obs['custom_label'] = item[1]
    else:
        adata_subset = adata[adata.obs[item[0]].isin(item[1])].copy()
        adata_subset.obs['custom_label'] = ' & '.join(item[1])
    
    # subsample to n_cells cells
    if adata_subset.obs.shape[0] > n_cells:
        sc.pp.subsample(adata_subset, n_obs=n_cells, random_state=0)
    
    # merge with previous cell types' data
    if count_subsets == 0:
        adata_subsampled = adata_subset
    else:
        adata_subsampled = adata_subsampled.concatenate(adata_subset)
    
    count_subsets +=1

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_a

#### Save cell counts matrix with HGNC gene names

In [6]:
adata = adata_subsampled
    
# write scRNA-seq counts matrix with cols = samples and rows = genes
counts_layer = pd.DataFrame(adata.layers['counts'].todense(), index=adata.to_df().index, 
                            columns=adata.to_df().columns)    
counts_layer = counts_layer.transpose()

# get labels as first row (calling it GeneSymbols to get the colname of the first col right)
# (not entirely sure if that's neccessary, but let's run with it..)
adata.obs['GeneSymbols'] = adata.obs['custom_label']
counts_withInfo = pd.DataFrame(adata.obs['GeneSymbols']).transpose()
counts_withInfo = counts_withInfo.append(counts_layer)

# write
counts_withInfo.to_csv(output_path + "/" + tissue + "_subsampled_matrix_max" + str(n_cells) + "cells_HUGO.txt", 
                       header=False, sep='\t')
print("done " + tissue)

  counts_withInfo = counts_withInfo.append(counts_layer)


done parenchyma


#### Save cell counts matrix with Ensembl gene names

In [7]:
adata = adata_subsampled
        
# write scRNA-seq counts matrix with cols = samples and rows = genes
counts_layer = pd.DataFrame(adata.layers['counts'].todense(), index=adata.to_df().index, 
                            columns=adata.var['gene_ids'])    
counts_layer = counts_layer.transpose()

# get labels as first row (calling it GeneSymbols to get the colname of the first col right)
# (not entirely sure if that's neccessary, but let's run with it..)
adata.obs['GeneSymbols'] = adata.obs['custom_label']
counts_withInfo = pd.DataFrame(adata.obs['GeneSymbols']).transpose()
counts_withInfo = counts_withInfo.append(counts_layer)

# write
counts_withInfo.to_csv(output_path + "/" + tissue + "_subsampled_matrix_max" + str(n_cells) + "cells_ENSG.txt", 
                       header=False, sep='\t')
print("done " + tissue)

  counts_withInfo = counts_withInfo.append(counts_layer)


done parenchyma
