### Generation of cell type signature matrices for deconvolution

Here we generate the input files for CIBERSORTx deconvolution: an expression matrix with a custom cell type composition. For each specified cell type we sample up to 'n_cells' cells to include in the reference (default: 200), which is then used to generate a signature matrix in CIBERSORTx. Cells are sampled from a specified location (nose, airway, or parenchyma). The parameters here are the ones used for the paper, but the script can be adapted to tailor the matrices to any specific research question and bulk dataset by adjusting the parameter settings, the cell type subselection, etc.

In [1]:
# import libraries
import numpy as np
import pandas as pd
import scanpy as sc

set paths:

In [None]:
# Input: the HLCA object (.h5ad) that the reference cells are sampled from.
path_hlca = "../../data/HLCA_core_h5ads/HLCA_v2.h5ad"
# Output: directory the matrices are written to. File names are appended as
# output_path + "/" + <file name>, which is why there must be no trailing slash.
output_path = "../../results/deconvolution/signature_matrices" # no trailing slash

### Set up

Specify "location" to subset the reference set to the anatomical location to sample from; the file location of the HLCA object is taken from the path set above. "n_cells" is the maximum number of cells included in the reference per cell type. "gene_IDs" refers to the gene identifier format, which should match the bulk data: Ensembl ID or HGNC symbol (specify "ENSG" or "HGNC" respectively).

In [2]:
# set up & load HLCA data
# anatomical location to sample cells from; it is compared against
# adata.obs['anatomical_region_level_1'] when the object is subset
location = "parenchyma"
# load the HLCA object from the path set above
adata = sc.read(path_hlca)
# maximum number of cells kept per cell type; cell types with fewer cells
# than this are kept whole
n_cells = 200
# NOTE(review): gene_IDs is not read by any of the cells shown in this
# notebook; both the HGNC and the ENSG matrix are written regardless of its
# value - confirm whether it should select one of the two outputs
gene_IDs = "HGNC" # "HGNC" or "ENSG"

#### Configure which cell types will be included. 

In [3]:
# Each entry is a (column, cell type) tuple:
#   column    = the annotation column in adata.obs (so level 1 - 5)
#   cell type = the cell type as specified in that column; this can be a list
#               of multiple cell types if we want to merge those into one
#               reference cell type (labelled with the names joined by ' & ')

# NOSE example:
# NOTE(review): labelled as the nose example, while the set-up cell sets
# location = "parenchyma" - confirm that this list matches the location
# that is actually being sampled
include_cell_types = [
    ("ann_level_4_clean", ["Basal resting", "Suprabasal"]),
    ("ann_level_4_clean", "Hillock-like"),
    ("ann_level_3_clean", "Multiciliated lineage"),
    ("ann_level_4_clean", "Club"),
    ("ann_level_4_clean", "Goblet"),
    ("ann_level_3_clean", "T cell lineage"),
    ("ann_level_3_clean", "Dendritic cells")
]

#### Run check for typos / existence of the specified cell types

In [4]:
# Run check for typing errors: every requested cell type has to occur in the
# adata.obs column it is listed under. The check looks at the object as it is
# at this point, so a cell type that passes here may still have no cells in
# the location that is selected further down.
warning = False
for obs_column, cell_types in include_cell_types:

    # a single cell type is given as a plain string, cell types to be combined
    # as a list; wrap the string so that both forms go through the same check
    if isinstance(cell_types, str):
        cell_types = [cell_types]

    # a misspelled column name would otherwise abort the check with a KeyError
    if obs_column not in adata.obs.columns:
        print("Warning: column " + obs_column + " not found in adata.obs")
        warning = True
        continue

    # collect the labels of the column once per entry, not once per cell type
    known_cell_types = set(adata.obs[obs_column].unique())
    for cell_type in cell_types:
        if cell_type not in known_cell_types:
            print("Warning: " + cell_type + " not found in column " + obs_column)
            warning = True

if not warning:
    print("All good!")

All good!


#### Subsample to n_cells cells per cell type

In [5]:
# Build the reference set: restrict the object to the chosen location, then
# take up to n_cells cells per requested cell type and stack them into
# adata_subsampled, labelling every cell in obs['custom_label'].
adata = adata[adata.obs['anatomical_region_level_1'] == location]

adata_subsampled = None
count_subsets = 0
for obs_column, cell_types in include_cell_types:

    # a plain string is a single cell type, a list is a group of cell types
    # that are merged under one label; wrap the string so both take one path
    if isinstance(cell_types, str):
        cell_types = [cell_types]
    custom_label = ' & '.join(cell_types)

    # subset adata to relevant cell types (copy, so that obs can be edited)
    adata_subset = adata[adata.obs[obs_column].isin(cell_types)].copy()

    # the typo check runs before the location filter, so a correctly spelled
    # cell type can still have no cells here; say so instead of silently
    # producing a reference that lacks this cell type
    if adata_subset.n_obs == 0:
        print("Warning: no cells found for " + custom_label + " in " + location)
        continue
    adata_subset.obs['custom_label'] = custom_label

    # subsample to n_cells cells (fixed seed, so the selection is reproducible)
    if adata_subset.n_obs > n_cells:
        sc.pp.subsample(adata_subset, n_obs=n_cells, random_state=0)

    # merge with previous cell types' data
    # NOTE(review): AnnData.concatenate appears to be deprecated in favour of
    # anndata.concat; it is kept because concat has different defaults for the
    # obs names, the 'batch' column and the merging of var columns - confirm
    # those before migrating
    if adata_subsampled is None:
        adata_subsampled = adata_subset
    else:
        adata_subsampled = adata_subsampled.concatenate(adata_subset)

    count_subsets += 1

if adata_subsampled is None:
    raise ValueError("No cells selected: check include_cell_types and location")

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],
  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


#### Save cell counts matrix with HGNC gene names

In [6]:
adata = adata_subsampled

# write scRNA-seq counts matrix with cols = cells and rows = genes
# (the row/column labels are read straight from the AnnData object, which
# avoids building the dense adata.to_df() table twice just for its labels)
counts_layer = pd.DataFrame(adata.layers['counts'].todense(), index=adata.obs_names,
                            columns=adata.var_names)
counts_layer = counts_layer.transpose()

# get the cell type labels as first row; the row is called GeneSymbols so that
# this name ends up in the first column of the first line of the file
# (not entirely sure if that's necessary, but let's run with it..)
adata.obs['GeneSymbols'] = adata.obs['custom_label']
counts_withInfo = pd.DataFrame(adata.obs['GeneSymbols']).transpose()
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
counts_withInfo = pd.concat([counts_withInfo, counts_layer])

# write (header=False: the cell barcodes are dropped, the label row is line 1)
counts_withInfo.to_csv(f"{output_path}/{location}_subsampled_matrix_max{n_cells}cells_HGNC.txt",
                       header=False, sep='\t')
print("done " + location)

  counts_withInfo = counts_withInfo.append(counts_layer)


done parenchyma


#### Save cell counts matrix with Ensembl gene names

In [7]:
adata = adata_subsampled

# write scRNA-seq counts matrix with cols = cells and rows = genes; the genes
# are labelled with the identifiers stored in adata.var['gene_ids']
# (the cell labels are read straight from the AnnData object, which avoids
# building the dense adata.to_df() table just for its index)
counts_layer = pd.DataFrame(adata.layers['counts'].todense(), index=adata.obs_names,
                            columns=adata.var['gene_ids'])
counts_layer = counts_layer.transpose()

# get the cell type labels as first row; the row is called GeneSymbols so that
# this name ends up in the first column of the first line of the file
# (not entirely sure if that's necessary, but let's run with it..)
adata.obs['GeneSymbols'] = adata.obs['custom_label']
counts_withInfo = pd.DataFrame(adata.obs['GeneSymbols']).transpose()
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
counts_withInfo = pd.concat([counts_withInfo, counts_layer])

# write (header=False: the cell barcodes are dropped, the label row is line 1)
counts_withInfo.to_csv(f"{output_path}/{location}_subsampled_matrix_max{n_cells}cells_ENSG.txt",
                       header=False, sep='\t')
print("done " + location)

  counts_withInfo = counts_withInfo.append(counts_layer)


done parenchyma
