# Subsampling from the HLCA core to prepare basis for deconvolution of bulk RNA-seq (or microarray) based on HLCA cell type transcriptomes

import modules:

In [1]:
import scanpy as sc
import numpy as np
import pandas as pd
import os

For pretty code formatting (not necessary to run the code):

In [2]:
# Optional formatting extension; the notebook's results do not depend on it.
%load_ext lab_black

Set paths

In [3]:
# Input: the HLCA .h5ad file that is read into `adata`.
path_HLCA = "../../data/HLCA_core_h5ads/HLCA_v2.h5ad"
# Input: CSV table relating cell types to anatomical locations.
path_celltype_to_anatomical_location = (
    "../../supporting_files/celltype_anatomical_location/"
    "cell_type_mapping_to_anatomical_location.csv"
)
# Output: directory that receives one subsampled .h5ad file per anatomical location.
dir_out = "../../data/HLCA_core_h5ads/deconvolution_basis/"

Read files:

In [4]:
# Load the HLCA object from the .h5ad file at `path_HLCA`.
adata = sc.read_h5ad(path_HLCA)

Anatomical location info per cell type (this table does not include hematopoietic stem cells (HSCs), but those are too rare to detect anyway):

In [5]:
# Cell type (index) x anatomical location (columns) table. Only the three
# location columns are kept, and their names are lowercased so they can be
# looked up with the lowercase location names used further down.
ct_to_an = (
    pd.read_csv(path_celltype_to_anatomical_location, index_col=0)[
        ["Nose", "Airway", "Parenchyma"]
    ]
    .rename(columns=str.lower)
)

Now subsample per cell type, once for every anatomical location:

In [6]:
# Target number of cells to sample per cell type, per anatomical location.
subsample_size = 1_000
# Cell types that end up with fewer cells than this are dropped after sampling.
min_n_cells = 100

In [7]:
# Maps each anatomical location to its subsampled AnnData object.
adatas_sub = {}

## Subsample per anatomical location:

In [8]:
# When True, the subsampling loop prints what it does for every cell type.
verbose = True

In [9]:
def _sample_up_to(cell_ids, n, rng):
    """Return at most `n` of `cell_ids`.

    Parameters
    ----------
    cell_ids : list
        Candidate cell identifiers (obs index values).
    n : int
        Maximum number of identifiers to return.
    rng : numpy.random.Generator
        Random generator used when a subsample has to be drawn.

    Returns
    -------
    list
        All of `cell_ids` when there are `n` or fewer, otherwise `n`
        identifiers drawn without replacement.
    """
    if len(cell_ids) <= n:
        return list(cell_ids)
    return list(rng.choice(cell_ids, size=n, replace=False))


# Seeded generator, created inside this cell so that re-running the cell (or
# the whole notebook) reproduces exactly the same subsample.
random_seed = 0
rng = np.random.default_rng(random_seed)

obs = adata.obs
# Cell types present anywhere in the atlas; computed once and used to catch
# cell type names from the location table that do not exist in the data.
all_cell_types = set(obs.manual_ann.unique())

for anatomical_location in obs.anatomical_region_level_1.unique():
    print(f"Working on samples from {anatomical_location}")
    cells_to_keep = list()
    # Boolean mask over all cells; only cell identifiers are needed here, so
    # no regional copy of the AnnData object is made.
    in_region = (obs.anatomical_region_level_1 == anatomical_location).to_numpy()
    # cell types marked with "x" for this location in the location table
    cts_to_keep = ct_to_an.loc[ct_to_an[anatomical_location] == "x", :].index.tolist()
    for ct in cts_to_keep:
        if ct not in all_cell_types:
            raise ValueError(
                f"{ct} not in anndata? Check this (might be a typo in the google doc). Exiting."
            )
        is_ct = (obs.manual_ann == ct).to_numpy()
        ct_cells_in_region = obs.index[in_region & is_ct].tolist()
        if verbose:
            if len(ct_cells_in_region) >= subsample_size:
                print(f"Subsampling for cell type {ct}.")
            elif len(ct_cells_in_region) > 0:
                print(
                    f"Not enough cells of {ct} in {anatomical_location}. Taking cells from the other anatomical regions."
                )
            else:
                print(
                    f"No cells from {ct} in this anatomical region. Taking all cells from another region."
                )
        # First take cells from the region itself (subsampled if there are
        # more than needed) ...
        ct_sample = _sample_up_to(ct_cells_in_region, subsample_size, rng)
        # ... then top up with cells of the same type from the other regions
        # if the region alone does not reach subsample_size.
        n_cells_to_add = subsample_size - len(ct_sample)
        if n_cells_to_add > 0:
            ct_cells_from_other_regions = obs.index[~in_region & is_ct].tolist()
            ct_sample += _sample_up_to(ct_cells_from_other_regions, n_cells_to_add, rng)
        cells_to_keep += ct_sample
    adatas_sub[anatomical_location] = adata[cells_to_keep, :].copy()

Working on samples from parenchyma
Subsampling for cell type AT0.
Subsampling for cell type AT1.
Subsampling for cell type AT2.
Not enough cells of AT2 proliferating in parenchyma. Taking cells from the other anatomical regions.
Subsampling for cell type Adventitial fibroblasts.
Subsampling for cell type Alveolar Mph CCL3+.
Subsampling for cell type Alveolar Mph MT-positive.
Subsampling for cell type Alveolar Mph proliferating.
Subsampling for cell type Alveolar fibroblasts.
Subsampling for cell type Alveolar macrophages.
Subsampling for cell type B cells.
Subsampling for cell type CD4 T cells.
Subsampling for cell type CD8 T cells.
Subsampling for cell type Classical monocytes.
Not enough cells of Club (non-nasal) in parenchyma. Taking cells from the other anatomical regions.
Not enough cells of DC1 in parenchyma. Taking cells from the other anatomical regions.
Subsampling for cell type DC2.
Not enough cells of Deuterosomal in parenchyma. Taking cells from the other anatomical regions

Kick out cell types with fewer than min_n_cells cells:

In [10]:
# Drop, per region, every cell type that has fewer than min_n_cells cells
# in the subsampled object.
for region, adata_region in adatas_sub.items():
    n_cells_per_ct = adata_region.obs.manual_ann.value_counts()
    cts_to_remove = n_cells_per_ct[n_cells_per_ct < min_n_cells].index.tolist()
    print(f"Removing cts {cts_to_remove} for region {region}.")
    # keep only cells whose annotation is not among the removed cell types
    keep_cell = ~adata_region.obs.manual_ann.isin(cts_to_remove)
    adatas_sub[region] = adata_region[keep_cell, :].copy()

Removing cts ['Hematopoietic stem cells', 'Lymphatic EC proliferating'] for region parenchyma.
Removing cts ['Hematopoietic stem cells', 'Lymphatic EC proliferating'] for region airway.
Removing cts ['Hematopoietic stem cells'] for region nose.


Store results:

In [11]:
# Make sure the output directory exists, so that writing does not depend on
# it having been created by hand beforehand.
os.makedirs(dir_out, exist_ok=True)
# Write one .h5ad file per anatomical location.
for region, adata_region in adatas_sub.items():
    adata_region.write(
        os.path.join(dir_out, f"HLCA_core_subsampled_for_{region}_deconvolution.h5ad")
    )