## Preparation of HLCA extension datasets for mapping to the HLCA core:

We'll use files as have been prepared in the notebooks in the HLA_extension_data_preprocessing folder as a basis for the HLCA extension. Here we perform final checks, as well as further cleaning and harmonization of metadata.

TO DO for revisions: complete info on single cell platform and sample types.

### Import modules, set paths:

In [1]:
import scanpy as sc
import os
import pandas as pd

for nice formatting of code (not necessary):

In [2]:
%load_ext lab_black

set paths:

In [1]:
dir_data = "../../data/HLCA_extended/extension_datasets/ready/subsetted/"
path_output_atlas_extension = "../../data/HLCA_extended/query_dataset_scarches_input.h5ad"

get file names to import:

In [3]:
files_to_import = sorted(os.listdir(dir_data))

### Read in all files and check if integers:

read in prepared anndata objects:

In [4]:
verbose = True
query_adatas = dict()
for dataset in files_to_import:
    dataset_name = dataset.split("_sub.h5ad")[0].split(".h5ad")[0]
    query_adatas[dataset_name] = sc.read(f"../query_datasets/ready/subsetted/{dataset}")
    if dataset_name == "lukassen":
        if verbose:
            print(
                "rounding counts for lukassen dataset, the counts had slight rounding errors leading to non-integer values"
            )
            print("pre:")
            print(query_adatas[dataset_name].X[:10, :10].toarray())
        query_adatas[dataset_name].X = round(query_adatas[dataset_name].X)
        if verbose:
            print("post:")
            print(query_adatas[dataset_name].X[:10, :10].toarray())
    if verbose and dataset_name != "lukassen":
        print(dataset_name, query_adatas[dataset_name].shape)
        print(query_adatas[dataset_name].X[:10, :10].toarray())

adams (307650, 2000)
[[2. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [5. 0. 0. 0. 0. 0. 2. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 4. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [3. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 8. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
barbry (100211, 2000)
[[ 0.  0.  0.  0.  0.  3.  0.  0.  0.  0.]
 [ 1.  2.  0.  2.  0. 22.  0.  0.  0.  2.]
 [ 0.  1.  1.  0.  0.  7.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0. 21.  0.  0.  0.  0.]
 [ 1.  2.  0.  1.  0. 57.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0. 49.  0.  2.  0.  0.]
 [ 1.  0.  0.  1.  0. 48.  0.  1.  0.  1.]
 [ 0.  0.  2.  0.  0. 62.  0.  0.  0.  1.]]
bharat (91980, 2000)
[[ 0.  0.  0.  1.  0.  0.  1.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  2.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  2.  0.  4.  1.  0.  0.]
 [

### Add, filter and clean metadata:

rename and filter adata.obs variables:

In [5]:
obs_to_rename = {
    "cond": "condition",
    "anatomical_region_fine": "anatomical_region_detailed",
    "orig_celltype_ann": "original_celltype_ann",
    "bmi": "BMI",
    "subject_id": "subject_ID",
}

In [6]:
obs_to_keep = [
    "sample",
    "condition",
    "study",
    "dataset",
    "original_celltype_ann",
    "sample",
    "age",
    "sex",
    "disease",
    "subject_ID",
    "sample_type",
    "ethnicity",
    "BMI",
    "smoking_status",
    "anatomical_region_coarse",
    "anatomical_region_detailed",
    "anatomical_region_level_1",
    "genome",
    "single_cell_platform",
]

In [7]:
verbose = False
for ds, ad in query_adatas.items():
    if verbose:
        print("dataset:", ds)
    obs_renamer = {obscol: obscol for obscol in ad.obs.columns}
    for old_name, new_name in obs_to_rename.items():
        obs_renamer[old_name] = new_name
    ad.obs.rename(columns=obs_renamer, inplace=True)
    # remove all columns that are not in our to-keep list:
    col_to_remove = [col for col in ad.obs.columns if col not in obs_to_keep]
    if verbose:
        print("columns to remove:", col_to_remove)
    ad.obs.drop(columns=col_to_remove, inplace=True)
    # remove all var columns:
    ad.var.drop(columns=ad.var.columns, inplace=True)
    # store:
    query_adatas[ds] = ad

add sample type info, single cell protocol info, and single-cell vs single nucleus info. We will later add the information to datasets that are mixed in these respects.

In [8]:
sample_type_info = {
    "adams": "MIXED",  # normal: donor lung, diseased: lung explant,
    "barbry": "MIXED_DONE",  # already annotated in object
    "bharat": "autopsy",
    "carraro_cff": "lung_explant",  # explant
    "carraro_csmc": "lung_explant",  # explant
    "carraro_ucla": "lung_explant",  # explant
    "delorey_cryo": "autopsy_cryopreserved",
    "delorey_fresh": "autopsy",
    "delorey_nuclei": "autopsy",
    "duong": "lung_explant",  # explant
    "grant_cryo": "balf_cryopreserved",
    "grant_fresh": "balf",
    "guo": "lung_explant",  # explant
    "haberman": "lung_explant",  # explant
    "lambrechts": "surgical_resection",
    "laughney": "surgical_resection",
    "liao": "balf",
    "lukassen": "surgical_resection",
    "mayr": "MIXED",  # healthy: surgical resection, ILD: lung explant
    "meyer": "donor_lung",
    "mould": "balf",
    "ordovasmontanes": "scraping",
    "reyfman_disease": "MIXED",  # explanted lung and cryobiopsy
    "schiller_discovair": "surgical_resection",
    "szabo": "donor_lung",
    "tsukui": "donor_lung",  # rejected donors
    "valenzi": "lung_explant",  # explant
    "wang_sub_batch1": "biopsy",
    "wang_sub_batch2": "biopsy",
    "wang_sub_batch3": "biopsy",
    "wang_sub_batch4": "biopsy",
    "wouters": "balf",
}

In [9]:
cell_vs_nucleus = {
    "adams": "cells",
    "barbry": "cells",
    "bharat": "cells",
    "carraro_cff": "cells",
    "carraro_csmc": "cells",
    "carraro_ucla": "cells",
    "delorey_cryo": "cells",
    "delorey_fresh": "cells",
    "delorey_nuclei": "nuclei",
    "duong": "nuclei",
    "grant_cryo": "cells",
    "grant_fresh": "cells",
    "guo": "cells",
    "haberman": "cells",
    "lambrechts": "cells",
    "laughney": "cells",
    "liao": "cells",
    "lukassen": "nuclei",
    "mayr": "cells",
    "meyer": "cells",
    "mould": "cells",
    "ordovasmontanes": "cells",
    "reyfman_disease": "cells",
    "schiller_discovair": "cells",
    "szabo": "cells",
    "tsukui": "cells",
    "valenzi": "cells",
    "wang_sub_batch1": "nuclei",
    "wang_sub_batch2": "nuclei",
    "wang_sub_batch3": "nuclei",
    "wang_sub_batch4": "nuclei",
    "wouters": "cells",
}

TO DO for revisions: complete this information (see dataset supplementary table)

In [10]:
protocols = {
    "adams": "10X v2",
    "barbry": "10X v3 3'",
    "bharat": "10X v3",
    "carraro_cff": "10X",
    "carraro_csmc": "10X",
    "carraro_ucla": "Drop-Seq",
    "delorey_cryo": "10X v1.1 5'",
    "delorey_fresh": "10X v1.1 5'",
    "delorey_nuclei": "10X v3 3'",
    "duong": "10X v3",
    "grant_cryo": "10X v1 5'",
    "grant_fresh": "10X v1 5'",
    "guo": "10X v2",
    "haberman": "10X",
    "lambrechts": "10X",  # mixed
    "laughney": "10X v2",
    "liao": "10X",  # mixed v2 and v3
    "lukassen": "10X v2",
    "mayr": "Drop-Seq",
    "meyer": "MIXED_DONE",
    "mould": "10X v2",
    "ordovasmontanes": "Seq-Well",
    "reyfman_disease": "10X v2 3'",
    "schiller_discovair": "10X v3 3'",
    "szabo": "10X 3'",
    "tsukui": "10X v2 3'",
    "valenzi": "10X v2",
    "wang_sub_batch1": "10X v3",
    "wang_sub_batch2": "10X v3",
    "wang_sub_batch3": "10X v3",
    "wang_sub_batch4": "10X v3",
    "wouters": "10X 5'",
}

In [11]:
for ds_name, ds in query_adatas.items():
    sample_type = sample_type_info[ds_name]
    cells_or_nuclei = cell_vs_nucleus[ds_name]
    protocol = protocols[ds_name]
    if not sample_type.startswith("MIXED"):
        ds.obs["sample_type"] = sample_type
    else:
        print("Add sample type info later for", ds_name)
    if not cells_or_nuclei == "unknown":
        ds.obs["cells_or_nuclei"] = cells_or_nuclei
    else:
        print("cells or nuclei unknown for", ds_name)
        ds.obs["cells_or_nuclei"] = "unknown"
    if protocol == "unknown":
        print("single cell protocol unknown for", ds_name)
        ds.obs["single_cell_platform"] = "unknown"
    elif protocol.startswith("MIXED"):
        print("mixed single cell for", ds_name)
    else:
        ds.obs["single_cell_platform"] = protocol

    query_adatas[ds_name] = ds

Add sample type info later for adams
Add sample type info later for barbry
cells or nuclei unkown for barbry
single cell protocol unkown for barbry
Add sample type info later for mayr
mixed single cell for meyer
Add sample type info later for reyfman_disease


Add dataset specific info/corrections:

Cancer type for lambrechts data:

In [12]:
lambrechts_subject_to_condition = {
    "lambrechts_1": "Squamous Cell Carcinoma",
    "lambrechts_2": "Squamous Cell Carcinoma",
    "lambrechts_3": "Adenocarcinoma",
    "lambrechts_4": "Adenocarcinoma",
    "lambrechts_5": "Large Cell Carcinoma",
    "lambrechts_6": "Adenocarcinoma",
    "lambrechts_7": "Squamous Cell Carcinoma",
    "lambrechts_8": "Pleiomorphic Carcinoma",
}

In [13]:
verbose = False
for ds_name, ds in query_adatas.items():
    if verbose:
        print(ds_name)
    # add sample type info to studies with mixed sample types:
    if ds_name == "adams":
        adams_cond_to_st = {
            "Control": "donor_lung",
            "IPF": "lung_explant",
            "COPD": "lung_explant",
        }
        ds.obs["sample_type"] = ds.obs.condition.map(adams_cond_to_st)
    if ds_name == "mayr":
        mayr_cond_to_st = {
            "control donor": "surgical_resection",
            "endstage lung fibrosis": "lung_explant",
        }
        ds.obs["sample_type"] = ds.obs.condition.map(mayr_cond_to_st)
    if ds_name == "reyfman_disease":
        reyfman_cond_to_st = {
            cond: "lung_explant" for cond in ds.obs.condition.unique()
        }
        reyfman_cond_to_st["Cryobiopsy"] = "biopsy_cryopreserved"
        ds.obs["sample_type"] = ds.obs.condition.map(reyfman_cond_to_st)
    # remove duplicated cells in mould/janssen dataset (preprocessing mistake)
    if ds_name == "mould":
        ds = ds[~pd.isnull(ds.obs.subject_ID), :].copy()
    # add dataset and study info for duong
    if ds_name == "duong":
        ds.obs["dataset"] = "Duong_lungMAP_unpubl"  # TO DO WAIT FOR MALTE'S RESPONSE
        ds.obs["study"] = "Duong_lungMAP_unpubl"
    if ds_name == "meyer":
        ds.obs["condition"] = "Healthy"  # healthy lung from donors
    # remove duplicated condition column for tsukui data
    if ds_name == "tsukui":
        ds.obs = ds.obs.loc[:, ~ds.obs.columns.duplicated()].copy()
    # correct condition for guo (now only set to "disease"):
    if ds_name == "guo":
        ds.obs["condition"] = "Lymphangioleiomyomatosis"
    # add condition healthy to wang datasets:
    if ds_name.startswith("wang"):
        ds.obs["condition"] = "Healthy"
    # add cancer type info for lambrechts data:
    if ds_name == "lambrechts":
        ds.obs["condition"] = ds.obs.subject_ID.map(lambrechts_subject_to_condition)
    # make sample names unique (not just numbers) for the following datasets:
    if ds_name in [
        "barbry",
        "duong",
        "grant_cryo",
        "grant_fresh",
        "laughney",
        "reyfman",
    ]:
        study = ds.obs.study.unique()[0]
        # check if not already corrected:
        if not str(ds.obs["sample"][0]).startswith(study):
            ds.obs["sample"] = [
                f"{study}_sample_{s_number}" for s_number in ds.obs["sample"]
            ]
    # make subject names unique (not just numbers) for the following datasets:
    if ds_name in ["ordovasmontanes", "grant_cryo", "grant_fresh"]:
        study = ds.obs.study.unique()[0]
        if not str(ds.obs["subject_ID"][0]).startswith(study):
            ds.obs["subject_ID"] = [
                f"{study}_subject_{s_number}" for s_number in ds.obs["subject_ID"]
            ]
    # remove spaces from subject names:
    ds.obs["subject_ID"] = [sid.replace(" ", "_") for sid in ds.obs["subject_ID"]]

    # STORE RESULT:
    query_adatas[ds_name] = ds

### Merge all datasets into one object:

merge datasets into one object:

In [14]:
adata = sc.AnnData.concatenate(
    *query_adatas.values(),
    join="outer",
    batch_key=None,
    batch_categories=list(query_adatas.keys()),
    index_unique="_"
)
print(adata.shape)

(1647652, 2000)


In [15]:
adata

AnnData object with n_obs × n_vars = 1647652 × 2000
    obs: 'dataset', 'study', 'original_celltype_ann', 'condition', 'subject_ID', 'sample', 'cells_or_nuclei', 'single_cell_platform', 'sample_type', 'age', 'sex', 'ethnicity', 'BMI', 'smoking_status', 'anatomical_region_level_1', 'anatomical_region_coarse', 'anatomical_region_detailed', 'genome', 'disease'

### Further metadata cleaning:

renaming of conditions, studies, datasets, sex, ethnicity:

In [16]:
condition_renaming = {
    "CO": "Control", # should I change this to healthy? 
    "healthy": "Healthy",
    "TUMOR": "Lung adenocarcinoma",
    "NOR": "Tumor adjacent normal",
    "control": "Control",
    "moderate COVID-19": "COVID-19 moderate",
    "severe COVID-19": "COVID-19 severe",
    "smoking": "Healthy",
    "EAA": "HP",
    "control donor": "Control",
    "endstage lung fibrosis": "Endstage lung fibrosis",
    "chronic rhinitis": "Chronic rhinitis",
    "respiratory system disease": "Respiratory system disease",
    "SSc-ILD": "Systematic sclerosis-associated ILD",
    "SSc": "Systematic sclerosis-associated ILD",
    "Myositis-ILD": "Myositis-associated ILD",
    "Cryobiopsy": "IPF",  # this is a single sample from Reyfman et al.
    "stimulated": "anti-CD3 anti-CD28 antibody stimulated",
    "SCD": "Scleroderma",
    "NML": "Healthy",
    "non-COVID pneumonia": "Pneumonia non-COVID",
}  # IPF, Control, COPD, CF, ILD, Sarcoidosis, NSIP, cHP

In [17]:
study_renamer = {
    "BarbyUnpublished": "Barbry_unpubl",
    "Kaminski2020": "Kaminski_2020",
    "Budinger2021": "Budinger_2020",
    "Gomperts2021": "Gomperts_2021",
    "regev2021": "Regev_2021",
    "Wunderink2021": "Wunderink_2021",
    "Guo2020": "Xu_2020",
    "Banovich2020": "Banovich_Kropski_2020",
    "PeerMassague2020": "Peer_Massague_2020",
    "Zhang2021": "Zhang_2021",
    "Eils2020": "Eils_2020",
    "Schiller2020": "Schiller_2020",
    "Janssen2020": "Janssen_2020",
    "Shalek2018": "Shalek_2018",
    "MisharinBudinger2019": "Misharin_Budinger_2018",
    "Schiller2021": "Schiller_2021",
    "Sims2019": "Sims_2019",
    "Sheppard2020": "Sheppard_2020",
    "Lafyatis2019": "Lafyatis_2019",
    "Sun2020": "Sun_2020",
    "Lambrechts2021": "Lambrechts_2021",
}

In [18]:
dataset_renamer = {
    "BarbyUnpublished": "Barbry_unpubl",
    "Kaminski2020": "Kaminski_2020",
    "Budinger2021": "Budinger_2020",
    "Gomperts2021_CFF": "Gomperts_2021_CFF",
    "Gomperts2021_CSMC": "Gomperts_2021_CSMC",
    "Wunderink2021_cryo": "Wunderink_2021_cryo",
    "Wunderink2021_fresh": "Wunderink_2021_fresh",
    "Guo2020_LAM1/3": "Xu_2020_LAM1_3",
    "Banovich2020": "Banovich_Kropski_2020",
    "PeerMassague2020": "Peer_Massague_2020",
    "Zhang2021": "Zhang_2021",
    "Eils2020": "Eils_2020",
    "Schiller2020": "Schiller_2020",
    "Janssen2020": "Janssen_2020",
    "Shalek2018": "Shalek_2018",
    "MisharinBudinger2019_disease": "Misharin_Budinger_2018",
    "Schiller2021": "Schiller_2021",
    "Sims2019": "Sims_2019",
    "Sheppard2020": "Sheppard_2020",
    "Lafyatis2019": "Lafyatis_2019",
    "Sun2020_batch1": "Sun_2020_batch1",
    "Sun2020_batch2": "Sun_2020_batch2",
    "Sun2020_batch3": "Sun_2020_batch3",
    "Sun2020_batch4": "Sun_2020_batch4",
    "Lambrechts2021": "Lambrechts_2021",
    "KULeuven_Thienpont_2018Lambrechts_v1" = "Thienpont_2018_10Xv1",
    "KULeuven_Thienpont_2018Lambrechts_v2" = "Thienpont_2018_10Xv2"
    
}

In [None]:
sex_renamer = {
    'F': 'female',
    'M': 'male',
    'male': 'male',
    'Female': 'female',
    'Male': 'male',
    'female': 'female'
}

In [None]:
ethn_renamer = {
    "Latina":"latino",
    "None":np.nan,
    "nan":np.nan
}

In [None]:
age_renamer = {"37yo": "37", "nan": np.nan}

Now include all values that stay the same during the remapping:

In [19]:
condition_renaming_full = {cond: cond for cond in adata.obs.condition.unique()}
for cond_old, cond_new in condition_renaming.items():
    condition_renaming_full[cond_old] = cond_new
study_renaming_full = {st: st for st in adata.obs.study.unique()}
for st_old, st_new in study_renamer.items():
    study_renaming_full[st_old] = st_new
dataset_renaming_full = {ds: ds for ds in adata.obs.dataset.unique()}
for ds_old, ds_new in dataset_renamer.items():
    dataset_renaming_full[ds_old] = ds_new
sex_renaming_full = {sex:sex for sex in adata.obs.sex.unique()}
for sex_old, sex_new in sex_renamer.items():
    sex_renaming_full[sex_old] = sex_new
ethn_renaming_full = {ethn:ethn for ethn in adata.obs.ethnicity.unique()}
for ethn_old, ethn_new in ethn_renamer.items():
    ethn_renaming_full[ethn_old] = ethn_new
age_renaming_full = {age:age for age in adata.obs.age.unique()}
for age_old, age_new in age_renamer.items():
    age_renaming_full[age_old] = age_new

now map annotations:

In [20]:
adata.obs["condition"] = adata.obs.condition.map(condition_renaming_full)
adata.obs["study"] = adata.obs.study.map(study_renaming_full)
adata.obs["dataset"] = adata.obs.dataset.map(dataset_renaming_full)
adata.obs["sex"] = adata.obs.sex.map(sex_renaming_full)
adata.obs["ethnicity"] = adata.obs.ethnicity.map(ethn_renaming_full)
adata.obs["age"] = adata.obs.age.map(age_renaming_full)


convert age to float:

In [None]:
adata.obs.age = [np.float(age) for age in adata.obs.age]

### Sanity checks:

check if samples are unique to one subject, dataset and study:

In [21]:
data_per_sample = adata.obs.groupby("sample").agg(
    {"dataset": "nunique", "subject_ID": "nunique", "study": "nunique"}
)

this should all be 0:

In [22]:
(data_per_sample > 1).sum(axis=0)

dataset       0
subject_ID    0
study         0
dtype: int64

same for subjects:

In [23]:
data_per_subject = adata.obs.groupby("subject_ID").agg(
    {"dataset": "nunique", "study": "nunique"}
)

3 subjects are present in two regev datasets from the same study, so this should be 3 for dataset and 0 for study

In [24]:
(data_per_subject > 1).sum(axis=0)

dataset    3
study      0
dtype: int64

check if all obs are fully annotated:

In [25]:
for col in adata.obs.columns:
    print(col)
    print("Fully annotated?", ~pd.isnull(adata.obs[col]).any())
    print("categories:", adata.obs[col].unique())
    print("\n\n")

dataset
Fully annotated? True
categories: ['Kaminski_2020' 'Barbry_unpubl' 'Budinger_2021' 'Gomperts_2021_CFF'
 'Gomperts_2021_CSMC' 'Gomperts2021_UCLA' 'Regev_2021_Cryo'
 'Regev_2021_Fresh' 'Regev_2021_Nuclei' 'Duong_lungMAP_unpubl'
 'Wunderink_2021_cryo' 'Wunderink_2021_fresh' 'Guo_2020_LAM1_3'
 'Banovich_Kropski_2020' 'KULeuven_Thienpont_2018Lambrechts_v1'
 'KULeuven_Thienpont_2018Lambrechts_v2' 'Peer_Massague_2020' 'Zhang_2021'
 'Eils_2020' 'Schiller_2020' 'Meyer_2021_5prime' 'Meyer_2021_3prime'
 'Janssen_2020' 'Shalek_2018' 'Misharin_Budinger_2018' 'Schiller_2021'
 'Sims_2019' 'Sheppard_2020' 'Lafyatis_Rojas_2019_disease'
 'Sun_2020_batch1' 'Sun_2020_batch2' 'Sun_2020_batch3' 'Sun_2020_batch4'
 'Lambrechts_2021']



study
Fully annotated? True
categories: ['Kaminski_2020' 'Barbry_unpubl' 'Budinger_2021' 'Gomperts_2021'
 'Regev_2021' 'Duong_lungMAP_unpubl' 'Wunderink_2021' 'Guo_2020'
 'Banovich_Kropski_2020' 'Thienpont_2018' 'Peer_Massague_2020'
 'Zhang_2021' 'Eils_2020' 'Schiller_

## Store result:

In [29]:
adata.write(path_output_atlas_extension)