# GENERATION OF SEVERAL SUPPLEMENTARY TABLES

load modules:

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import sys
import os

sys.path.append("../scripts/")
import reference_based_harmonizing

In [2]:
%load_ext lab_black

set paths:

In [3]:
path_hlca_core = "../data/HLCA_core_h5ads/HLCA_v2.h5ad"
path_hlca_ext_emb = (
    "../data/HLCA_extended/HLCA_extended/HLCA_extended_scarches_emb.h5ad"
)
path_manual_anns_in = "../supporting_files/celltype_structure_and_colors/manual_anns_and_leveled_anns_ordered.csv"
dir_supp_tables_out = "../results/suppl_tables/"

load data:

In [4]:
core = sc.read_h5ad(path_hlca_core)

In [5]:
ext = sc.read_h5ad(path_hlca_ext_emb)

## DATASET TABLE: 

creates part of the dataset info from supplementary table

In [6]:
def get_set_as_str(x):
    """Function to extract string of the set of values from a dataframe column"""
    x_set = x.unique()
    x_set_str_list = [str(i) for i in x_set]
    x_set_str = ", ".join(x_set_str_list)
    return x_set_str

In [9]:
dataset_overview = ext.obs.groupby("study").agg(
    n_datasets=("dataset", "nunique"),
    dataset_names=("dataset", get_set_as_str),
    core_or_extension=("core_or_extension", get_set_as_str),
    lung_condition=("condition", get_set_as_str),
    n_subjects=("subject_ID", "nunique"),
    n_samples=("sample", "nunique"),
    n_cells=("study", "count"),
    cells_or_nuclei=("cells_or_nuclei", get_set_as_str),
    single_cell_platform=("single_cell_platform", get_set_as_str),
    tissue_sampling_type=("sample_type", get_set_as_str),
)

In [10]:
dataset_overview["in_core"] = [
    "core" in c_or_e for c_or_e in dataset_overview.core_or_extension
]

In [11]:
dataset_overview.sort_values(
    by=["in_core", "study"], ascending=[False, True], inplace=True
)

## SAMPLE TABLE

In [236]:
agg_vars = [
    "study",
    "dataset",
    "subject_ID",
    "age",
    "sex",
    "ethnicity",
    "mixed_ethnicity",
    "smoking_status",
    "BMI",
    "condition",
    "subject_type",
    "cause_of_death",
    "sample_type",
    "single_cell_platform",
    "3'_or_5'",
    "sequencing_platform",
    "cell_ranger_version",
    "fresh_or_frozen",
    "anatomical_region_level_1",
    "anatomical_region_level_2",
    "anatomical_region_level_3",
    "cells_or_nuclei",
]

sanity check: check if any sample has multiple values for any of these covariates (should not be the case):

In [237]:
agg_style = "nunique"
sample_unique_covs = core.obs.groupby("sample").agg(
    {var: agg_style for var in agg_vars}
)

This should be 0:

In [238]:
(sample_unique_covs > 1).sum().sum()

0

Now aggregate actual values per sample:

In [270]:
agg_style = "first"
core_cov_values = core.obs.groupby("sample").agg({var: agg_style for var in agg_vars})

for anatomical region, remove forward-propagated labels:

In [271]:
for lev in range(1, 4):
    cov = f"anatomical_region_level_{lev}"
    mapping = {loc: loc for loc in core_cov_values[cov].unique()}
    for key, value in mapping.items():
        if key[:2] in ["1_", "2_"]:
            mapping[key] = np.nan
    core_cov_values[cov] = core_cov_values[cov].map(mapping)

set condition "nan" to "healthy" (these are all healthy lungs unless specified otherwise):

In [272]:
core_cov_values.loc[core_cov_values.condition == "nan", "condition"] = "healthy"

In [273]:
core_cov_values.rename(columns={"condition": "lung_condition"}, inplace=True)

put "v" before cell ranger version, to prevent conversion to dates in excel:

In [274]:
cell_ranger_cleaner = {
    v: f"v.{v}" for v in core_cov_values.cell_ranger_version.unique()
}
cell_ranger_cleaner["nan"] = "nan"
core_cov_values.cell_ranger_version = core_cov_values.cell_ranger_version.map(
    cell_ranger_cleaner
)

In [275]:
core_cov_values.lung_condition.unique()

array(['had TB as a child (fully treated over 30+ years)', 'healthy',
       'worsening respiratory function prior to arrest',
       'non-small cell lung cancer', 'carcinoid'], dtype=object)

In [276]:
ext

AnnData object with n_obs × n_vars = 2232536 × 30
    obs: 'dataset', 'study', 'manual_ann', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'condition', 'single_cell_platform', 'disease', 'cells_or_nuclei', 'HLCA_or_query', 'subject_ID', 'sample', 'sample_type', 'age', 'sex', 'ethnicity', 'BMI', 'smoking_status', 'anatomical_region_level_1', 'anatomical_region_level_2', 'anatomical_region_level_3', 'original_celltype_ann', 'original_ann_level_1', 'original_ann_level_2', 'original_ann_level_3', 'original_ann_level_4', 'original_ann_level_5', 'original_ann_level_1_clean', 'original_ann_level_2_clean', 'original_ann_level_3_clean', 'original_ann_level_4_clean', 'original_ann_level_5_clean', 'manual_ann_grouped'
    uns: 'HLCA_or_query_colors', 'cells_or_nuclei_colors', 'condition_colors', 'dataset_colors', 'disease_colors', 'manual_ann_colors', 'neighbors', 'original_ann_level_1_colors', 'original_ann_level_3_clean_colors', 'single_cell_platform_colors', 'study

In [277]:
covs_ext = [
    "dataset",
    "study",
    "subject_ID",
    "condition",
    "sample_type",
    "age",
    "sex",
    "ethnicity",
    "BMI",
    "smoking_status",
    "anatomical_region_level_1",
    "anatomical_region_level_2",
    "anatomical_region_level_3",
    #     "single_cell_platform", excluding this as it needs to be cleaned up a lot
    "cells_or_nuclei",
    "HLCA_or_query",
]

In [278]:
ext_nunique_covs = ext.obs.groupby("sample").agg({cov: "nunique" for cov in covs_ext})

In [279]:
(ext_nunique_covs > 1).sum().sum()

0

In [280]:
ext_cov_values = ext.obs.groupby("sample").agg({cov: "first" for cov in covs_ext})

drop all HLCA core samples, as we already got more detailed metadata for those above:

In [281]:
ext_cov_values = ext_cov_values.loc[ext_cov_values.HLCA_or_query != "HLCA", :]

In [282]:
ext_cov_values

Unnamed: 0_level_0,dataset,study,subject_ID,condition,sample_type,age,sex,ethnicity,BMI,smoking_status,anatomical_region_level_1,anatomical_region_level_2,anatomical_region_level_3,cells_or_nuclei,HLCA_or_query
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1A1DFBng,Eils_2020,Eils_2020,1A1DFBng,Healthy,surgical_resection,47.0,F,,,active,,,,nuclei,Query
001C,Kaminski_2020,Kaminski_2020,001C,Control,donor_lung,,,,,,,,,cells,Query
002C,Kaminski_2020,Kaminski_2020,002C,Control,donor_lung,,,,,,,,,cells,Query
003C,Kaminski_2020,Kaminski_2020,003C,Control,donor_lung,,,,,,,,,cells,Query
8CO,Kaminski_2020,Kaminski_2020,8CO,COPD,lung_explant,,,,,,,,,cells,Query
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
scrBT1428,Thienpont_2018_10Xv2,Thienpont_2018,lambrechts_7,Squamous Cell Carcinoma,surgical_resection,,,,,,,,,cells,Query
scrBT1429,Thienpont_2018_10Xv2,Thienpont_2018,lambrechts_8,Pleiomorphic Carcinoma,surgical_resection,,,,,,,,,cells,Query
scrBT1430,Thienpont_2018_10Xv2,Thienpont_2018,lambrechts_8,Pleiomorphic Carcinoma,surgical_resection,,,,,,,,,cells,Query
scrBT1431,Thienpont_2018_10Xv2,Thienpont_2018,lambrechts_8,Pleiomorphic Carcinoma,surgical_resection,,,,,,,,,cells,Query


clean up several columns:

In [283]:
condition_cleaner = {con: con for con in ext_cov_values.condition.unique()}

In [284]:
condition_cleaner["healthy"] = "healthy"
condition_cleaner["Control"] = "healthy"

In [285]:
sex_cleaner = {sex: sex for sex in ext_cov_values.sex.unique()}
sex_cleaner["F"] = "female"
sex_cleaner["M"] = "male"
sex_cleaner["Female"] = "female"
sex_cleaner["Male"] = "male"

In [286]:
ethnicity_cleaner = {eth: eth for eth in ext_cov_values.ethnicity.unique()}
ethnicity_cleaner["Latina"] = "latino"
ethnicity_cleaner["None"] = "nan"

In [287]:
ext_cov_values.condition = ext_cov_values.condition.map(condition_cleaner)
ext_cov_values.sex = ext_cov_values.sex.map(sex_cleaner)
ext_cov_values.ethnicity = ext_cov_values.ethnicity.map(ethnicity_cleaner)

In [288]:
ext_cov_values.rename(
    columns={"HLCA_or_query": "HLCA_core_or_extension", "condition": "lung_condition"},
    inplace=True,
)
ext_cov_values["HLCA_core_or_extension"] = "extension"

Concatenate core and extended data:

In [289]:
core_cov_values["HLCA_core_or_extension"] = "core"

convert categoricals to lists:

In [290]:
ext_cov_values.index = ext_cov_values.index.tolist()
core_cov_values.index = core_cov_values.index.tolist()

In [291]:
sample_table = pd.concat((core_cov_values, ext_cov_values), axis=0, join="outer")

In [292]:
columns_reordered = ["HLCA_core_or_extension"] + [
    col for col in sample_table.columns if not col == "HLCA_core_or_extension"
]

In [293]:
sample_table = sample_table.loc[:, columns_reordered]

In [294]:
sample_table.index.name = "sample"

In [295]:
sample_table.sort_values(
    by=["HLCA_core_or_extension", "study", "dataset", "subject_ID", "sample"],
    inplace=True,
)

write:

In [297]:
sample_table.to_csv("../suppl_tables/SuppT_sample_overview.csv")

## Cell type reference mapping:

In [299]:
cell_reference_path = "../LCA_metadata/LCA_cell_type_reference_mapping_20211103.csv"

In [301]:
harmonizing_df = reference_based_harmonizing.load_harmonizing_table(cell_reference_path)
# consensus_df = reference_based_harmonizing.create_consensus_table(harmonizing_df)

remove columns after first "unnamed" column:

In [310]:
last_column = np.where(harmonizing_df.columns.str.startswith("Unnamed"))[0][0]

In [314]:
harmonizing_df = harmonizing_df.iloc[:, :last_column]

In [320]:
harmonizing_df.dropna(axis=0, how="all", inplace=True)
harmonizing_df.dropna(axis=1, how="all", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [322]:
harmonizing_df.to_csv("../suppl_tables/SuppT_celltype_ref_mapping.csv")

## Manual annotations, hierarchy:

In [68]:
manual_ann_hierarchy = pd.read_csv(path_manual_anns_in, index_col=0)

In [69]:
manual_ann_hierarchy.drop(columns=["ordering", "colors"], inplace=True)

In [70]:
manual_ann_hierarchy.columns = [
    f"Level {lev} annotation (% of cells)" for lev in range(1, 6)
]

In [71]:
ct_perc_per_lev = dict()
for lev in range(1, 6):
    ct_perc = round(core.obs[f"ann_level_{lev}"].value_counts() / core.n_obs * 100, 1)
    ct_perc_per_lev[lev] = ct_perc
ct_perc_per_lev["finest"] = round(
    core.obs.manual_ann.value_counts() / core.n_obs * 100, 1
)

In [72]:
ct_perc_per_lev[1]

Epithelial     48.2
Immune         39.2
Endothelial     8.2
Stroma          4.3
Name: ann_level_1, dtype: float64

In [73]:
new_index = list()
for ct in manual_ann_hierarchy.index:
    new_index.append(f"{ct} ({ct_perc_per_lev['finest'][ct]})")
manual_ann_hierarchy.index = new_index

In [74]:
for lev in range(1, 6):
    new_values_uncleaned = [
        f"{ct} ({ct_perc_per_lev[lev][ct]})"
        for ct in manual_ann_hierarchy[f"Level {lev} annotation (% of cells)"]
    ]
    new_values = list()
    for ct in new_values_uncleaned:
        if ct[:2] in ["1_", "2_", "3_", "4_"]:
            new_values.append(np.nan)
        else:
            new_values.append(ct)
    manual_ann_hierarchy[f"Level {lev} annotation (% of cells)"] = new_values

In [75]:
manual_ann_hierarchy.index.name = "Finest annotation (% of cells)"

In [76]:
manual_ann_hierarchy

Unnamed: 0_level_0,Level 1 annotation (% of cells),Level 2 annotation (% of cells),Level 3 annotation (% of cells),Level 4 annotation (% of cells),Level 5 annotation (% of cells)
Finest annotation (% of cells),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Basal resting (6.7),Epithelial (48.2),Airway epithelium (35.4),Basal (14.5),Basal resting (6.7),
Suprabasal (7.0),Epithelial (48.2),Airway epithelium (35.4),Basal (14.5),Suprabasal (7.0),
Hillock-like (0.8),Epithelial (48.2),Airway epithelium (35.4),Basal (14.5),Hillock-like (0.8),
Deuterosomal (0.2),Epithelial (48.2),Airway epithelium (35.4),Multiciliated lineage (7.0),Deuterosomal (0.2),
Multiciliated (nasal) (0.8),Epithelial (48.2),Airway epithelium (35.4),Multiciliated lineage (7.0),Multiciliated (6.9),Multiciliated (nasal) (0.8)
...,...,...,...,...,...
Monocyte-derived Mph (4.8),Immune (39.2),Myeloid (26.6),Macrophages (19.1),Interstitial macrophages (5.6),Monocyte-derived Mph (4.8)
Interstitial Mph perivascular (0.8),Immune (39.2),Myeloid (26.6),Macrophages (19.1),Interstitial macrophages (5.6),Interstitial Mph perivascular (0.8)
Classical monocytes (3.0),Immune (39.2),Myeloid (26.6),Monocytes (4.5),Classical monocytes (3.0),
Non-classical monocytes (1.5),Immune (39.2),Myeloid (26.6),Monocytes (4.5),Non-classical monocytes (1.5),


## Store tables:

In [12]:
dataset_overview.to_csv(
    os.path.join(dir_supp_tables_out, "SuppT_X_dataset_overview.csv")
)

In [77]:
manual_ann_hierarchy.to_csv(os.path.join(dir_supp_tables_out, "SuppT_X_manual_ann.csv"))