# Data import for Meyer_2021 dataset:

In [1]:
import scanpy as sc
import sys
import pandas as pd
sys.path.append("../../../scripts/")
import LCA_file_reading 
import reference_based_harmonizing
import preprocessing

In [2]:
# Root directory that holds the LCA core datasets.
data_dir = ".." # set to storage dir of LCA core datasets
# Build the dataset folder with an explicit separator, so the path is valid
# whether or not data_dir ends in "/" (plain concatenation turned ".." into
# "..Meyer_2021/"). data_dir is expected to be a non-empty directory path.
project_dir = f"{data_dir.rstrip('/')}/Meyer_2021/"

In [3]:
# Read the Meyer_2021 data found under project_dir, using the project's
# LCA_file_reading helper; the result is shown as an AnnData object below.
adata = LCA_file_reading.read_file_Meyer_2021(project_dir)

  if (await self.run_code(code, result,  async_=asy)):


cells from samples found in metadata: 129340
of those, n cells found in anndata (count matrices): 128628
Note that some cells can be lost here due to different filtering settings during re-alignment.


In [4]:
# Show the object summary (dimensions and the obs/var columns present).
adata

AnnData object with n_obs × n_vars = 128628 × 33694
    obs: 'sample', 'original_celltype_ann', 'study_long', 'study', 'last_author_PI'
    var: 'gene_symbols'

add cell type reference annotations:

In [5]:
# Load the harmonization table and derive the consensus table from it.
harmonizing_table_path = "../../../supporting_files/metadata_harmonization/HLCA_cell_type_reference_mapping_20211103.csv"
harmonizing_df = reference_based_harmonizing.load_harmonizing_table(harmonizing_table_path)
consensus_df = reference_based_harmonizing.create_consensus_table(harmonizing_df)

# Build the translation from the original annotations to the consensus ones,
# then apply it to the AnnData object.
celltype_translation_df = reference_based_harmonizing.create_orig_ann_to_consensus_translation_df(
    adata, consensus_df, harmonizing_df, verbose=False
)
adata = reference_based_harmonizing.consensus_annotate_anndata(
    adata, celltype_translation_df, verbose=True
)

# Drop every cell whose level-1 annotation is "Unicorns and artifacts".
not_artifact = adata.obs["ann_level_1"] != 'Unicorns and artifacts'
adata = adata[not_artifact, :].copy()

# Add the "clean" annotation, i.e. the one without forward-propagated labels.
adata = reference_based_harmonizing.add_clean_annotation(adata)

In [6]:
# Dimensions (cells x genes) after removing the artifact-labelled cells.
adata.shape

(128628, 33694)

add sample metadata:

In [8]:
# Load the combined sample annotation table via the project's preprocessing helper.
# NOTE(review): "../" is presumably the directory holding the LCA_metadata_*.csv
# files listed in the output — confirm against the helper.
metadata = preprocessing.get_sample_annotation_table_LCA("../")

LCA_metadata_Banovich_Kropski.csv
LCA_metadata_Seibold.csv
LCA_metadata_Krasnow.csv
LCA_metadata_Meyer.csv
LCA_metadata_Barbry.csv
LCA_metadata_Lafyatis.csv
LCA_metadata_Nawijn.csv
LCA_metadata_Misharin.csv
number of rows without rowname/sample name (will be removed): 14
Sample IDs unique? False
Number of samples without donor ID: 0


In [9]:
# subset to samples from this dataset:
# (adata.obs['sample'] holds library IDs at this point, hence the match on library_ID)
library_ids_in_adata = adata.obs['sample'].unique()
# .copy() makes the subset an independent frame, so dropping columns below
# does not operate on a slice of the full annotation table.
metadata = metadata[metadata.library_ID.isin(library_ids_in_adata)].copy()
# check if each sample has a row:
print("Number of samples equal to number of rows in metadata?", metadata.shape[0] == len(library_ids_in_adata))
# remove columns not of interest:
metadata_columns_to_drop = [
    "IF_AVAILABLE/_APPLICABLE_-->",
    "Institute",
    "Study_PI",
    "publication_ID",
    "repository_ID",
    "library-construction_batch",
    "year_of_sample_collection",
    "relative_sample_collection_timepoint",
    "treatment_status",
    "number_of_cells_loaded",
]
# Reassign instead of mutating in place; a missing column still raises a KeyError.
metadata = metadata.drop(columns=metadata_columns_to_drop)

Number of samples equal to number of rows in metadata? True


In [10]:
# check if there are any samples that have a known lung disease but no condition listed
# (vectorized, so it also works when the metadata index contains duplicate labels;
# a scalar .loc lookup would return a Series there and break the `or` test)
diseased_conditions = metadata.loc[metadata.known_lung_disease == "yes", "condition"]
condition_missing = diseased_conditions.isnull() | (diseased_conditions == "nan")
for row, matching_condition in diseased_conditions[condition_missing].items():
    print(row, matching_condition)
# check which lung conditions are in the data:
# (the string "nan" counts as missing here too, consistent with the check above)
condition_listed = metadata.condition.notnull() & (metadata.condition != "nan")
lung_conditions = list(metadata.condition[condition_listed].unique())
if len(lung_conditions) == 0:
    print("No lung conditions for these subjects.")
else:
    # show what was found, so the check is informative in both cases
    print("Lung conditions in the data:", lung_conditions)

No lung conditions for these subjects.


In [11]:
# The values in adata.obs['sample'] are really library IDs, so keep them
# under that name for the lookups below.
adata.obs['library_ID'] = adata.obs['sample']
metadata_library_ids = metadata["library_ID"]

# Replace the library IDs in 'sample' with the sample names (the metadata index).
library_ID_to_sample_dict = {
    library_id: sample_name
    for library_id, sample_name in zip(metadata_library_ids, metadata.index)
}
adata.obs['sample'] = adata.obs["library_ID"].map(library_ID_to_sample_dict)

# Copy each metadata column onto the cells, looked up by the cell's library ID.
for cat in metadata.columns:
    sample_to_cat_dict = {
        library_id: value
        for library_id, value in zip(metadata_library_ids, metadata[cat])
    }
    adata.obs[cat] = adata.obs["library_ID"].map(sample_to_cat_dict)

# The library ID column is no longer needed.
del adata.obs['library_ID']

Label each cell as belonging to one of two datasets (5' or 3' data), stored in `adata.obs['dataset']`:

In [12]:
# Label every cell "<study>_<c>prime", where <c> is the first character of the
# cell's "3'_or_5'" entry.
studies = adata.obs.study
prime_ends = adata.obs["3'_or_5'"]
dataset_labels = []
for study, prime in zip(studies, prime_ends):
    dataset_labels.append(f"{study}_{prime[0]}prime")
adata.obs['dataset'] = dataset_labels

store result:

In [14]:
# Save the annotated object (cell type annotations plus sample metadata) as .h5ad.
adata.write("../LCA_h5ads/Meyer_2021_raw.h5ad")

... storing 'original_celltype_ann' as categorical
... storing 'study_long' as categorical
... storing 'study' as categorical
... storing 'last_author_PI' as categorical
... storing 'ann_level_1' as categorical
... storing 'ann_level_2' as categorical
... storing 'ann_level_3' as categorical
... storing 'ann_level_4' as categorical
... storing 'ann_level_5' as categorical
... storing 'ann_level_1_clean' as categorical
... storing 'ann_level_2_clean' as categorical
... storing 'ann_level_3_clean' as categorical
... storing 'ann_level_4_clean' as categorical
... storing 'ann_level_5_clean' as categorical
... storing 'subject_ID' as categorical
... storing 'subject_ID_as_published' as categorical
... storing 'pre_or_postnatal' as categorical
... storing 'sex' as categorical
... storing 'ethnicity' as categorical
... storing 'smoking_status' as categorical
... storing 'smoking_history' as categorical
... storing 'known_lung_disease' as categorical
... storing 'subject_type' as categorical


#### Subset to the 2000 highly variable genes (HVGs) used for integration with HLCA:

In [19]:
# Read the table of genes used for mapping; the first CSV column becomes the index.
genes_to_keep = pd.read_csv("../query_datasets/genes_for_mapping.csv",index_col=0)

Subset based on Ensembl IDs:

In [34]:
# Select the genes named in the genes_to_keep index, in that order;
# .copy() materializes the selection as its own object.
adata_subset = adata[:,genes_to_keep.index].copy()

store ensembl ids in an adata.var column:

In [35]:
# Preserve the current var index values in a 'gene_ids' column before the
# index is replaced by gene symbols.
adata_subset.var['gene_ids'] = adata_subset.var.index.tolist()

check if gene symbols also match with genes to keep (all 2000 should match):

In [36]:
# Count how many gene symbols of the subset occur in genes_to_keep.gene_symbols.
# NOTE(review): isin tests membership only; it does not verify that each symbol
# is paired with the same ID in both tables.
adata_subset.var.gene_symbols.isin(genes_to_keep.gene_symbols).sum()

2000

set index to gene symbols:

In [37]:
# Use the gene symbols as the var index.
adata_subset.var.index = adata_subset.var.gene_symbols

In [42]:
# Remove the name attached to the var index.
adata_subset.var.index.name = None

In [43]:
# The gene symbols are now the index, so the separate column is redundant.
adata_subset.var.drop(columns=["gene_symbols"], inplace=True)

In [44]:
# Preview the resulting var table.
adata_subset.var.head()

Unnamed: 0,gene_ids
FGR,ENSG00000000938
CFH,ENSG00000000971
HS3ST1,ENSG00000002587
TMEM176A,ENSG00000002933
TFPI,ENSG00000003436


Store:

In [48]:
# Save the gene-subsetted object as .h5ad.
adata_subset.write("../../../data/HLCA_extended/extension_datasets/ready/subsetted/meyer_sub.h5ad")