# Data import of the Eils_2020 (Lukassen) and Sims_2019 (Szabo) datasets

In [1]:
import sfaira

import numpy as np
import scanpy as sc
import anndata
import pandas as pd
import matplotlib.pyplot as plt
import collections
import glob
from scipy import sparse



Ontology <class 'sfaira.versions.metadata.base.OntologyMondo'> is not a DAG, treat child-parent reasoning with care.
Ontology <class 'sfaira.versions.metadata.base.OntologyUberon'> is not a DAG, treat child-parent reasoning with care.


In [2]:
def subset_and_pad_adata(gene_set, adata):
    """
    Subset an AnnData object to a fixed gene list, zero-padding genes it lacks.

    Parameters
    ----------
    gene_set : pandas.DataFrame
        Gene table with gene symbols in column `gene_symbols` and the matching
        Ensembl IDs in column `Unnamed: 0`. The frame is left unmodified.
    adata : anndata.AnnData
        Object whose `var_names` are gene symbols.

    Returns
    -------
    anndata.AnnData or None
        A copy of `adata` restricted to the genes of `gene_set` that occur in
        `adata.var_names` (in `gene_set` order), followed by one all-zero
        column per gene that was not found (also in `gene_set` order). The
        Ensembl IDs are stored in `.var['ensembl']`. `None` is returned, after
        printing a warning, when no gene of `gene_set` is found at all.

    Raises
    ------
    ValueError
        If `gene_set` lacks the `gene_symbols` or the `Unnamed: 0` column.
    """
    # Example inputs:
    # genes_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/genes_for_mapping.csv'
    # data_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/ready/adams.h5ad'
    # gene_set = pd.read_csv(genes_filename)
    # adata = sc.read(data_filename)

    # Validate both columns up front: `Unnamed: 0` is read unconditionally below,
    # so a missing column should fail here with the documented error.
    if 'gene_symbols' not in gene_set.columns or 'Unnamed: 0' not in gene_set.columns:
        raise ValueError('The input gene list was not of the expected type!\n'
                         'Gene symbols and ensembl IDs are expected in column names:\n'
                         '\t`gene_symbols` and `Unnamed: 0`')

    # Index a new frame by gene symbol so the caller's dataframe keeps its own index
    gene_table = gene_set.set_index('gene_symbols', drop=False)

    # Subset adata object
    common_genes = [gene for gene in gene_table['gene_symbols'].values if gene in adata.var_names]
    if len(common_genes) == 0:
        print("WARNING: YOU SHOULD PROBABLY SWITCH YOUR ADATA.VAR INDEX COLUMN TO GENE NAMES"
                  " RATHER THAN IDS! No genes were recovered.")
        return None

    adata_sub = adata[:, common_genes].copy()

    # Pad object with 0 genes if needed
    if len(common_genes) < len(gene_table):
        diff = len(gene_table) - len(common_genes)
        print(f'not all genes were recovered, filling in 0 counts for {diff} missing genes...')

        # Genes to pad with: a boolean mask keeps them in gene-list order and avoids
        # indexing `.loc` with a set (unordered, and rejected by recent pandas).
        missing_mask = ~gene_table.index.isin(common_genes)
        # Explicit copy so the column edits below never touch `gene_table`
        new_var = gene_table.loc[missing_mask].copy()

        # The unnamed column holds the ensembl values; move it to `ensembl`
        new_var['ensembl'] = new_var.pop('Unnamed: 0')

        df_padding = pd.DataFrame(
            data=np.zeros((adata_sub.shape[0], len(new_var))),
            index=adata_sub.obs_names,
            columns=new_var.index,
        )
        adata_padding = sc.AnnData(df_padding, var=new_var)

        # Concatenate object
        adata_sub = anndata.concat([adata_sub, adata_padding], axis=1, join='outer', index_unique=None, merge='unique')

    # Ensure ensembl IDs are available (assignment aligns on the gene-symbol index)
    adata_sub.var['ensembl'] = gene_table['Unnamed: 0']

    return adata_sub

In [3]:
# Load the gene list used for subsetting. As the displayed table shows, column
# `Unnamed: 0` holds Ensembl IDs and `gene_symbols` the matching gene symbols.
gene_set = pd.read_csv("genes_for_mapping.csv")
gene_set

Unnamed: 0.1,Unnamed: 0,gene_symbols
0,ENSG00000000938,FGR
1,ENSG00000000971,CFH
2,ENSG00000002587,HS3ST1
3,ENSG00000002933,TMEM176A
4,ENSG00000003436,TFPI
...,...,...
1995,ENSG00000280721,AC133644.2
1996,ENSG00000281103,TRG-AS1
1997,ENSG00000282122,CH17-262H11.1
1998,ENSG00000282988,RP1-34B20.21


# Lukassen

In [31]:
# Point sfaira at the local data repository (raw / meta / cache folders), keep
# only the datasets registered under the DOI below, and load them.
sfaira_repo = "../../../../../../datasets/projects/20200101_Various_SfairaDataRepository_leander.dony/"
univ = sfaira.data.Universe(
    data_path=sfaira_repo + "raw/",
    meta_path=sfaira_repo + "meta/",
    cache_path=sfaira_repo + "cache/",
)
univ.subset("doi", "10.1101/2020.03.13.991455")
univ.load()

loading human_lung_2020_10xtechnology_lukassen_001_10.15252/embj.20105114
loading human_lung_2020_10xtechnology_lukassen_002_10.15252/embj.20105114


In [32]:
# Work on a copy of the dataset at univ.ids[0]. The load log above lists two
# datasets (lukassen_001 and lukassen_002); only this one is used below.
# NOTE(review): confirm that leaving out the second dataset is intentional.
lukassen = univ.datasets[univ.ids[0]].adata.copy()

In [33]:
# Inspect the per-cell metadata as loaded, before any renaming.
lukassen.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,sex,age,smoking,packyears,percent.mito,CellType
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
101CV0ng_AAACCTGAGGTTCCTA-1,101CV0ng,2866.0,1448,F,75.0,NonSmoking,0,0.001396,AT2
101CV0ng_AAACCTGCAAGGCTCC-1,101CV0ng,4128.0,1909,F,75.0,NonSmoking,0,0.007752,AT2
101CV0ng_AAACCTGCACGAAATA-1,101CV0ng,2438.0,1634,F,75.0,NonSmoking,0,0.001231,Immuno_Monocytes
101CV0ng_AAACCTGCAGCTATTG-1,101CV0ng,3589.0,1821,F,75.0,NonSmoking,0,0.002508,AT2
101CV0ng_AAACCTGGTGATGCCC-1,101CV0ng,2556.0,1656,F,75.0,NonSmoking,0,0.001565,Immuno_Monocytes
...,...,...,...,...,...,...,...,...,...
ZM3KACng_TTTGTCACAAATACAG-1,ZM3KACng,6058.0,1793,F,79.0,NonSmoking,0,0.010399,Club
ZM3KACng_TTTGTCACAGAGTGTG-1,ZM3KACng,15761.0,4547,F,79.0,NonSmoking,0,0.023793,Immuno_Monocytes
ZM3KACng_TTTGTCACAGTTTACG-1,ZM3KACng,7096.0,3414,F,79.0,NonSmoking,0,0.007046,Immuno_TCells
ZM3KACng_TTTGTCACATCCGCGA-1,ZM3KACng,3213.0,1215,F,79.0,NonSmoking,0,0.000934,AT2


In [34]:
# Map the original obs column names onto the harmonised names. Columns mapped to
# an "rm*" name are QC metrics that are dropped straight after the rename.
l1dict = {"orig.ident": "sample", "nCount_RNA": "rm1", "nFeature_RNA": "rm2", "percent.mito": "rm3", "CellType": "original_celltype_ann", "smoking": "condition", "packyears": "smoking_history"}
obs_renamed = lukassen.obs.rename(columns=l1dict)
rm_columns = [name for name in obs_renamed.columns if name.startswith("rm")]
lukassen.obs = obs_renamed.drop(columns=rm_columns)

In [35]:
# Each sample doubles as the subject identifier; study and dataset share one label.
lukassen.obs["subject_ID"] = lukassen.obs["sample"]
lukassen.obs["study"] = "Eils2020"
lukassen.obs["dataset"] = lukassen.obs["study"]

# The categories are relabelled by position, so this relies on the existing
# category order of `condition`.
# NOTE(review): confirm the categories are exactly the non-smoking and smoking labels, in that order.
lukassen.obs["condition"] = lukassen.obs["condition"].cat.rename_categories(['healthy', 'smoking'])

# Derive the smoking status from the relabelled condition.
status_by_condition = {'healthy': "never", 'smoking': "active"}
lukassen.obs["smoking_status"] = [status_by_condition[label] for label in lukassen.obs["condition"]]

In [36]:
# Drop the precomputed embeddings. The membership check keeps this cell
# re-runnable: a bare `del` raises KeyError once the keys are already gone.
for embedding_key in ("X_pca", "X_umap"):
    if embedding_key in lukassen.obsm:
        del lukassen.obsm[embedding_key]

In [37]:
# Show the object summary after harmonising obs and removing the embeddings.
lukassen

AnnData object with n_obs × n_vars = 39778 × 32738
    obs: 'sample', 'sex', 'age', 'condition', 'smoking_history', 'original_celltype_ann', 'subject_ID', 'study', 'dataset', 'smoking_status'
    var: 'name'

In [38]:
# Inspect the harmonised per-cell metadata.
lukassen.obs

Unnamed: 0_level_0,sample,sex,age,condition,smoking_history,original_celltype_ann,subject_ID,study,dataset,smoking_status
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
101CV0ng_AAACCTGAGGTTCCTA-1,101CV0ng,F,75.0,healthy,0,AT2,101CV0ng,Eils2020,Eils2020,never
101CV0ng_AAACCTGCAAGGCTCC-1,101CV0ng,F,75.0,healthy,0,AT2,101CV0ng,Eils2020,Eils2020,never
101CV0ng_AAACCTGCACGAAATA-1,101CV0ng,F,75.0,healthy,0,Immuno_Monocytes,101CV0ng,Eils2020,Eils2020,never
101CV0ng_AAACCTGCAGCTATTG-1,101CV0ng,F,75.0,healthy,0,AT2,101CV0ng,Eils2020,Eils2020,never
101CV0ng_AAACCTGGTGATGCCC-1,101CV0ng,F,75.0,healthy,0,Immuno_Monocytes,101CV0ng,Eils2020,Eils2020,never
...,...,...,...,...,...,...,...,...,...,...
ZM3KACng_TTTGTCACAAATACAG-1,ZM3KACng,F,79.0,healthy,0,Club,ZM3KACng,Eils2020,Eils2020,never
ZM3KACng_TTTGTCACAGAGTGTG-1,ZM3KACng,F,79.0,healthy,0,Immuno_Monocytes,ZM3KACng,Eils2020,Eils2020,never
ZM3KACng_TTTGTCACAGTTTACG-1,ZM3KACng,F,79.0,healthy,0,Immuno_TCells,ZM3KACng,Eils2020,Eils2020,never
ZM3KACng_TTTGTCACATCCGCGA-1,ZM3KACng,F,79.0,healthy,0,AT2,ZM3KACng,Eils2020,Eils2020,never


In [39]:
# Inspect the gene annotation; subset_and_pad_adata matches genes against this index.
lukassen.var

Unnamed: 0_level_0,name
index,Unnamed: 1_level_1
MIR1302-10,MIR1302-10
FAM138A,FAM138A
OR4F5,OR4F5
RP11-34P13.7,RP11-34P13.7
RP11-34P13.8,RP11-34P13.8
...,...
AC145205.1,AC145205.1
BAGE5,BAGE5
CU459201.1,CU459201.1
AC002321.2,AC002321.2


In [40]:
# Save the object with all genes.
# NOTE(review): this is an absolute cluster path, whereas the Szabo cells write to a
# relative path in a different directory tree — confirm both targets are intended.
lukassen.write("/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/query_datasets/ready/full/lukassen.h5ad")

... storing 'study' as categorical
... storing 'dataset' as categorical
... storing 'smoking_status' as categorical


In [41]:
# Restrict to the genes in `gene_set`; genes absent from the data become all-zero columns.
lukassen_sub = subset_and_pad_adata(gene_set, lukassen)
lukassen_sub

not all genes were recovered, filling in 0 counts for 172 missing genes...


AnnData object with n_obs × n_vars = 39778 × 2000
    obs: 'sample', 'sex', 'age', 'condition', 'smoking_history', 'original_celltype_ann', 'subject_ID', 'study', 'dataset', 'smoking_status'
    var: 'name', 'gene_symbols', 'ensembl'

In [42]:
# Save the gene-subsetted (and zero-padded) object.
lukassen_sub.write("/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/query_datasets/ready/subsetted/lukassen_sub.h5ad")

... storing 'name' as categorical
... storing 'gene_symbols' as categorical


# Szabo

In [16]:
# Re-create the sfaira universe on the same local repository, then narrow it to
# the lung datasets registered under the DOI below and load them.
sfaira_repo = "../../../../../../datasets/projects/20200101_Various_SfairaDataRepository_leander.dony/"
univ = sfaira.data.Universe(
    data_path=sfaira_repo + "raw/",
    meta_path=sfaira_repo + "meta/",
    cache_path=sfaira_repo + "cache/",
)
univ.subset("doi", "10.1038/s41467-019-12464-3")
univ.subset("organ", "lung")
univ.load()

loading human_lung_2019_10xtechnology_szabo_001_10.1038/s41467-019-12464-3
loading human_lung_2019_10xtechnology_szabo_002_10.1038/s41467-019-12464-3
loading human_lung_2019_10xtechnology_szabo_007_10.1038/s41467-019-12464-3
loading human_lung_2019_10xtechnology_szabo_008_10.1038/s41467-019-12464-3


In [17]:
# Copy the first four loaded datasets so later edits do not touch the universe.
szabo1, szabo2, szabo3, szabo4 = [
    univ.datasets[univ.ids[position]].adata.copy() for position in range(4)
]

In [18]:
# Record each dataset's `state_exact` value as the per-cell condition.
for position, szabo_part in enumerate((szabo1, szabo2, szabo3, szabo4)):
    szabo_part.obs["condition"] = univ.datasets[univ.ids[position]].state_exact

In [19]:
# Merge the four objects into one. The resulting obs gains a `batch` column (removed
# in the next cell) and, as the obs table further down shows, cell names get a
# per-object numeric suffix.
# NOTE(review): newer anndata releases steer users towards `anndata.concat`; its
# defaults differ, so switching needs a check of the resulting var columns.
szabo = szabo1.concatenate(szabo2, szabo3, szabo4)

In [20]:
# Remove the batch column added by the concatenation and attach study-level labels.
szabo.obs = szabo.obs.drop(columns="batch")
szabo.obs["study"] = "Sims2019"
szabo.obs["dataset"] = szabo.obs["study"]
# The sample id is taken to be the first five characters of the cell name.
szabo.obs["sample"] = szabo.obs.index.str[:5].tolist()

# Bring the donor and cell type columns onto the harmonised names.
szabodict = {"donor": "subject_ID", "cell_ontology_class": "original_celltype_ann"}
szabo.obs = szabo.obs.rename(columns=szabodict)

In [21]:
# Keep only cells whose original annotation is not "unknown".
has_annotation = szabo.obs["original_celltype_ann"] != "unknown"
szabo = szabo[has_annotation].copy()

In [22]:
# Inspect the harmonised per-cell metadata.
szabo.obs

Unnamed: 0,subject_ID,original_celltype_ann,condition,study,dataset,sample
PP001nskept.CAAGATCTCTTGGGTA-0,Donor 1,10.CD8EM/TRMact,healthy,Sims2019,Sims2019,PP001
PP001nskept.CATCCACGTCACCTAA-0,Donor 1,5.CD4TRMrest,healthy,Sims2019,Sims2019,PP001
PP001nskept.TTGAACGCACTAAGTC-0,Donor 1,6.CD4TRMact,healthy,Sims2019,Sims2019,PP001
PP001nskept.CTACGTCCAAGTACCT-0,Donor 1,5.CD4TRMrest,healthy,Sims2019,Sims2019,PP001
PP001nskept.ACTTGTTCATGCCTTC-0,Donor 1,6.CD4TRMact,healthy,Sims2019,Sims2019,PP001
...,...,...,...,...,...,...
PP010nskept.GTGCTTCAGGATATAC-3,Donor 2,5.CD4act3,stimulated,Sims2019,Sims2019,PP010
PP010nskept.TCATTTGAGAGCTTCT-3,Donor 2,8.CD8EM/TRMact,stimulated,Sims2019,Sims2019,PP010
PP010nskept.CGTAGGCTCTTGGGTA-3,Donor 2,8.CD8EM/TRMact,stimulated,Sims2019,Sims2019,PP010
PP010nskept.CCACTACCAGGTGCCT-3,Donor 2,8.CD8EM/TRMact,stimulated,Sims2019,Sims2019,PP010


In [23]:
# Show the object summary after filtering.
szabo

AnnData object with n_obs × n_vars = 10790 × 60725
    obs: 'subject_ID', 'original_celltype_ann', 'condition', 'study', 'dataset', 'sample'
    var: 'Gene', 'Accession'

In [24]:
# Index genes by the values of the `Gene` column, because subset_and_pad_adata
# looks genes up by symbol in var_names; then make the index entries unique.
# NOTE(review): presumably `Gene` holds gene symbols — confirm against the data.
szabo.var.index = szabo.var["Gene"].tolist()
szabo.var_names_make_unique()



In [25]:
# Save the object with all genes (path is relative to the notebook's working directory).
szabo.write("../../../data/HLCA_extended/extension_datasets/ready/full/szabo.h5ad")

... storing 'subject_ID' as categorical
... storing 'original_celltype_ann' as categorical
... storing 'condition' as categorical
... storing 'study' as categorical
... storing 'dataset' as categorical
... storing 'sample' as categorical


In [26]:
# Restrict to the genes in `gene_set`; genes absent from the data become all-zero columns.
szabo_sub = subset_and_pad_adata(gene_set, szabo)
szabo_sub

not all genes were recovered, filling in 0 counts for 2 missing genes...


AnnData object with n_obs × n_vars = 10790 × 2000
    obs: 'subject_ID', 'original_celltype_ann', 'condition', 'study', 'dataset', 'sample'
    var: 'Gene', 'Accession', 'gene_symbols', 'ensembl'

In [27]:
# Save the gene-subsetted (and zero-padded) object.
szabo_sub.write("../../../data/HLCA_extended/extension_datasets/ready/subsetted/szabo_sub.h5ad")

... storing 'Gene' as categorical
... storing 'Accession' as categorical
... storing 'gene_symbols' as categorical
