# Data import for Schiller_2021 (unpublished) data:

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from anndata import AnnData, concat

In [2]:
# Load the annotated Schiller AnnData object from the raw extension-dataset folder.
adata = sc.read("../../../data/HLCA_extended/extension_datasets/raw/Schiller_2/210714_ASK_controls_annotated.h5ad")

In [3]:
# Build a new AnnData whose X is the "counts" layer, reusing the loaded obs and var tables.
# NOTE(review): presumably "counts" holds the unnormalised counts — confirm against the source file.
adata_raw = AnnData(X=adata.layers["counts"], obs=adata.obs, var=adata.var)
adata_raw

AnnData object with n_obs × n_vars = 35984 × 17533
    obs: 'condition', 'name', 'project', 'identifier', 'n_counts', 'n_genes', 'percent_mito', 'QC_group', 'doublet_scores', 'size_factors', 'S_score', 'G2M_score', 'phase', 'louvain', 'leiden_1', 'compartments', 'cell_type_auto', 'cell_type_sub_auto'

# Ensure consistent naming

In [4]:
# Rename obs columns in place: "name" -> "subject_ID", "cell_type_auto" -> "original_celltype_ann".
adata_raw.obs.rename(
    columns={
        "name": "subject_ID",
        "cell_type_auto": "original_celltype_ann",
    },
    inplace=True,
)

In [5]:
# "sample" copies the "identifier" column; "study" is a constant label and "dataset" mirrors it.
adata_raw.obs["sample"] = adata_raw.obs["identifier"]
adata_raw.obs["study"] = "Schiller2021"
adata_raw.obs["dataset"] = adata_raw.obs["study"]

In [6]:
# Harmonise the condition label: "control" becomes "healthy".
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on the attribute-accessed Series: that is a chained in-place update, which pandas
# Copy-on-Write does not propagate back to adata_raw.obs.
adata_raw.obs["condition"] = adata_raw.obs["condition"].replace({"control": "healthy"})

# Remove unnecessary obs columns

In [7]:
# Drop, in place, the obs columns that are not carried forward.
adata_raw.obs.drop(
    columns=[
        "identifier",
        "n_counts",
        "n_genes",
        "percent_mito",
        "QC_group",
        "doublet_scores",
        "size_factors",
        "S_score",
        "G2M_score",
        "phase",
        "louvain",
        "leiden_1",
        "compartments",
        "cell_type_sub_auto",
    ],
    inplace=True,
)

# Add age & sex (the disease cell below is disabled)

In [8]:
def add_age(row):
    """Return the age recorded for the subject of ``row``, as a string.

    ``row`` is anything indexable with ``row['subject_ID']`` (for example one
    row of the obs table). Subject IDs that are not in the table below yield
    ``'undefined'``.
    """
    age_by_subject = {
        "ASK591": '21',
        "ASK592": '71',
        "ASK594": '71',
        "ASK595": '62',
        "M179opf": '54',
        "ASK598": '72',
        "ASK599": '75',
        "ASK600": '81',
        "ASK602": '67',
        "ASK603": '60',
        "ASK604": '65',
        "M870yhk": '63',
        "ASK606": '47',
        "ASK607": '71',
        "M051mrh": '62',
        "ASK608": '72',
        "ASK610": '70',
        "ASK611": '67',
    }
    return age_by_subject.get(row['subject_ID'], 'undefined')

# Look up the age for every obs row (row-wise apply). This has to run before the
# subject IDs are anonymised, because add_age matches on the original IDs.
adata_raw.obs['age'] = adata_raw.obs.apply(add_age, axis=1)

In [9]:
def add_sex(row):
    """Return the sex recorded for the subject of ``row``.

    ``row`` is anything indexable with ``row['subject_ID']`` (for example one
    row of the obs table). The result is ``'female'`` or ``'male'``; subject
    IDs that are not in the table below yield ``'undefined'``.
    """
    sex_by_subject = {
        "ASK591": 'female',
        "ASK592": 'male',
        "ASK594": 'male',
        "ASK595": 'female',
        "M179opf": 'male',
        "ASK598": 'male',
        "ASK599": 'female',
        "ASK600": 'male',
        "ASK602": 'female',
        "ASK603": 'female',
        "ASK604": 'female',
        "M870yhk": 'male',
        "ASK606": 'female',
        "ASK607": 'female',
        "M051mrh": 'male',
        "ASK608": 'female',
        "ASK610": 'female',
        "ASK611": 'female',
    }
    return sex_by_subject.get(row['subject_ID'], 'undefined')

# Look up the sex for every obs row (row-wise apply). This has to run before the
# subject IDs are anonymised, because add_sex matches on the original IDs.
adata_raw.obs['sex'] = adata_raw.obs.apply(add_sex, axis=1)

In [10]:
"""
def add_disease (row):
   if row['subject_ID'] == "muc10380":
      return 'Donor'
   if row['subject_ID'] == "muc10381":
      return 'IPF'
   if row['subject_ID'] == "muc3843":
      return 'Donor'
   if row['subject_ID'] == "muc4658":
      return 'Donor'
   if row['subject_ID'] == "muc4659":
      return 'Donor'
   if row['subject_ID'] == "muc5103":
      return 'Donor'
   if row['subject_ID'] == "muc5104":
      return 'Donor'
   if row['subject_ID'] == "muc5105":
      return 'Donor'
   if row['subject_ID'] == "muc5212":
      return 'Donor'
   if row['subject_ID'] == "muc5213":
      return 'Donor'
   if row['subject_ID'] == "muc5288":
      return 'Donor'
   if row['subject_ID'] == "muc5289":
      return 'COPD'
   if row['subject_ID'] == "muc8257":
      return 'EAA'
   if row['subject_ID'] == "muc8258":
      return 'EAA'
   if row['subject_ID'] == "muc9826":
      return 'IPF'
   if row['subject_ID'] == "muc9832":
      return 'Donor'
   if row['subject_ID'] == "muc9833":
      return 'Donor'
   return 'undefined'

adata_raw.obs['disease'] = adata_raw.obs.apply (lambda row: add_disease(row), axis=1)
"""

'\ndef add_disease (row):\n   if row[\'subject_ID\'] == "muc10380":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc10381":\n      return \'IPF\'\n   if row[\'subject_ID\'] == "muc3843":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc4658":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc4659":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5103":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5104":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5105":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5212":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5213":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5288":\n      return \'Donor\'\n   if row[\'subject_ID\'] == "muc5289":\n      return \'COPD\'\n   if row[\'subject_ID\'] == "muc8257":\n      return \'EAA\'\n   if row[\'subject_ID\'] == "muc8258":\n      return \'EAA\'\n   if row[\'subject_ID\'] == "muc9826":\n      return \'IPF\'\n   if

# Rename cell types

In [11]:
# Expand abbreviated cell-type labels to full names.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on the attribute-accessed Series: that is a chained in-place update, which pandas
# Copy-on-Write does not propagate back to adata_raw.obs.
# NOTE(review): "Na" usually abbreviates "naive"; confirm that "Natural" is the intended label.
adata_raw.obs["original_celltype_ann"] = adata_raw.obs["original_celltype_ann"].replace({
    "CD4 Na": "Natural CD4 T-cells",
    "CD8 M/E": "Memory/Effector CD8 T-cells",
    "CD8 Na": "Natural CD8 T-cells",
    "Mono class": "Classical Monocytes",
    "DC IGSF21": "Dendritic cells",
    "MKI67+ cells": "Proliferating cells",
    "Neu": "Neutrophils",
})

In [12]:
# Store the cell-type annotation column as a pandas categorical.
adata_raw.obs["original_celltype_ann"] = adata_raw.obs["original_celltype_ann"].astype("category")

# Rename ASK IDs to anonymous IDs

In [13]:
# Replace the original subject IDs with anonymous "Patient N" labels.
# This must run after the age/sex cells, because add_age/add_sex match on the original IDs.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on the attribute-accessed Series: that is a chained in-place update, which pandas
# Copy-on-Write does not propagate back to adata_raw.obs.
# NOTE(review): only seven IDs are mapped; any other subject_ID present in the data keeps
# its original value — confirm that these seven cover every subject in this file.
adata_raw.obs["subject_ID"] = adata_raw.obs["subject_ID"].replace({
    "ASK591": "Patient 1",
    "ASK592": "Patient 2",
    "ASK595": "Patient 3",
    "ASK598": "Patient 4",
    "ASK600": "Patient 5",
    "ASK602": "Patient 6",
    "ASK604": "Patient 7",
})

# Subset to 2000 HVGs

In [14]:
def subset_and_pad_adata(adata, gene_set):
    """
    Subset an AnnData object to the genes of a reference list, zero-padding
    any listed gene that the object does not contain.

    Parameters
    ----------
    adata
        AnnData object whose ``var_names`` are gene symbols.
    gene_set
        pandas DataFrame with a ``gene_symbols`` column and an ``Unnamed: 0``
        column (assumed to hold the Ensembl IDs). It is not modified.

    Returns
    -------
    A new AnnData holding the genes found in ``adata`` (in ``gene_set`` order)
    followed by all-zero columns for the listed genes that were not found (also
    in ``gene_set`` order), with ``var['ensembl']`` taken from
    ``gene_set['Unnamed: 0']``. Returns ``None`` (after printing a warning)
    when no listed gene is present in ``adata.var_names``.

    Raises
    ------
    ValueError
        If ``gene_set`` has no ``gene_symbols`` column.
    """
    # Example inputs:
    # genes_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/genes_for_mapping.csv'
    # data_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/ready/adams.h5ad'
    # gene_set = pd.read_csv(genes_filename)
    # adata = sc.read(data_filename)

    # Prep objects
    if 'gene_symbols' not in gene_set.columns:
        raise ValueError('The input gene list was not of the expected type!\n'
                         'Gene symbols and ensembl IDs are expected in column names:\n'
                         '\t`gene_symbols` and `Unnamed: 0`')

    # Index by gene symbol on a new frame (keeping the column) so that the
    # caller's DataFrame is left untouched.
    gene_set = gene_set.set_index('gene_symbols', drop=False)

    # Subset adata object
    common_genes = [gene for gene in gene_set['gene_symbols'].values if gene in adata.var_names]
    if len(common_genes) == 0:
        print("WARNING: YOU SHOULD PROBABLY SWITCH YOUR ADATA.VAR INDEX COLUMN TO GENE NAMES"
                  " RATHER THAN IDS! No genes were recovered.")
        return None

    adata_sub = adata[:,common_genes].copy()

    # Pad object with 0 genes if needed
    if len(common_genes) < len(gene_set):
        diff = len(gene_set) - len(common_genes)
        print(f'not all genes were recovered, filling in 0 counts for {diff} missing genes...')

        # Genes to pad with. A boolean mask is used rather than a set passed to
        # .loc: set indexers are rejected by newer pandas and have no defined
        # order. The copy avoids writing into a slice of gene_set below.
        is_missing = ~gene_set.index.isin(adata_sub.var_names)
        new_var = gene_set.loc[is_missing].copy()

        if 'Unnamed: 0' in new_var.columns:
            # Assumes the unnamed column are ensembl values
            new_var['ensembl'] = new_var['Unnamed: 0']
            del new_var['Unnamed: 0']

        # One all-zero column per missing gene, on the same cells as adata_sub.
        df_padding = pd.DataFrame(data=np.zeros((adata_sub.shape[0], new_var.shape[0])),
                                  index=adata_sub.obs_names, columns=new_var.index)
        adata_padding = sc.AnnData(df_padding, var=new_var)

        # Concatenate object along the gene axis
        adata_sub = concat([adata_sub, adata_padding], axis=1, join='outer', index_unique=None, merge='unique')

    # Ensure ensembl IDs are available (aligned on the gene-symbol index)
    adata_sub.var['ensembl'] = gene_set['Unnamed: 0']

    return adata_sub

In [15]:
# Read the reference gene list (relative path, resolved against the current working
# directory) and subset/zero-pad the counts object to the genes it lists.
gene_set = pd.read_csv("genes_for_mapping.csv")
adata_raw_subsetted = subset_and_pad_adata(adata_raw, gene_set)

not all genes were recovered, filling in 0 counts for 183 missing genes...


# Write out object

In [16]:
# Save the object with all genes under ready/full.
adata_raw.write("../../../data/HLCA_extended/extension_datasets/ready/full/schiller_discovair.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'subject_ID' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'study' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'age' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical


In [17]:
# Save the gene-subsetted, zero-padded object under ready/subsetted.
adata_raw_subsetted.write("../../../data/HLCA_extended/extension_datasets/ready/subsetted/schiller_discovair_sub.h5ad")

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'subject_ID' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'study' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dataset' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'age' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'gene_symbols' as categorical


In [18]:
# Sanity check: read back the subsetted file written by the preceding cell.
# The "/" between "extension_datasets" and "ready" was missing, so the path did not
# point at the location the file was written to.
test = sc.read("../../../data/HLCA_extended/extension_datasets/ready/subsetted/schiller_discovair_sub.h5ad")