### Notebook to preprocess the data from Janssen_2020/Mould dataset

### Import required modules

In [1]:
import anndata
import scipy as sp
import numpy as np
import pandas as pd
import scanpy as sc

- Set up working environment

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 200, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')



-----
anndata     0.7.4
scanpy      1.7.2
sinfo       0.3.1
-----
PIL                 8.0.0
anndata             0.7.4
backcall            0.1.0
cloudpickle         1.2.2
cycler              0.10.0
cython_runtime      NA
dask                2.9.1
dateutil            2.8.1
decorator           4.4.1
get_version         2.1+py3.7.egg
google              NA
h5py                2.10.0
igraph              0.8.3
ipykernel           5.1.3
ipython_genutils    0.2.0
ipywidgets          7.5.1
jedi                0.15.1
joblib              0.14.0
kiwisolver          1.1.0
legacy_api_wrap     1.2+py3.7.egg
leidenalg           0.8.3
llvmlite            0.36.0
louvain             0.6.1
matplotlib          3.3.2
more_itertools      NA
mpl_toolkits        NA
natsort             6.2.0
numba               0.53.1
numexpr             2.7.0
numpy               1.17.4
packaging           20.4
pandas              1.0.5
parso               0.5.1
pexpect             4.7.0
pickleshare         0.7.5
pkg_resources 

### Add annotations from author's website. 

- Malte found the annotations [here](https://cells.ucsc.edu/?ds=healthy-bal&meta=cell_type).

In [3]:
metadata = pd.read_csv('/Users/ctl/Downloads/mould_meta.tsv', sep = '\t', index_col = 0)
metadata.head()

FileNotFoundError: [Errno 2] File /Users/ctl/Downloads/mould_meta.tsv does not exist: '/Users/ctl/Downloads/mould_meta.tsv'

### Read in dataset

In [None]:
adata_1 = sc.read_csv('./GSE151928_RAW/GSM4593888_sample_1_UMI_counts.csv.gz').T
adata_1.obs_names = [str(i).replace('.','_') for i in adata_1.obs_names]
meta_1 = metadata.loc[metadata['orig.ident'] == 'sample_1']
adata_1 = adata_1[meta_1.index]
adata_1.obs = meta_1.copy()
adata_1.obs['subject_ID'] = 'SAMN15153963'
adata_1

In [None]:
adata_2 = sc.read_csv('./GSE151928_RAW/GSM4593889_sample_2_UMI_counts.csv.gz').T
adata_2.obs_names = [str(i).replace('.','_') for i in adata_2.obs_names]
adata_2.obs_names = [str(i).replace('_1','_2') for i in adata_2.obs_names]
meta_2 = metadata.loc[metadata['orig.ident'] == 'sample_2']
adata_2 = adata_2[meta_2.index]
adata_2.obs = meta_2.copy()
adata_2.obs['subject_ID'] = 'SAMN15153962'
adata_2

In [None]:
adata_3 = sc.read_csv('./GSE151928_RAW/GSM4593890_sample_3_UMI_counts.csv.gz').T
adata_3.obs_names = [str(i).replace('.','_') for i in adata_3.obs_names]
adata_3.obs_names = [str(i).replace('_1','_3') for i in adata_3.obs_names]
meta_3 = metadata.loc[metadata['orig.ident'] == 'sample_3']
adata_3 = adata_3[meta_3.index]
adata_3.obs = meta_3.copy()
adata_3.obs['subject_ID'] = 'SAMN15153961'
adata_3

In [None]:
adata_4 = sc.read_csv('./GSE151928_RAW/GSM4593891_sample_4_UMI_counts.csv.gz').T
adata_4.obs_names = [str(i).replace('.','_') for i in adata_4.obs_names]
adata_4.obs_names = [str(i).replace('_1','_4') for i in adata_4.obs_names]
meta_4 = metadata.loc[metadata['orig.ident'] == 'sample_4']
adata_4 = adata_4[meta_4.index]
adata_4.obs = meta_4.copy()
adata_4.obs['subject_ID'] = 'SAMN15153960'
adata_4

In [None]:
adata_5 = sc.read_csv('./GSE151928_RAW/GSM4593892_sample_5_UMI_counts.csv.gz').T
adata_5.obs_names = [str(i).replace('.','_') for i in adata_5.obs_names]
adata_5.obs_names = [str(i).replace('_1','_5') for i in adata_5.obs_names]
meta_5 = metadata.loc[metadata['orig.ident'] == 'sample_5']
adata_5 = adata_5[meta_5.index]
adata_5.obs = meta_5.copy()
adata_5.obs['subject_ID'] = 'SAMN15153959'
adata_5

In [None]:
adata_6 = sc.read_csv('./GSE151928_RAW/GSM4593893_sample_6_UMI_counts.csv.gz').T
adata_6.obs_names = [str(i).replace('.','_') for i in adata_6.obs_names]
adata_6.obs_names = [str(i).replace('_1','_6') for i in adata_6.obs_names]
meta_6 = metadata.loc[metadata['orig.ident'] == 'sample_6']
adata_6 = adata_6[meta_6.index]
adata_6.obs = meta_6.copy()
adata_6.obs['subject_ID'] = 'SAMN15153958'
adata_6

In [10]:
adata_7 = sc.read_csv('./GSE151928_RAW/GSM4593894_sample_7_UMI_counts.csv.gz').T
adata_7.obs_names = [str(i).replace('.','_') for i in adata_7.obs_names]
adata_7.obs_names = [str(i).replace('_1','_7') for i in adata_7.obs_names]
meta_7 = metadata.loc[metadata['orig.ident'] == 'sample_7']
adata_7 = adata_7[meta_7.index]
adata_7.obs = meta_7.copy()
adata_7.obs['subject_ID'] = 'SAMN15153957'
adata_7

AnnData object with n_obs × n_vars = 5276 × 33538
    obs: 'orig.ident', 'sample_number', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'nCount_SCT', 'nFeature_SCT', 'overall_cluster', 'macrophage_cluster', 'monocyte_cluster', 'cell_type', 'subject_ID'

In [11]:
adata_8 = sc.read_csv('./GSE151928_RAW/GSM4593895_sample_8_UMI_counts.csv.gz').T
adata_8.obs_names = [str(i).replace('.','_') for i in adata_8.obs_names]
adata_8.obs_names = [str(i).replace('_1','_8') for i in adata_8.obs_names]
meta_8 = metadata.loc[metadata['orig.ident'] == 'sample_8']
adata_8 = adata_8[meta_8.index]
adata_8.obs = meta_8.copy()
adata_8.obs['subject_ID'] = 'SAMN15153956'
adata_8

AnnData object with n_obs × n_vars = 6333 × 33538
    obs: 'orig.ident', 'sample_number', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'nCount_SCT', 'nFeature_SCT', 'overall_cluster', 'macrophage_cluster', 'monocyte_cluster', 'cell_type', 'subject_ID'

In [12]:
adata_9 = sc.read_csv('./GSE151928_RAW/GSM4593896_sample_9_UMI_counts.csv.gz').T
adata_9.obs_names = [str(i).replace('.','_') for i in adata_9.obs_names]
adata_9.obs_names = [str(i).replace('_1','_9') for i in adata_9.obs_names]
meta_9 = metadata.loc[metadata['orig.ident'] == 'sample_9']
adata_9 = adata_9[meta_9.index]
adata_9.obs = meta_9.copy()
adata_9.obs['subject_ID'] = 'SAMN15153955'
adata_9

AnnData object with n_obs × n_vars = 2486 × 33538
    obs: 'orig.ident', 'sample_number', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'nCount_SCT', 'nFeature_SCT', 'overall_cluster', 'macrophage_cluster', 'monocyte_cluster', 'cell_type', 'subject_ID'

In [13]:
adata_10 = sc.read_csv('./GSE151928_RAW/GSM4593897_sample_10_UMI_counts.csv.gz').T
adata_10.obs_names = (adata_10.obs_names + '-1_10')
meta_10 = metadata.loc[metadata['orig.ident'] == 'sample_10']
adata_10 = adata_10[meta_10.index]
adata_10.obs = meta_10.copy()
adata_10.obs['subject_ID'] = 'GSM4593897'
adata_10

AnnData object with n_obs × n_vars = 3816 × 33538
    obs: 'orig.ident', 'sample_number', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'nCount_SCT', 'nFeature_SCT', 'overall_cluster', 'macrophage_cluster', 'monocyte_cluster', 'cell_type', 'subject_ID'

### Merge samples

In [14]:
GSE151928 = adata_1.concatenate(adata_2, adata_3, adata_4, adata_5, adata_6, adata_7, adata_8, adata_9, adata_10, batch_key = 'sample', batch_categories = ['PX1', 'PX2', 'PX3', 'PX4', 'PX5', 'PX6', 'PX7', 'PX8', 'PX9','PX10'], join = 'inner')
GSE151928.obs['genome'] = 'GRCh38'
GSE151928

AnnData object with n_obs × n_vars = 49384 × 33538
    obs: 'orig.ident', 'sample_number', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'nCount_SCT', 'nFeature_SCT', 'overall_cluster', 'macrophage_cluster', 'monocyte_cluster', 'cell_type', 'subject_ID', 'sample', 'genome'

### Add authors metadata

In [15]:
GSE151928.obs_names

Index(['AAACCCAAGCGTCTGC_1-PX1', 'AAACCCAAGTGATAAC_1-PX1',
       'AAACCCAAGTTCCAGT_1-PX1', 'AAACCCACACTGTTCC_1-PX1',
       'AAACCCAGTATTTCGG_1-PX1', 'AAACCCAGTGAGAACC_1-PX1',
       'AAACCCAGTGGCTCTG_1-PX1', 'AAACCCATCGCTCCTA_1-PX1',
       'AAACGAACAATGTCAC_1-PX1', 'AAACGAACATTCGATG_1-PX1',
       ...
       'TTTGGAGGTAGGGTAC-1_10-PX10', 'TTTGGAGGTCAATGGG-1_10-PX10',
       'TTTGGAGTCTAGAACC-1_10-PX10', 'TTTGGTTCAATTTCCT-1_10-PX10',
       'TTTGGTTTCAATCTCT-1_10-PX10', 'TTTGGTTTCCCAACTC-1_10-PX10',
       'TTTGGTTTCTGCTGAA-1_10-PX10', 'TTTGTTGCACGTCATA-1_10-PX10',
       'TTTGTTGGTAAGGTCG-1_10-PX10', 'TTTGTTGTCAACACCA-1_10-PX10'],
      dtype='object', length=49384)

### Is `adata.X` raw?

In [16]:
GSE151928.X[:5,:5]

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]], dtype=float32)

### Add basic labels

In [17]:
GSE151928.obs['dataset'] = 'GSE151928'
GSE151928.obs['study'] = 'Mould,2020'
GSE151928.obs['original_celltype_ann'] = GSE151928.obs['cell_type']
GSE151928.obs['condition'] = 'healthy'
GSE151928

AnnData object with n_obs × n_vars = 49384 × 33538
    obs: 'orig.ident', 'sample_number', 'nCount_RNA', 'nFeature_RNA', 'percent.mito', 'percent.ribo', 'nCount_SCT', 'nFeature_SCT', 'overall_cluster', 'macrophage_cluster', 'monocyte_cluster', 'cell_type', 'subject_ID', 'sample', 'genome', 'dataset', 'study', 'original_celltype_ann', 'condition'

In [18]:
del(GSE151928.obs['orig.ident'])
del(GSE151928.obs['sample_number'])
del(GSE151928.obs['nCount_RNA'])
del(GSE151928.obs['nFeature_RNA'])
del(GSE151928.obs['percent.mito'])
del(GSE151928.obs['percent.ribo'])
del(GSE151928.obs['nCount_SCT'])
del(GSE151928.obs['nFeature_SCT'])
del(GSE151928.obs['overall_cluster'])
del(GSE151928.obs['macrophage_cluster'])
del(GSE151928.obs['monocyte_cluster'])
del(GSE151928.obs['subject_ID'])
del(GSE151928.obs['sample'])
del(GSE151928.obs['cell_type'])

In [19]:
GSE151928.var.head()

MIR1302-2HG
FAM138A
OR4F5
AL627309.1
AL627309.3


In [20]:
GSE151928.X = sp.sparse.csr_matrix(GSE151928.X)
GSE151928.X[:5,:5].todense()

matrix([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]], dtype=float32)

In [21]:
GSE151928.write('GSE151928_preprocessed.raw.ctl210706.h5ad')

... storing 'genome' as categorical
... storing 'dataset' as categorical
... storing 'study' as categorical
... storing 'original_celltype_ann' as categorical
... storing 'condition' as categorical


### Subset datasets to Sikkema's HVGs 

SOMETHING GOES WRONG HERE, CELLS ARE DOUBLED! CORRECT THIS...

In [22]:
def subset_and_pad_adata(adata, gene_set):
    """
    This function uses a gene list provided as a Pandas dataframe with gene symbols and
    Ensembl IDs and subsets a larger Anndata object to only the genes in this list. If
    Not all genes are found in the AnnData object, then zero-padding is performed.
    """
    # Example inputs:
    # genes_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/genes_for_mapping.csv'
    # data_filename = '/storage/groups/ml01/workspace/hlca_lisa.sikkema_malte.luecken/ready/adams.h5ad'
    # gene_set = pd.read_csv(genes_filename)
    # adata = sc.read(data_filename)

    # Prep objects
    if 'gene_symbols' in gene_set.columns:
        gene_set.index = gene_set['gene_symbols']

    else:
        raise ValueError('The input gene list was not of the expected type!\n'
                         'Gene symbols and ensembl IDs are expected in column names:\n'
                         '\t`gene_symbols` and `Unnamed: 0`')

    # Subset adata object
    common_genes = [gene for gene in gene_set['gene_symbols'].values if gene in adata.var_names]
    if len(common_genes) == 0:
        print("WARNING: YOU SHOULD PROBABLY SWITCH YOUR ADATA.VAR INDEX COLUMN TO GENE NAMES"
                  " RATHER THAN IDS! No genes were recovered.")
        return

    adata_sub = adata[:,common_genes].copy()

    # Pad object with 0 genes if needed
    if len(common_genes) < len(gene_set):
        diff = len(gene_set) - len(common_genes)
        print(f'not all genes were recovered, filling in 0 counts for {diff} missing genes...')
        
        # Genes to pad with
        genes_to_add = set(gene_set['gene_symbols'].values).difference(set(adata_sub.var_names))
        new_var = gene_set.loc[genes_to_add]
        
        if 'Unnamed: 0' in new_var.columns:
            # Assumes the unnamed column are ensembl values
            new_var['ensembl'] = new_var['Unnamed: 0']
            del new_var['Unnamed: 0']

        df_padding = pd.DataFrame(data=np.zeros((adata_sub.shape[0],len(genes_to_add))), index=adata_sub.obs_names, columns=new_var.index)
        adata_padding = sc.AnnData(df_padding, var=new_var)
        
        # Concatenate object
        #adata_sub = concat([adata_sub, adata_padding], axis=1, join='outer', index_unique=None, merge='unique')
        adata_sub2 = adata_sub.concatenate(adata_padding, batch_key = 'process', batch_categories = ['sub', 'padded'], join = 'outer')

    # Ensure ensembl IDs are available
    adata_sub2.var['ensembl'] = gene_set['Unnamed: 0']

    return adata_sub2

In [23]:
HVG = pd.read_csv('/Users/ctl/Downloads/genes_for_mapping.csv')
HVG.head()

Unnamed: 0.1,Unnamed: 0,gene_symbols
0,ENSG00000000938,FGR
1,ENSG00000000971,CFH
2,ENSG00000002587,HS3ST1
3,ENSG00000002933,TMEM176A
4,ENSG00000003436,TFPI


In [24]:
GSE151928_subset = subset_and_pad_adata(GSE151928, HVG)
del(GSE151928_subset.obs['process'])
GSE151928_subset

not all genes were recovered, filling in 0 counts for 92 missing genes...


AnnData object with n_obs × n_vars = 98768 × 2000
    obs: 'genome', 'dataset', 'study', 'original_celltype_ann', 'condition'
    var: 'gene_symbols-padded', 'ensembl-padded', 'ensembl'

In [25]:
GSE151928_subset.write('GSE151928_preprocessed-HVGs.raw.ctl210706.h5ad')

... storing 'genome' as categorical
... storing 'dataset' as categorical
... storing 'study' as categorical
... storing 'original_celltype_ann' as categorical
... storing 'condition' as categorical
... storing 'gene_symbols-padded' as categorical
... storing 'ensembl-padded' as categorical
