## Fetal lung query on the pan-fetal immune atlas

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os,sys
import glob
import scanpy as sc 
import pandas as pd
import numpy as np
import milopy
import scipy
import anndata

import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

sns.set_context('talk')
from matplotlib.pyplot import rc_context

import milopy
import milopy.core as milo

In [3]:
# from oor_benchmark.metrics import auprc, FDR_TPR_FPR

In [4]:
## r2py setup
import rpy2.rinterface_lib.callbacks
import logging
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

In [5]:
%load_ext rpy2.ipython

In [6]:
%%R
library(tidyverse)

[0;1;31mSystem has not been booted with systemd as init system (PID 1). Can't operate.[0m
[0;1;31mFailed to create bus connection: Host is down[0m


In [7]:
## Some utils

def _plot_nhood_sizes(adata, bins=50, **kwargs):
    """Draw a histogram of the column sums of ``adata.obsm['nhoods']``.

    Each column sum is plotted as the size of one neighbourhood
    (presumably rows are cells and columns are neighbourhoods -- confirm
    against the code that fills ``obsm['nhoods']``).

    Parameters
    ----------
    adata
        Object whose ``obsm['nhoods']`` supports ``.sum(0)``.
    bins
        Number of histogram bins, forwarded to ``plt.hist``.
    **kwargs
        Extra keyword arguments forwarded to ``plt.hist``.
    """
    nhood_sizes = np.array(adata.obsm['nhoods'].sum(0)).ravel()
    plt.hist(nhood_sizes, bins=bins, **kwargs)
    plt.xlabel('# cells')
    plt.ylabel('# nhoods')
    
def get_nhood_adata(adata):
    """Return whatever is stored under ``adata.uns['nhood_adata']``."""
    nhood_adata = adata.uns['nhood_adata']
    return nhood_adata

def get_cells_from_nhood(adata, i):
    """Return a flat boolean mask of the rows equal to 1 in column ``i``.

    Parameters
    ----------
    adata
        Object whose ``obsm['nhoods']`` is a sparse matrix supporting
        column slicing and ``.toarray()``.
    i
        Index of the column (neighbourhood) to read.

    Returns
    -------
    1-D boolean array, ``True`` where the entry in column ``i`` equals 1.
    """
    nhood_column = adata.obsm['nhoods'][:, i].toarray()
    is_member = nhood_column == 1
    return is_member.flatten()

def remove_x_axis(ax):
    """Hide the major x ticks of ``ax`` and blank its x-axis label."""
    no_ticks = matplotlib.ticker.NullLocator()
    ax.xaxis.set_major_locator(no_ticks)
    ax.set_xlabel('')

def remove_y_axis(ax):
    """Hide the major y ticks of ``ax`` and blank its y-axis label."""
    no_ticks = matplotlib.ticker.NullLocator()
    ax.yaxis.set_major_locator(no_ticks)
    ax.set_ylabel('')
    
def make_ct_label(label):
    """Turn an underscore-separated label into a space-separated one."""
    spaced_label = label.replace('_', ' ')
    return spaced_label

In [8]:
sns.color_palette('Paired')

In [9]:
## Define color palette
# Hex codes of seaborn's 'Paired' palette; each design gets one fixed entry.
pal = sns.color_palette('Paired').as_hex()
design_palette = {
    'ACR': pal[3],
    'CR': pal[7],
    'AR': pal[9],
    'joint_CR': pal[6],
}
# Same mapping as a one-column table: one row per design, column 'color'.
design_palette_df = pd.DataFrame(
    {'color': list(design_palette.values())},
    index=list(design_palette.keys()),
)

In [10]:
# figdir = '/home/jovyan/mount/gdrive/diff2atlas/figures/pbmc_design_comparison_Embedding_methods/'
# if not os.path.exists(figdir):
#     os.mkdir(figdir)

In [11]:
# %%R -i design_palette_df -i figdir
# design_palette = setNames(design_palette_df$color, rownames(design_palette_df))

## Prep atlas dataset 

Downloaded from [here](https://cellgeni.cog.sanger.ac.uk/developmentcellatlas/fetal-immune/PAN.A01.v01.raw_count.20210429.PFI.embedding.h5ad)

In [12]:
## Subsample by sample
np.random.seed(2345)

def _subsample(adata, sample_col, n_cells_sample = 1000):
    s_obs = pd.Series() 
    for s in adata.obs[sample_col].unique():
        s_obs_i = adata.obs_names[adata.obs[sample_col] == s]
        if s_obs_i.shape[0] > n_cells_sample:
            obs_df = adata.obs.loc[s_obs_i]
            obs_df[sample_col] = obs_df[sample_col].astype('str')
            s_obs_i = obs_df[[sample_col]].groupby(
                sample_col).sample(n_cells_sample).index
        s_obs = pd.concat([s_obs, pd.Series(s_obs_i)])
    s_obs = s_obs.values
    return(s_obs)

In [13]:
# Open the atlas with `backed=True`; the cells to keep are selected from `.obs`
# first and only that subset is pulled into memory later with `.to_memory()`.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
h5ad_file = '/nfs/team205/ed6/data/Fetal_immune/cellxgene_h5ad_files/scRNA_data/PAN.A01.v01.raw_count.20210429.PFI.embedding.h5ad'
adata_atlas = sc.read_h5ad(h5ad_file, backed=True)

In [14]:
## Remove low Q
# Annotation labels that are excluded from the atlas.
lowQ_clusters = [
    "DOUBLET_IMMUNE_FIBROBLAST",
    "LOW_Q_INCONSISTENT",
    "DOUBLET_LYMPHOID_MACROPHAGE",
    "LOW_QUALITY",
    "HIGH_MITO",
    "DOUBLETS_FIBRO_ERY",
    "DOUBLET_ENDOTHELIUM_ERYTHROCYTE",
    "DOUBLET_ERY_B",
    "LOW_QUALITY_MACROPHAGE",
    "LOW_QUALITY_MID_ERY_(HIGH_RIBO)",
    "PLACENTAL_CONTAMINANTS",
    "DOUBLET",
]

# A cell is kept only if it passes all three filters.
keep_obs = adata_atlas.obs_names[
    adata_atlas.obs['organ'].notna()  # remove maternal contaminants
    & ~adata_atlas.obs['celltype_annotation'].isin(lowQ_clusters)  # remove low quality
    & adata_atlas.obs['age'].ge(14)  # Filter by age
]

# Cap every 'Sample' at 2000 cells among the cells that passed the filters.
keep_obs = _subsample(adata_atlas[keep_obs], sample_col='Sample', n_cells_sample=2000)

  s_obs = pd.Series()


In [15]:
len(keep_obs)

191830

In [16]:
# Subset the backed atlas to the retained cells and load that subset into memory.
adata_atlas = adata_atlas[keep_obs].to_memory()

In [17]:
# Report how many distinct donors and samples remain in the subsetted atlas.
print('--- Atlas ---')
print('# donors:', adata_atlas.obs['donor'].nunique())
print('# samples:', adata_atlas.obs['Sample'].nunique())

--- Atlas ---
# donors: 11
# samples: 99


## Prep case-control dataset

Downloaded from [cellxgene](https://cellxgene.cziscience.com/collections/2d2e2acd-dade-489f-a2da-6c11aa654028)

In [18]:
# Open the fetal lung query with `backed=True`; `h5ad_file` is reused from the
# atlas cell and now points at the query file.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
h5ad_file = '/nfs/team205/ed6/data/He2020_fetal_lung.h5ad'
adata_query = sc.read_h5ad(h5ad_file, backed=True)

In [19]:
# Development stages retained in the query: 15, 18, 20 and 22 weeks
# post-fertilization, spelled exactly as in `obs['development_stage']`.
query_ages = ['18th week post-fertilization human stage', 
 '22nd week post-fertilization human stage', 
 '15th week post-fertilization human stage', 
 '20th week post-fertilization human stage', 
 ]

# Names of the cells whose development stage is in the list above.
keep_obs = adata_query.obs_names[
    adata_query.obs['development_stage'].isin(query_ages)
]

# Cap every 'batch' at 1000 cells; `keep_obs` is overwritten with the result.
keep_obs = _subsample(adata_query[keep_obs], sample_col='batch', n_cells_sample=1000)

  s_obs = pd.Series()


In [20]:
# Subset the backed query to the retained cells and load that subset into memory.
adata_query = adata_query[keep_obs].to_memory()

In [21]:
# Report how many distinct donors and batches remain in the subsetted query.
print('--- Query ---')
print('# donors:', adata_query.obs['donor_id'].nunique())
print('# samples:', adata_query.obs['batch'].nunique())

--- Query ---
# donors: 8
# samples: 22


Add `method`, `Sample` and `donor` columns to the query so that its metadata is uniform with the atlas.

In [23]:
# Every query cell gets the constant method label '5GEX'.
adata_query.obs['method'] = '5GEX'
# Mirror the query's 'batch' and 'donor_id' columns under the names 'Sample' and
# 'donor', which are the column names read from the atlas `obs` in this notebook.
adata_query.obs['Sample'] = adata_query.obs['batch'].copy()
adata_query.obs['donor'] = adata_query.obs['donor_id'].copy()

In [31]:
## Set ensemblIDs to var names
# Replace the atlas var_names with the values of its `var['GeneID']` column.
# NOTE(review): presumably so that they match the query's var_names, which are
# intersected with these right after -- confirm both use the same ID scheme.
adata_atlas.var_names = adata_atlas.var['GeneID'].values

In [37]:
## Keep common vars
# Compute the shared var_names once and subset both objects with the same array,
# so query and atlas end up with identical, identically ordered variables.
common_vars = np.intersect1d(adata_query.var_names, adata_atlas.var_names)
if common_vars.size == 0:
    # An empty intersection means the two objects use different gene identifiers;
    # fail here rather than silently producing (and later saving) 0-gene objects.
    raise ValueError(
        'No var_names shared between adata_query and adata_atlas; '
        'check that both use the same gene identifiers'
    )
adata_query = adata_query[:, common_vars].copy()
adata_atlas = adata_atlas[:, common_vars].copy()

### Save for simulations

In [38]:
# Write the subsampled, gene-matched atlas and query to disk.
# NOTE(review): absolute, user-specific scratch paths -- adjust when running elsewhere.
adata_atlas.write_h5ad('/lustre/scratch117/cellgen/team205/ed6/OOR_benchmark_fetal/PFI_atlas.subsampled.h5ad')
adata_query.write_h5ad('/lustre/scratch117/cellgen/team205/ed6/OOR_benchmark_fetal/FL_query.subsampled.h5ad')

### Simulate disease

In [45]:
from oor_benchmark.datasets.simulation import simulate_query_reference