
# Preprocessing - Integration
Adapted from Michael Sterr

2024-05-18 


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import gc # Free memory #gc.collect()
import scipy.stats as stats

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
import scvi
import torch
import scanpy as sc
import anndata as ad

In [None]:
# Settings
# Notebook-wide configuration: warning filter, project directories and scanpy settings.

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

## Directory
base_dir = '/mnt/hdd/'
# data_dir is not referenced again in the visible part of this notebook;
# later cells spell out the full data path instead.
data_dir = 'data/Healthy/'
nb_dir = 'Notebooks/Gut_project/'
# Figure and cache directories used by scanpy.
sc.settings.figdir = base_dir + nb_dir + 'Figures'
sc.settings.cachedir = base_dir + 'Cache'

## Scanpy settings
sc.settings.verbosity = 3
# Print the package versions of this session.
sc.logging.print_versions()
session_info.show()

In [None]:
# Execute the companion notebook in this namespace. Helpers used below that are
# not defined in this notebook (e.g. load_RdOrYl_cmap_settings, setup_R)
# presumably come from it — TODO confirm.
%run utils.ipynb

In [None]:
# Colormap handed to the UMAP and density plots in the cells below.
mymap = load_RdOrYl_cmap_settings()

# Setup R

In [None]:
#R
# Python <-> R bridge (rpy2) and the AnnData converter (anndata2ri).
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
# setup_R is not defined in this notebook; it is called with the path of an
# environment's R library directory (machine-specific, adjust when moving hosts).
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R

# Show the library search paths of the embedded R session.
.libPaths()

In [None]:
%%R
# scry provides devianceFeatureSelection, used in the HVG section.
library(scry)

# Parallelization
# Register a 20-worker multicore backend for BiocParallel.
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

# Same worker count for the future framework.
library(future)
plan(multicore, workers = 20)
# 64 * 1024^3 bytes = 64 GiB limit on globals exported to futures.
# NOTE(review): the previous comment said "for 50 Gb RAM" — confirm the intended limit.
options(future.globals.maxSize = 64 * 1024 ^ 3)
# Print the active plan.
plan()

#library(doParallel)
#registerDoParallel(20)

sessionInfo()

# Load Data

In [None]:
# Load two saved versions of the dataset. `adata` is the working object;
# `adata1` is only used to copy one embedding from (next cell) and is then deleted.
adata1 = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_meta.h5ad')
adata= sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_2_meta.h5ad')

In [None]:
# Copy the 'X_MultiVI_meta' embedding from adata1 onto the working object.
# obsm rows are matched purely by position, so a different cell order in the two
# files would silently attach embeddings to the wrong cells. Assign directly when
# the cell names agree; otherwise reorder adata1 by cell name first (AnnData
# raises if a cell of `adata` is missing from `adata1`).
if adata1.obs_names.equals(adata.obs_names):
    adata.obsm['X_MultiVI_meta'] = adata1.obsm['X_MultiVI_meta']
else:
    adata.obsm['X_MultiVI_meta'] = adata1[adata.obs_names, :].obsm['X_MultiVI_meta'].copy()

In [None]:
# Display a summary of the working object.
adata

In [None]:
# adata1 is no longer needed once its embedding has been copied; release it.
del adata1
gc.collect()

In [None]:
# Load the cell-cycle gene lists for mouse. The helper is not defined in this
# notebook; of the returned lists only `all_cc_genes` is used in the visible cells.
(
    all_cc_genes,
    s_genes_regev,
    g2m_genes_regev,
    cc_genes_regev,
    cc_genes_macosko,
    s_genes_macosko,
    g2m_genes_macosko,
    m_genes_macosko,
    mg1_genes_macosko,
    g1s_genes_macosko,
) = load_cell_cycle_genes(adata, genome='mus_musculus')

In [None]:
# Neighbour graph on the stored MultiVI embedding, followed by a UMAP layout.
sc.pp.neighbors(
    adata,
    use_rep="X_MultiVI_rmDoublets_meta",
    n_pcs=50,
    n_neighbors=20,
)
sc.tl.umap(
    adata,
    min_dist=0.3,
    spread=0.8,
    negative_sample_rate=1,
    gamma=0.5,
)

In [None]:
# Leiden clustering of the neighbour graph computed above, at resolution 1.
sc.tl.leiden(adata, resolution=1)

In [None]:
# UMAP coloured by sample and by the initial cell-type annotation.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type'],
    size=10,
    add_outline=True,
    alpha=1,
    wspace=0.9,
    outline_width=(0.3, 0.0),
    ncols=4,
    color_map=mymap,
)

# HVGs

In [None]:
# Wrap only the 'raw_counts' layer in a fresh AnnData (no obs/var annotations)
# so that just the count matrix is handed to R in the next cell.
adata_raw = ad.AnnData(X=adata.layers['raw_counts'])

In [None]:
%%R -i adata_raw
# Run scry's deviance feature selection on the imported counts; the per-gene
# result is read back from rowData(sce)$binomial_deviance in the next cell.
sce = devianceFeatureSelection(adata_raw, assay='X')

In [None]:
# Pull the per-gene binomial deviance out of the R session.
# presumably one value per gene, in adata.var order — TODO confirm
binomial_deviance = ro.r('rowData(sce)$binomial_deviance').T

In [None]:
# Inspect the deviance values.
binomial_deviance

In [None]:
# Flag the 4000 genes with the highest binomial deviance.
# argsort is ascending, so the last 4000 positions are the largest values.
# NOTE(review): NaN values sort to the end and would be selected — confirm none occur.
idx = binomial_deviance.argsort()[-4000:]
mask = np.zeros(adata.var_names.shape, dtype=bool)
mask[idx] = True

# Store the scores and the selection in adata.var. 'highly_variable' starts out
# identical to 'highly_deviant' and is filtered further in the next cell.
adata.var['binomial_deviance'] = binomial_deviance
adata.var['highly_deviant'] = mask
adata.var['highly_variable'] = mask

In [None]:
# Get HVGs and overlap with cell cycle & ambient genes
# Starts from the deviance-based selection and removes cell-cycle and ambient
# genes from adata.var['highly_variable'].

## HVGs
# Reset from 'highly_deviant' so this cell can be re-run without compounding filters.
adata.var['highly_variable'] = adata.var['highly_deviant'].copy()
hvgs = pd.Series(adata.var_names[adata.var['highly_variable']])
# Count the True flags directly. `value_counts()[1]` used an integer key on a
# boolean index (deprecated positional fallback): it returned the count of the
# rarer class rather than of True, and failed when only one class was present.
print('\nHighly variable genes before filtering:', int(adata.var['highly_variable'].sum()))

# overlap HVGs with CC genes
hvcc = list(hvgs[hvgs.isin(all_cc_genes)])
print('\nHighly variable cell cycle genes:', len(hvcc), '\n', hvcc)

# overlap HVGs with ambient genes
# Index var_names directly instead of building an AnnData view just to read the names.
# `== True` is kept so that missing values in 'is_ambient' count as not ambient.
hvambi = list(hvgs[hvgs.isin(adata.var_names[adata.var['is_ambient'] == True])])
print('\nHighly variable ambient genes:', len(hvambi), '\n', hvambi)

# remove cell cycle genes
adata.var.loc[hvcc, 'highly_variable'] = False

# remove ambient genes
adata.var.loc[hvambi, 'highly_variable'] = False

print('\nHighly variable genes after filtering:', int(adata.var['highly_variable'].sum()))

In [None]:
# The count-only copy was needed only for the R call; release it.
del adata_raw
gc.collect()

# Initial Embedding

In [None]:
# PCA restricted to the genes flagged in adata.var['highly_variable'], then overview plots.
# NOTE(review): newer scanpy releases deprecate `use_highly_variable` in favour
# of `mask_var` — confirm against the installed version.
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True)
sc.pl.pca_overview(adata)

In [None]:
# The neighbour graph is built on the MultiVI embedding, not on the PCA computed
# in the previous cell.
sc.pp.neighbors(adata, use_rep="X_MultiVI_rmDoublets_meta", n_pcs=50, n_neighbors=20)
sc.tl.umap(adata, min_dist=0.3, spread=1, negative_sample_rate=5, gamma=0.5)
# Alternative UMAP parameter sets, kept for reference:
# sc.tl.umap(adata, min_dist=0.2, spread=0.5, negative_sample_rate=1, gamma=2)
# sc.tl.umap(adata, min_dist=0.3, spread=0.8, negative_sample_rate=0.5, gamma=1)
# sc.tl.umap(adata, min_dist=0.25, spread=0.8, negative_sample_rate=0.5, gamma=0.5)
# sc.tl.umap(adata, min_dist=0.3, spread=0.8, negative_sample_rate=0.5, gamma=0.25)
# sc.tl.umap(adata, min_dist=0.3, spread=1, negative_sample_rate=1, gamma=0.5)

In [None]:
# Genes overlaid on every UMAP plotted below.
marker_genes = [
    'Neurog3',
    'Tph1',
    'Isl1',
    'Pou2f3',
    'Sox9',
    'Lgr5',
    'Dmbt1',
    'Hmgb2',
    'Top2a',
    'Defa24',
    'Gna11',
    'Cd52',
    'Muc2',
    'Fcgbp',
    'Lyz1',
]

In [None]:
# UMAP coloured by sample, initial annotation and each marker gene.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type'] + marker_genes,
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=4,
    color_map=mymap,
    wspace=0.95,
)

In [None]:
# Density plot of the embedding grouped by sample (helper not defined in this notebook).
plot_embedding_density_kde(adata=adata, groupby='sample', cmap_kde = mymap)

# RNA Integration

#### Load metadata

In [None]:
## add metadata
# Read the sample sheet (helper not defined in this notebook).
metadata_df = read_excel_metadata('/mnt/hdd/data/metadata_mouse_gut.xlsx')

# Drop rows whose kit is 'Multiome_ATAC_v1', then rows whose condition is
# neither 'Ctr' nor 'Ctr/WT'.
metadata_df.drop(
    index=metadata_df.index[metadata_df['kit'] == 'Multiome_ATAC_v1'],
    inplace=True,
)
metadata_df.drop(
    index=metadata_df.index[~metadata_df['condition'].isin(['Ctr', 'Ctr/WT'])],
    inplace=True,
)

# Ensure folder name is the index in metadata for easier access
metadata_df.set_index('folder name', inplace=True)

# Remove the columns that are not carried over into adata.obs.
metadata_df.drop(
    columns=[
        'Sample Pooling - confounded with Project?',
        'date',
        'Project Name',
        'Link_id',
        'sample name',
        'Cell Count [cells/µl]',
        'Viable Cells [%]',
        'Lib. Concentration [ng/µl]',
        'Lib. Molarity [nM]',
        'Average Lib. Size [bp]',
        'cDNA Cycles',
        'Lib. Cycles',
        '10x Sample Index',
        'Sequencing Depth [reads/cell]',
        'MUC ID',
        'exclusion, reason',
    ],
    inplace=True,
)

In [None]:
# Inspect the filtered sample sheet.
metadata_df

In [None]:
# Copy every metadata column into adata.obs by looking up each cell's 'sample'
# value in the metadata index ('folder name').
for col in metadata_df.columns:
    try:
        adata.obs[col] = adata.obs['sample'].apply(lambda x: metadata_df.at[x, col])
    except KeyError as err:
        # A failed lookup skips the whole column: it is reported and not added to adata.obs.
        print(f'no such key: {err} in col {col}')

In [None]:
# Inspect the annotated cell table.
adata.obs

### free up memory

In [None]:
# Drop the 'sample number Minas' column instead of casting it (cast kept below for reference).
#adata.obs = adata.obs.astype({'sample number Minas': str})
adata.obs.drop('sample number Minas', axis = 1, inplace = True)

In [None]:
# Cast 'enrichment proportion' to str before writing
# (presumably needed for h5ad serialisation — TODO confirm), then save a checkpoint.
adata.obs = adata.obs.astype({'enrichment proportion': str})
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata.h5ad')

In [None]:
# Reload the checkpoint written above and re-apply the str cast to the reloaded column.
adata = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata.h5ad')
adata.obs = adata.obs.astype({'enrichment proportion': str})
gc.collect()

In [None]:
# Release cached GPU memory before setting up the model.
torch.cuda.empty_cache()

## scVI

### Setup

In [None]:
# Separate copy for scVI so its setup does not modify `adata`.
# NOTE(review): the copy keeps all genes; adata.var['highly_variable'] computed
# above is not used to subset here — confirm this is intended.
adata_scvi = adata.copy()

In [None]:
# Network size for the scVI model (also encoded in the saved model path below).
n_hidden=512
n_latent=50
n_layers=2

# obs columns used as batch and label fields.
batch_key = 'sample'
labels_key = 'initial_cell_type'

# Additional covariates registered with the model.
categorical_covariate_keys = ['kit']
continuous_covariate_keys = None

# Layer holding the counts the model is set up on.
layer = 'raw_counts'

In [None]:
# Register the count layer, batch/label columns and covariates with scVI.
scvi.model.SCVI.setup_anndata(
    adata_scvi,
    layer=layer,
    batch_key=batch_key,
    labels_key=labels_key,
    categorical_covariate_keys=categorical_covariate_keys,
    continuous_covariate_keys=continuous_covariate_keys,
)

In [None]:
# Instantiate the scVI model with the hyper-parameters defined above and print its summary.
model_scvi = scvi.model.SCVI(
    adata_scvi,
    n_hidden=n_hidden,
    n_layers=n_layers,
    n_latent=n_latent,
    gene_likelihood='nb',
    dispersion='gene-batch',
)
print(model_scvi)
#model_scvi.view_anndata_setup()

### Train

In [None]:
# Release cached GPU memory before training.
torch.cuda.empty_cache()

In [None]:
# Train for at most 1000 epochs, with early stopping enabled.
model_scvi.train(max_epochs=1000, early_stopping=True)

In [None]:
# plot reconstruction loss
# Only the training curve is drawn here; the validation curve is left disabled.
plt.plot(model_scvi.history['reconstruction_loss_train']['reconstruction_loss_train'], label='train')
#plt.plot(model_scvi.history['reconstruction_loss_validation']['reconstruction_loss_validation'], label='validation')
plt.legend()

### Save

In [None]:
# Location and name prefix for saved models; reused by the scVI and scANVI save cells.
import datetime
file_path = '/mnt/hdd/data/Healthy'
file_base_name = 'Healthy'

In [None]:
# Build the output path of the trained scVI model from today's date, the
# dataset name and the model hyper-parameters.
directory_path = file_path + '/Models/'
base_name = file_base_name + '_mdata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated'
date = str(datetime.date.today()) + '_'


def _camel_case(text):
    """Return *text* title-cased with underscores and spaces removed ('initial_cell_type' -> 'InitialCellType')."""
    return text.replace('_', ' ').title().replace(' ', '')


# Keys set to None are encoded with the literal suffix 'None'. Explicit None
# checks replace the former bare `except:`, which turned any error (including
# a typo in a variable name) into a 'None' suffix.
if categorical_covariate_keys is None:
    covarCat = '_covarCatNone'
else:
    covarCat = '_covarCat' + _camel_case('_'.join(categorical_covariate_keys))

if continuous_covariate_keys is None:
    covarCont = '_covarContNone'
else:
    covarCont = '_covarCont' + _camel_case('_'.join(continuous_covariate_keys))

if labels_key is None:
    labels = '_labelsNone'
else:
    # ''.join leaves a plain string unchanged and concatenates a list of keys, as before.
    labels = '_labels' + _camel_case(''.join(labels_key))

layers = '_layers' + str(n_layers)
hidden = '_hidden' + str(n_hidden)
latent = '_latent' + str(n_latent)

model_type = '_scVI'

# The covariate suffixes are computed above but currently left out of the path.
model_path = ''.join([
    directory_path,
    date,
    base_name,
    labels,
    #covarCat,
    #covarCont,
    layers,
    hidden,
    latent,
    model_type,
])

model_path

In [None]:
# Save the model together with its AnnData, replacing any existing save at this path.
model_scvi.save(model_path, overwrite=True, save_anndata=True)

### Results

In [None]:
# Store the scVI latent space on the main object.
# The model was set up on adata_scvi, a copy of adata, so rows presumably line up — TODO confirm.
adata.obsm['X_scVI'] = model_scvi.get_latent_representation()

In [None]:
# Rebuild the neighbour graph on the scVI latent space and recluster.
# This overwrites the earlier graph and Leiden labels.
sc.pp.neighbors(adata, use_rep='X_scVI')
sc.tl.leiden(adata)

In [None]:
# UMAP of the scVI-based graph, with default parameters.
sc.tl.umap(adata)

In [None]:
# UMAP of the scVI embedding: sample, initial annotation, Leiden clusters and marker genes.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type', 'leiden'] + marker_genes,
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    wspace=0.8,
    cmap=mymap,
)

In [None]:
# Per-sample density plot on the scVI-based UMAP.
plot_embedding_density_kde(adata=adata, groupby='sample', cmap_kde =mymap)

# scANVI

#### free up memory

In [None]:
# Release the scVI working copy; model_scvi is kept for the scANVI setup below.
del adata_scvi
gc.collect()

In [None]:
# Checkpoint including the scVI embedding.
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata_scvi.h5ad')

### Setup

In [None]:
# Separate copy handed to scANVI.
adata_scanvi = adata.copy()

In [None]:
# Initialise scANVI from the trained scVI model, using the same label column.
# Cells labelled 'unlabelled' in that column are treated as unlabelled.
model_scanvi = scvi.model.SCANVI.from_scvi_model(
    model_scvi,
    adata=adata_scanvi, 
    labels_key=labels_key, 
    unlabeled_category='unlabelled'
)
print(model_scanvi)
#model_scanvi.view_anndata_setup()

In [None]:
# The scVI model has been saved and handed to scANVI; drop this reference.
del model_scvi
gc.collect()

### Train

In [None]:
# Release cached GPU memory before training scANVI.
torch.cuda.empty_cache()

In [None]:
# Train for at most 500 epochs, with early stopping enabled.
model_scanvi.train(max_epochs=500, early_stopping=True)

In [None]:
# plot reconstruction loss
# Training and validation curves on the same axes.
plt.plot(model_scanvi.history['reconstruction_loss_train']['reconstruction_loss_train'], label='train')
plt.plot(model_scanvi.history['reconstruction_loss_validation']['reconstruction_loss_validation'], label='validation')
plt.legend()

### Save

In [None]:
# Build the output path of the trained scANVI model from today's date, the
# dataset name and the model hyper-parameters.
directory_path = file_path + '/Models/'
base_name = file_base_name + '_mdata_markedDoublets_mergedPeaks_normalized_initialAnno_rmDoublets_integrated'
date = str(datetime.date.today()) + '_'


def _camel_case(text):
    """Return *text* title-cased with underscores and spaces removed ('initial_cell_type' -> 'InitialCellType')."""
    return text.replace('_', ' ').title().replace(' ', '')


# Keys set to None are encoded with the literal suffix 'None'. Explicit None
# checks replace the former bare `except:`, which turned any error (including
# a typo in a variable name) into a 'None' suffix.
if categorical_covariate_keys is None:
    covarCat = '_covarCatNone'
else:
    covarCat = '_covarCat' + _camel_case('_'.join(categorical_covariate_keys))

if continuous_covariate_keys is None:
    covarCont = '_covarContNone'
else:
    covarCont = '_covarCont' + _camel_case('_'.join(continuous_covariate_keys))

if labels_key is None:
    labels = '_labelsNone'
else:
    # ''.join leaves a plain string unchanged and concatenates a list of keys, as before.
    labels = '_labels' + _camel_case(''.join(labels_key))

layers = '_layers' + str(n_layers)
hidden = '_hidden' + str(n_hidden)
latent = '_latent' + str(n_latent)

model_type = '_scANVI'

# The covariate suffixes are computed above but currently left out of the path.
model_path = ''.join([
    directory_path,
    date,
    base_name,
    labels,
    #covarCat,
    #covarCont,
    layers,
    hidden,
    latent,
    model_type,
])

model_path

In [None]:
# Save the model together with its AnnData, replacing any existing save at this path.
model_scanvi.save(model_path, overwrite=True, save_anndata=True)

### Results

In [None]:
# Store the scANVI latent space on the main object.
# The model uses adata_scanvi, a copy of adata, so rows presumably line up — TODO confirm.
adata.obsm['X_scANVI'] = model_scanvi.get_latent_representation()

In [None]:
# Rebuild the neighbour graph on the scANVI latent space and recluster.
# This overwrites the scVI-based graph and Leiden labels.
sc.pp.neighbors(adata, use_rep='X_scANVI')
sc.tl.leiden(adata)

In [None]:
# UMAP of the scANVI-based graph, with default parameters.
sc.tl.umap(adata)

In [None]:
# UMAP of the scANVI embedding: sample, initial annotation, Leiden clusters and marker genes.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type', 'leiden'] + marker_genes,
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    cmap=mymap,
    wspace=0.85,
)

In [None]:
# Per-sample density plot on the scANVI-based UMAP.
plot_embedding_density_kde(adata=adata, groupby='sample', cmap_kde= mymap)

In [None]:
# The scANVI model is saved and its embedding stored; release the copy and the model.
del adata_scanvi
del model_scanvi
gc.collect()

In [None]:
# Checkpoint including both the scVI and scANVI embeddings.
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata_scvi_scanvi.h5ad')

In [None]:
# Reload the checkpoint written above (entry point for resuming from here).
adata = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata_scvi_scanvi.h5ad')

# Harmony

In [None]:
# Full copy used as the Harmony working object; only its 'X_harmony' embedding is copied back.
# NOTE(review): unlike the scVI/scANVI copies, this one is not deleted in the visible cells.
adata_harmony = adata.copy()

In [None]:
# Harmony correction across samples; the result is written to obsm['X_harmony'].
# No `basis` is passed, so scanpy's default input embedding is used
# (presumably 'X_pca' — confirm against the scanpy docs).
sc.external.pp.harmony_integrate(adata_harmony, key='sample', adjusted_basis='X_harmony')

In [None]:
# Copy the Harmony-corrected embedding onto the main object.
adata.obsm['X_harmony'] = adata_harmony.obsm['X_harmony']

In [None]:
# Rebuild the neighbour graph on the Harmony embedding and recluster.
# This overwrites the scANVI-based graph and Leiden labels.
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata)

In [None]:
# UMAP of the Harmony-based graph, with default parameters.
sc.tl.umap(adata)

In [None]:
# UMAP of the Harmony embedding: sample, initial annotation, Leiden clusters and marker genes.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type', 'leiden'] + marker_genes,
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    wspace=0.95,
    cmap=mymap,
)

In [None]:
# Per-sample density plot on the Harmony-based UMAP.
plot_embedding_density_kde(adata=adata, groupby='sample', cmap_kde = mymap)

# Benchmark

In [None]:
# Benchmarker from scib-metrics, plus figure defaults for the result tables below.
from scib_metrics.benchmark import Benchmarker
sc.set_figure_params(scanpy=True, frameon=False, vector_friendly=False ,transparent=True, dpi=150, dpi_save=300)

In [None]:
# Score every embedding stored on adata, using 'sample' as batch and
# 'initial_cell_type' as label. All listed obsm keys must exist on adata.
bm = Benchmarker(
    adata,
    batch_key='sample',
    label_key='initial_cell_type',
    embedding_obsm_keys=['X_MultiVI_rmDoublets_meta','X_MultiVI_meta','X_scVI', 'X_scANVI', 'X_harmony', 'X_pca'],
    n_jobs=20,
)
bm.benchmark()

In [None]:
# Results table with unscaled metric values.
bm.plot_results_table(min_max_scale=False)

In [None]:
# Same table with min-max scaling of the metrics switched on.
bm.plot_results_table(min_max_scale=True)

# Save adata

In [None]:
# Final object with all integration embeddings.
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_rmDoublets_integrated.h5ad')