
# Preprocessing - Integration
Michael Sterr

2024-05-18 


# Setup


In [None]:
# General
import scipy as sci
import numpy as np
import pandas as pd
import logging
import time
import pickle
from itertools import chain
import session_info
import gc # Free memory #gc.collect()
import scipy.stats as stats

# Plotting
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import rcParams
from matplotlib.pyplot import rc_context
from matplotlib import cm
import seaborn as sb

# Analysis
import scvi
import torch
import scanpy as sc
import anndata as ad

In [None]:
# Settings

import warnings

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

## Directory
base_dir, data_dir, nb_dir = '/mnt/hdd/', 'data/Healthy/', 'Notebooks/Gut_project/'
sc.settings.cachedir = f'{base_dir}Cache'
sc.settings.figdir = f'{base_dir}{nb_dir}Figures'

## Scanpy settings
sc.settings.verbosity = 3

# Record the software environment used for this run.
sc.logging.print_versions()
session_info.show()

In [None]:
# Run the shared utilities notebook.
# NOTE(review): helpers used below without a local definition (load_RdOrYl_cmap_settings,
# setup_R, plot_embedding_density_kde) are presumably defined there — confirm.
%run utils.ipynb

In [None]:
# Colormap passed to the UMAP plots (cmap=) and the density plot (cmap_kde=) further down.
mymap = load_RdOrYl_cmap_settings()

# Setup R

In [None]:
#R
# rpy2 / anndata2ri provide the Python <-> R bridge needed by the %%R cells below.
import rpy2
import rpy2.robjects as ro
import rpy2.rinterface_lib.callbacks
from rpy2.robjects import pandas2ri
import anndata2ri
# setup_R is not defined in this notebook (presumably it comes from utils.ipynb — confirm).
# It is given the R directory inside the scUV environment.
setup_R('/home/scanalysis/mnt/envs/scUV/lib/R')

In [None]:
%%R

# Show the library search paths of the embedded R session (sanity check for the setup above).
.libPaths()

In [None]:
%%R
library(scry)

# Parallelization
# BiocParallel: register a 20-worker multicore backend with a progress bar.
library(BiocParallel)
register(MulticoreParam(20, progressbar = TRUE))

# future: 20 multicore workers.
library(future)
plan(multicore, workers = 20)
options(future.globals.maxSize = 64 * 1024 ^ 3) # 64 * 1024^3 bytes = 64 GiB (the earlier "50 Gb" note did not match this value)
plan()

#library(doParallel)
#registerDoParallel(20)

sessionInfo()

# Load Data

In [None]:
# Load the input AnnData for this notebook.
adata = sc.read_h5ad('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata.h5ad')
# Cast the 'enrichment proportion' obs column to str; all other columns keep their dtype.
adata.obs = adata.obs.astype({'enrichment proportion': str})
# Reclaim memory left over from loading / the replaced obs table.
gc.collect()

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before setting up the model.
torch.cuda.empty_cache()

## scVI

### Setup

In [None]:
# Join covariates for scArches: one batch label "<sample>_<kit>" per cell.
# Vectorised string concatenation replaces the row-wise `apply(axis=1)`, which
# runs a Python-level loop over every cell. The earlier placeholder assignment
# (`covariates = sample`) was overwritten immediately and has been dropped.
# `astype(str)` makes the concatenation work for categorical columns as well.
adata.obs['covariates'] = (
    adata.obs['sample'].astype(str) + '_' + adata.obs['kit'].astype(str)
)


In [None]:
# Network size, passed to scvi.model.SCVI below and encoded in the saved model's path.
n_hidden, n_latent, n_layers = 512, 50, 2

# obs columns handed to setup_anndata as batch and label keys.
batch_key, labels_key = 'covariates', 'initial_cell_type'

# Left undefined on purpose here: the save cells fall back to a "None" tag for it.
#categorical_covariate_keys = ['kit']
continuous_covariate_keys = None

# AnnData layer the model reads its input from.
layer = 'raw_counts'

In [None]:
# Register the count layer plus batch / label / covariate columns with scvi-tools.
scvi.model.SCVI.setup_anndata(
    adata,
    layer=layer,
    batch_key=batch_key,
    labels_key=labels_key,
    continuous_covariate_keys=continuous_covariate_keys,
)

In [None]:
# Build the scVI model with the network size chosen above.
model_scvi = scvi.model.SCVI(
    adata,
    n_hidden=n_hidden,
    n_layers=n_layers,
    n_latent=n_latent,
    gene_likelihood='nb',
    dispersion='gene-batch',
)
print(model_scvi)
#model_scvi.view_anndata_setup()

### Train

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before training.
torch.cuda.empty_cache()

In [None]:
# Train scVI for at most 1000 epochs; early stopping may end the run sooner.
model_scvi.train(max_epochs=1000, early_stopping=True)

In [None]:
# Plot the scVI training reconstruction loss per epoch.
_loss_key = 'reconstruction_loss_train'
plt.plot(model_scvi.history[_loss_key][_loss_key], label='train')
#plt.plot(model_scvi.history['reconstruction_loss_validation']['reconstruction_loss_validation'], label='validation')
plt.legend()

### Save

In [None]:
# datetime supplies the date prefix of the model folder names built below.
import datetime
# Root data directory; trained models go into its 'Models' subfolder.
file_path = '/mnt/hdd/data/Healthy'
# Dataset prefix used in the model folder names.
file_base_name = 'Healthy'

In [None]:
# Assemble the save path of the trained scVI model. The name encodes the run
# date, dataset, label key and network size, so runs with different settings
# are stored side by side.
directory_path = file_path + '/Models/'
base_name = file_base_name + '_mdata_markedDoublets_normalized_initialAnno_rmDoublets_integrated'
date = str(datetime.date.today()) + '_'

# Each tag is the CamelCased key, e.g. 'initial_cell_type' -> 'InitialCellType'.
# A key counts as "not configured" only when its variable was never defined
# (NameError, e.g. the commented-out `categorical_covariate_keys`) or is None
# (TypeError from str.join). Anything else is a real error and is no longer
# swallowed by a bare `except:`.
try:
    covarCat = '_covarCat' + '_'.join(categorical_covariate_keys).replace('_', ' ').title().replace(' ', '')
except (NameError, TypeError):
    covarCat = '_covarCatNone'

try:
    covarCont = '_covarCont' + '_'.join(continuous_covariate_keys).replace('_', ' ').title().replace(' ', '')
except (NameError, TypeError):
    covarCont = '_covarContNone'

try:
    labels = '_labels' + ''.join(labels_key).replace('_', ' ').title().replace(' ', '')
except (NameError, TypeError):
    labels = '_labelsNone'

#deep = '_inject' + str(inject)
layers = '_layers' + str(n_layers)
hidden = '_hidden' + str(n_hidden)
latent = '_latent' + str(n_latent)

model_type = '_scVI'

# covarCat / covarCont are computed above but currently not part of the path
# (their entries are commented out below).
model_path = ''.join([
    directory_path,
    date,
    base_name,
    labels,
    #covarCat,
    #covarCont,
    layers,
    hidden,
    latent,
    model_type,
])

model_path

In [None]:
# Persist the trained scVI model under model_path, replacing any existing save
# and storing the AnnData alongside it.
model_scvi.save(model_path, overwrite=True, save_anndata=True)

### Results

In [None]:
# Store the scVI latent embedding of every cell in obsm['X_scVI'].
adata.obsm['X_scVI'] = model_scvi.get_latent_representation()

In [None]:
# Build the neighbour graph on the scVI latent space, then cluster it with Leiden.
sc.pp.neighbors(adata, use_rep='X_scVI')
sc.tl.leiden(adata)

In [None]:
# 2-D UMAP embedding computed from the neighbour graph built above.
sc.tl.umap(adata)

In [None]:
# Genes overlaid on the scVI and scANVI UMAPs below.
marker_genes = [
    'Neurog3', 'Tph1', 'Isl1', 'Pou2f3', 'Sox9',
    'Lgr5', 'Dmbt1', 'Hmgb2', 'Top2a', 'Defa24',
    'Gna11', 'Cd52', 'Muc2', 'Fcgbp', 'Lyz1',
]

In [None]:
# scVI UMAP coloured by sample, initial annotation, Leiden cluster and each marker gene.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type', 'leiden', *marker_genes],
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    wspace=0.8,
    cmap=mymap,
)

# scANVI

#### free up memory

In [None]:
# `adata_scvi` is never created in this notebook, so an unconditional `del`
# raised NameError and aborted a top-to-bottom run. Drop the name only if it
# exists (e.g. left over from an earlier interactive session), then collect.
globals().pop('adata_scvi', None)
gc.collect()

In [None]:
# Checkpoint the AnnData (now carrying the scVI results) before scANVI is set up.
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata_scvi.h5ad')

### Setup

In [None]:
# Initialise scANVI from the trained scVI model, using the same label column.
# Cells whose label equals 'unlabelled' are treated as unlabeled.
model_scanvi = scvi.model.SCANVI.from_scvi_model(
    model_scvi,
    adata=adata, 
    labels_key=labels_key, 
    unlabeled_category='unlabelled'
)
print(model_scanvi)
#model_scanvi.view_anndata_setup()

In [None]:
# The scVI model is not used again after scANVI has been initialised from it.
del model_scvi
gc.collect()

### Train

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before scANVI training.
torch.cuda.empty_cache()

In [None]:
# Train scANVI for at most 500 epochs; early stopping may end the run sooner.
model_scanvi.train(max_epochs=500, early_stopping=True)

In [None]:
# Plot the scANVI reconstruction loss per epoch for the training and validation splits.
for _split in ('train', 'validation'):
    _loss_key = f'reconstruction_loss_{_split}'
    plt.plot(model_scanvi.history[_loss_key][_loss_key], label=_split)
plt.legend()

### Save

In [None]:
# Assemble the save path of the trained scANVI model. The name encodes the run
# date, dataset, label key and network size, so runs with different settings
# are stored side by side.
directory_path = file_path + '/Models/'
base_name = file_base_name + '_mdata_markedDoublets_normalized_initialAnno_rmDoublets_integrated'
date = str(datetime.date.today()) + '_'

# Each tag is the CamelCased key, e.g. 'initial_cell_type' -> 'InitialCellType'.
# A key counts as "not configured" only when its variable was never defined
# (NameError, e.g. the commented-out `categorical_covariate_keys`) or is None
# (TypeError from str.join). Anything else is a real error and is no longer
# swallowed by a bare `except:`.
try:
    covarCat = '_covarCat' + '_'.join(categorical_covariate_keys).replace('_', ' ').title().replace(' ', '')
except (NameError, TypeError):
    covarCat = '_covarCatNone'

try:
    covarCont = '_covarCont' + '_'.join(continuous_covariate_keys).replace('_', ' ').title().replace(' ', '')
except (NameError, TypeError):
    covarCont = '_covarContNone'

try:
    labels = '_labels' + ''.join(labels_key).replace('_', ' ').title().replace(' ', '')
except (NameError, TypeError):
    labels = '_labelsNone'

#deep = '_inject' + str(inject)
layers = '_layers' + str(n_layers)
hidden = '_hidden' + str(n_hidden)
latent = '_latent' + str(n_latent)

model_type = '_scANVI'

# covarCat / covarCont are computed above but currently not part of the path
# (their entries are commented out below).
model_path = ''.join([
    directory_path,
    date,
    base_name,
    labels,
    #covarCat,
    #covarCont,
    layers,
    hidden,
    latent,
    model_type,
])

model_path

In [None]:
# Persist the trained scANVI model under model_path, replacing any existing save
# and storing the AnnData alongside it.
model_scanvi.save(model_path, overwrite=True, save_anndata=True)

### Results

In [None]:
# Store the scANVI latent embedding of every cell in obsm['X_scANVI'].
adata.obsm['X_scANVI'] = model_scanvi.get_latent_representation()

In [None]:
# Rebuild the neighbour graph on the scANVI latent space and re-cluster with Leiden.
# NOTE(review): no key_added is given, so this presumably overwrites the scVI-based
# graph and 'leiden' labels computed earlier — confirm that is intended.
sc.pp.neighbors(adata, use_rep='X_scANVI')
sc.tl.leiden(adata)

In [None]:
# Recompute the UMAP from the scANVI-based neighbour graph.
sc.tl.umap(adata)

In [None]:
# scANVI UMAP coloured by sample, initial annotation, Leiden cluster and each marker gene.
sc.pl.umap(
    adata,
    color=['sample', 'initial_cell_type', 'leiden', *marker_genes],
    size=10,
    add_outline=True,
    alpha=1,
    outline_width=(0.3, 0.0),
    ncols=3,
    cmap=mymap,
    wspace=0.9,
)

In [None]:
# Per-sample density plot using the shared colormap.
# NOTE(review): plot_embedding_density_kde is not defined in this notebook
# (presumably from utils.ipynb); which embedding it draws on is not visible here — confirm.
plot_embedding_density_kde(adata=adata, groupby='sample', cmap_kde= mymap)

In [None]:
# Release the scANVI objects before the final write.
# `adata_scanvi` is never created in this notebook, so an unconditional `del`
# raised NameError and aborted the run before the model itself was freed.
# Popping by name drops whatever exists and keeps the cell safe to re-run.
globals().pop('adata_scanvi', None)
globals().pop('model_scanvi', None)
gc.collect()

In [None]:
# Write the final AnnData, which now also carries the scANVI results.
adata.write('/mnt/hdd/data/Healthy/adata_markedDoublets_normalized_initialAnno_noimmune_multivi_orig_wodblts_metadata_scvi_scanvi.h5ad')