# SERPENTINE Integration Methods

## Set Up Environment

In [None]:
# Load libraries

# Python packages
import numpy as np
import scanpy as sc
import scvi
import bbknn
import scib
import harmonypy
# import scgen

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# R interface
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
import rpy2.rinterface_lib.callbacks
import anndata2ri

# Activate the rpy2 converters for pandas and AnnData objects
pandas2ri.activate()
anndata2ri.activate()

# IPython magic: load the rpy2 notebook extension
%load_ext rpy2.ipython

# suppress warnings (FutureWarning only)
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
# Keep a reference to the current stderr and open a handle on the null device.
# NOTE(review): neither name is used in the cells visible here, and `null` is
# never closed — presumably meant for temporarily silencing stderr; confirm.
_stderr = sys.stderr
null = open(os.devnull,'wb')

In [None]:
# Display the installed NumPy version
import numpy as np
np.__version__

In [None]:
# set up working directory: root path that all input, output and figure paths below are joined onto
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"

In [None]:
# set up figures directory and figure resolution (600 dpi on screen and on save)
sc.settings.figdir = os.path.join(
    work_dir,
    "figures",
    "combined",
    "integration",
    "by_subproject/",
)
sc.set_figure_params(dpi=600, dpi_save=600)

In [None]:
# number of components passed as n_pcs to the neighbor-graph calls
n_pcs = 50

## Integration Methods

In [None]:
# read the input AnnData object from the combined output directory
adata_hvg = sc.read_h5ad(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_HVG_22-02-24.h5ad",
    )
)

In [None]:
# Build the batch label "<project>_P<patient>_<timepoint>" and store it as a categorical column
adata_hvg.obs["subproject"] = (
    adata_hvg.obs["project"].astype(str)
    + "_P"
    + adata_hvg.obs["patient"].astype(str)
    + "_"
    + adata_hvg.obs["timepoint"].astype(str)
).astype("category")

In [None]:
# obs columns used throughout: cell-type labels and the batch variable to integrate over
label_key, batch_key = "Annotation_2.0", "subproject"

In [None]:
# overview of the unintegrated embedding, one panel per obs column
sc.pl.umap(
    adata_hvg,
    color=["Annotation_1.0", "project", batch_key, "timepoint"],
    wspace=1,
)

In [None]:
# save one unintegrated UMAP per obs column; the column name doubles as the file tag
for obs_column in ("Annotation_1.0", "Annotation_2.0", "project", "subproject", "timepoint"):
    sc.pl.umap(
        adata_hvg,
        color=obs_column,
        show=False,
        save=f"Combined_unintegrated_{obs_column}_umap.png",
    )

### scVI

In [None]:
# create object specific to scVI (independent copy, so adata_hvg itself is left unmodified)
adata_scvi = adata_hvg.copy()

In [None]:
# register the "counts" layer and the batch column with scVI, then display the object
scvi.model.SCVI.setup_anndata(
    adata_scvi,
    layer="counts",
    batch_key=batch_key,
)
adata_scvi

In [None]:
# instantiate the scVI model on the prepared object (training happens in a later cell)
model_scvi = scvi.model.SCVI(adata_scvi)
model_scvi

In [None]:
# show the AnnData setup registered with the model
model_scvi.view_anndata_setup()

In [None]:
# epoch budget: 400 epochs up to 20,000 cells, scaled down proportionally for larger datasets
max_epochs_scvi = np.min([400, round(400 * (20000 / adata_hvg.n_obs))])
max_epochs_scvi

In [None]:
# Train with the epoch budget computed in the previous cell. Before, max_epochs_scvi
# was calculated but never passed, so the training length depended on the library
# default rather than on the value shown (and reused for scANVI) in this notebook.
# int() turns the NumPy integer returned by np.min into a plain Python int.
model_scvi.train(max_epochs=int(max_epochs_scvi))

In [None]:
# extract the embedding: store the model's latent representation under obsm["X_scVI"]
adata_scvi.obsm["X_scVI"] = model_scvi.get_latent_representation()

In [None]:
# batch-corrected visualization: neighbor graph on the scVI latent space, then UMAP
sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
sc.tl.umap(adata_scvi)
adata_scvi

In [None]:
# scVI-integrated UMAP colored by cell label and by batch
sc.pl.umap(adata_scvi, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_scvi,
        color=obs_column,
        show=False,
        save=f"Combined_scvi_integrated-subproject_{file_tag}_umap.png",
    )

In [None]:
# write the scVI-integrated object to disk
adata_scvi.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_scVI_integrated-subproject_01-03-24.h5ad",
    )
)

### scANVI (cell label = "Annotation_2.0")

In [None]:
# build the scANVI model from the scVI model created earlier, using label_key as cell labels
# Normally we would need to run scVI first but we have already done that here
# model_scvi = scvi.model.SCVI(adata_scvi) etc.
model_scanvi = scvi.model.SCANVI.from_scvi_model(
    model_scvi, labels_key=label_key, unlabeled_category="unlabelled"
)
print(model_scanvi)
model_scanvi.view_anndata_setup()

In [None]:
# train scANVI for a third of the scVI epoch budget, clamped to the range [2, 10]
max_epochs_scanvi = int(min(10, max(2, round(max_epochs_scvi / 3.0))))
model_scanvi.train(max_epochs=max_epochs_scanvi)

In [None]:
# extract latent representation from the model and create a new UMAP embedding
# (works on a copy of adata_scvi, so the scVI object keeps its own graph and UMAP)
adata_scanvi = adata_scvi.copy()
adata_scanvi.obsm["X_scANVI"] = model_scanvi.get_latent_representation()
sc.pp.neighbors(adata_scanvi, use_rep="X_scANVI")
sc.tl.umap(adata_scanvi)
sc.pl.umap(adata_scanvi, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_scanvi,
        color=obs_column,
        show=False,
        save=f"Combined_scanvi_integrated-subproject_anno2.0_{file_tag}_umap.png",
    )

In [None]:
# write the scANVI (Annotation_2.0 labels) object to disk
adata_scanvi.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_scANVI_anno2.0_integrated-subproject_01-03-24.h5ad",
    )
)

### scANVI (cell label = "Annotation_1.0")

In [None]:
# build a second scANVI model from the same scVI model, this time labelled by "Annotation_1.0"
# Normally we would need to run scVI first but we have already done that here
# model_scvi = scvi.model.SCVI(adata_scvi) etc.
model_scanvi_2 = scvi.model.SCANVI.from_scvi_model(
    model_scvi, labels_key="Annotation_1.0", unlabeled_category="unlabelled"
)
print(model_scanvi_2)
model_scanvi_2.view_anndata_setup()

In [None]:
# train scANVI for a third of the scVI epoch budget, clamped to the range [2, 10]
max_epochs_scanvi_2 = int(min(10, max(2, round(max_epochs_scvi / 3.0))))
model_scanvi_2.train(max_epochs=max_epochs_scanvi_2)

In [None]:
# extract latent representation from the model and create a new UMAP embedding
# (works on a copy of adata_scvi, so the scVI object keeps its own graph and UMAP)
adata_scanvi_2 = adata_scvi.copy()
adata_scanvi_2.obsm["X_scANVI"] = model_scanvi_2.get_latent_representation()
sc.pp.neighbors(adata_scanvi_2, use_rep="X_scANVI")
sc.tl.umap(adata_scanvi_2)
sc.pl.umap(adata_scanvi_2, color=["Annotation_1.0", batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_scanvi_2,
        color=obs_column,
        show=False,
        save=f"Combined_scanvi_integrated-subproject_anno1.0_{file_tag}_umap.png",
    )

In [None]:
# write the scANVI (Annotation_1.0 labels) object to disk
adata_scanvi_2.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_scANVI_anno1.0_integrated-subproject_01-03-24.h5ad",
    )
)

### BBKNN

In [None]:
# neighbors per batch for BBKNN: 25 for datasets above 100,000 cells, otherwise 3
if adata_hvg.n_obs > 100000:
    neighbors_within_batch = 25
else:
    neighbors_within_batch = 3
neighbors_within_batch

In [None]:
# compute pca on log-normalised counts
# (X of the BBKNN copy is replaced by the "logcounts" layer before running PCA)
adata_bbknn = adata_hvg.copy()
adata_bbknn.X = adata_bbknn.layers["logcounts"].copy()
sc.pp.pca(adata_bbknn)

In [None]:
# run bbknn with the batch column and the per-batch neighbor count chosen above,
# then display the object
bbknn.bbknn(
    adata_bbknn, batch_key=batch_key, neighbors_within_batch=neighbors_within_batch
)
adata_bbknn

In [None]:
# reconstruct UMAP embedding using this new integrated graph
# (no sc.pp.neighbors call here: the graph comes from the bbknn step)
sc.tl.umap(adata_bbknn)
sc.pl.umap(adata_bbknn, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_bbknn,
        color=obs_column,
        show=False,
        save=f"Combined_bbknn_integrated-subproject_{file_tag}_umap.png",
    )

In [None]:
# write the BBKNN-integrated object to disk
adata_bbknn.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_BBKNN_integrated-subproject_01-03-24.h5ad",
    )
)

### Harmony

In [None]:
# prepare anndata object: independent copy for Harmony, displayed for inspection
adata_harmony = adata_hvg.copy()
adata_harmony

In [None]:
# run pca again
# NOTE(review): unlike the BBKNN cell, X is not replaced by the "logcounts" layer
# before PCA here — confirm that adata_hvg.X already holds the intended values.
sc.tl.pca(adata_harmony)

In [None]:
# run Harmony with batch_key as the batch variable
# (the corrected embedding is read back from obsm["X_pca_harmony"] further below)
sc.external.pp.harmony_integrate(adata_harmony, batch_key)

In [None]:
# display the object to inspect what the Harmony step added
adata_harmony

In [None]:
# neighbor graph on the Harmony-corrected embedding, then UMAP
sc.pp.neighbors(adata_harmony, n_pcs=n_pcs, use_rep="X_pca_harmony")
sc.tl.umap(adata_harmony)

In [None]:
# Harmony-integrated UMAP colored by cell label and by batch
sc.pl.umap(adata_harmony, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_harmony,
        color=obs_column,
        show=False,
        save=f"Combined_harmony_integrated-subproject_{file_tag}_umap.png",
    )

In [None]:
# write the Harmony-integrated object to disk
adata_harmony.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_harmony_integrated-subproject_01-03-24.h5ad",
    )
)

### MNN

In [None]:
# independent copy used for the MNN workflow (the variable is named "seurat" but holds the MNN input)
adata_seurat = adata_hvg.copy()

In [None]:
# split the object into one subset per batch, in order of first appearance
adata_seurat_list = []
for batch_value in adata_seurat.obs[batch_key].unique():
    adata_seurat_list.append(adata_seurat[adata_seurat.obs[batch_key] == batch_value])


In [None]:
import mnnpy  # presumably imported here so a missing MNN backend fails early — confirm

# Recover the batch label of each subset so it can be handed to mnn_correct.
# Without batch_categories, the concatenation step labels batches by position
# ("0", "1", ...) and writes those into obs[batch_key], replacing the real
# subproject names that are plotted and saved afterwards.
# Note: the cell-name suffix added on concatenation then uses these labels
# instead of the positional index.
mnn_batch_labels = [str(subset.obs[batch_key].iloc[0]) for subset in adata_seurat_list]

# Run MNN correction across the per-batch subsets
adata_seurat = sc.external.pp.mnn_correct(
    *adata_seurat_list,
    batch_key=batch_key,
    batch_categories=mnn_batch_labels,
)

In [None]:
# keep the full mnn_correct return value and continue with its first element
# (assumed to be the corrected AnnData; the remaining elements are not used here — confirm against the scanpy docs)
adata_seurat_full = adata_seurat
adata_seurat = adata_seurat_full[0]

In [None]:
# PCA on the MNN-corrected object, neighbor graph on n_pcs components, UMAP, and overview plot
sc.tl.pca(adata_seurat)
sc.pp.neighbors(adata_seurat, n_pcs=n_pcs)
sc.tl.umap(adata_seurat)
sc.pl.umap(adata_seurat, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_seurat,
        color=obs_column,
        show=False,
        save=f"Combined_mnn_integrated-subproject_{file_tag}_umap.png",
    )

In [None]:
# write the MNN-corrected object to disk
adata_seurat.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_MNN_integrated-subproject_01-03-24.h5ad",
    )
)

### Scanorama

In [None]:
# independent copy for Scanorama
adata_scanorama = adata_hvg.copy()
# sort cells by batches as required by scanorama
idx = adata_scanorama.obs.sort_values(batch_key).index
# reorder the cells (rows) according to the sorted index
adata_scanorama = adata_scanorama[idx,]

In [None]:
# one independent AnnData copy per batch, in order of first appearance
adata_scanorama_split = [
    adata_scanorama[adata_scanorama.obs[batch_key] == batch_name].copy()
    for batch_name in adata_scanorama.obs[batch_key].unique()
]

In [None]:
import scanorama
# Now we run Scanorama on the split data.
corrected = scanorama.correct_scanpy(adata_scanorama_split, return_dimred=True)

# Merge the corrected datasets
adata_scanorama_corr = corrected[0].concatenate(corrected[1:])
adata_scanorama_corr.obs_names_make_unique(join='_')

In [None]:
# neighbor graph on the Scanorama embedding, UMAP, and overview plot
sc.pp.neighbors(adata_scanorama_corr, n_pcs=n_pcs, use_rep="X_scanorama")
sc.tl.umap(adata_scanorama_corr)
sc.pl.umap(adata_scanorama_corr, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots: one UMAP per (obs column, file tag) pair
for obs_column, file_tag in (
    ("Annotation_1.0", "annotation1.0"),
    ("Annotation_2.0", "annotation2.0"),
    ("sample", "sample_id"),
    ("subproject", "subproject"),
):
    sc.pl.umap(
        adata_scanorama_corr,
        color=obs_column,
        show=False,
        save=f"Combined_scanorama_integrated-subproject_{file_tag}_umap.png",
    )

In [None]:
# write the Scanorama-corrected object to disk
adata_scanorama_corr.write(
    os.path.join(
        work_dir,
        "data",
        "outputdata",
        "combined",
        "Combined_SCR_CO2_annotated_2.0_TCR_scanorama_integrated-subproject_01-03-24.h5ad",
    )
)