# SERPENTINE Integration Methods

## Set Up Environment

In [None]:
# Load libraries

# Python packages
import numpy as np
import scanpy as sc
import scvi
import bbknn
import scib
import harmonypy
# import scgen

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

# R interface
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
import rpy2.rinterface_lib.callbacks
import anndata2ri

# switch on the rpy2 conversion helpers for pandas and AnnData objects
pandas2ri.activate()
anndata2ri.activate()

# load the rpy2 IPython extension so R code can be run from notebook cells
%load_ext rpy2.ipython

# suppress FutureWarning messages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
# keep a handle on the current stderr stream and open a sink on the null device
# NOTE(review): neither `_stderr` nor `null` is used in the visible part of the notebook,
# and `null` is never closed — confirm whether they are still needed
_stderr = sys.stderr
null = open(os.devnull,'wb')

In [None]:
# show which NumPy release is installed
import numpy as np
np.__version__

In [None]:
# set up working directory
# project root; the data and figure paths used in this notebook are built from it
work_dir = "/scratch_isilon/groups/singlecell/gdeuner/SERPENTINE/"

In [None]:
# figures for this notebook go to the per-sample integration folder under work_dir
sc.settings.figdir = os.path.join(
    work_dir,
    "figures",
    "combined",
    "integration",
    "by_sample/",
)

# 600 dpi both for inline rendering and for saved files
sc.set_figure_params(dpi_save=600, dpi=600)

In [None]:
# obs columns used throughout the integration: the cell-type label and the batch covariate
label_key, batch_key = "Annotation_2.0", "sample"

In [None]:
# import data
# reads the combined, annotated AnnData object from <work_dir>/data/outputdata
adata = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "Combined_SCR_CO2_annotated_2.0_TCR_14-02-24.h5ad"))

In [None]:
# keep a second handle on the object as loaded
# NOTE(review): this is an alias, not a copy — it keeps pointing at the loaded object only
# because `adata` is re-bound to a filtered subset later on; confirm that is the intent
adata_raw = adata

In [None]:
## subset for testing
#subset_cells = np.random.choice(adata.obs_names, size=1000, replace=False)
#adata = adata[subset_cells, :].copy()

## Preprocessing

In [None]:
# remove NOISE clusters from previous patient-specific cell type annotation
# .copy() turns the subset into a standalone object: the cells that follow write layers and
# obs columns on `adata`, which on a view forces an implicit copy and an
# ImplicitModificationWarning
adata = adata[adata.obs["Annotation_2.0"] != "NOISE"].copy()

In [None]:
# store raw counts
# NOTE(review): assumes adata.raw holds the unnormalised counts for the same genes as adata.X — confirm
adata.layers["counts"] = adata.raw.X

In [None]:
# normalization
# rescale every cell to 10,000 total counts
# NOTE(review): scanpy documents normalize_per_cell as deprecated in favour of
# sc.pp.normalize_total(adata, target_sum=1e4); the two differ in how cells are filtered,
# so verify the results before switching
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)

In [None]:
# log transform the data.
# log1p is applied in place to adata.X
sc.pp.log1p(adata)

In [None]:
# store log-normalised counts
# copy so the layer does not share memory with adata.X; otherwise any later in-place
# transformation of X would silently change the "logcounts" layer as well
adata.layers["logcounts"] = adata.X.copy()

In [None]:
# identify highly variable genes.
# first pass over all cells together; the selection is repeated per batch further down
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
# PCA of the log-normalised data (scanpy defaults)
sc.tl.pca(adata)

In [None]:
# select optimal number of PCs
# variance-ratio plot on a log scale for the first 50 components
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50)

In [None]:
# number of principal components used for every neighbour graph in this notebook
n_pcs=50

In [None]:
# neighbour graph on the first n_pcs principal components
sc.pp.neighbors(adata, n_pcs=n_pcs)

In [None]:
# UMAP of the unintegrated data, used as the baseline for the integration methods
sc.tl.umap(adata)

### Preparation

In [None]:
# standardize annotations
# list the distinct Annotation_2.0 labels to spot spelling/case variants
adata.obs["Annotation_2.0"].unique()

In [None]:
# harmonise the spelling of the Annotation_2.0 labels, one substitution at a time
for old_label, new_label in (
    ("B cells", "B Cells"),
    ("Plasma cells", "Plasma Cells"),
    ("MAST", "Mast"),
):
    adata.obs['Annotation_2.0'] = adata.obs['Annotation_2.0'].replace({old_label: new_label})

In [None]:
# list the distinct Annotation_1.0 labels to spot spelling/case variants
adata.obs["Annotation_1.0"].unique()

In [None]:
# harmonise the spelling of the Annotation_1.0 labels, one substitution at a time
for old_label, new_label in (
    ("B cells", "B Cells"),
    ("T cells", "T Cells"),
    ("Plasma cells", "Plasma Cells"),
):
    adata.obs['Annotation_1.0'] = adata.obs['Annotation_1.0'].replace({old_label: new_label})

In [None]:
# adapt format so scIB works: the label and batch columns are cast to categorical
for obs_column in ("Annotation_2.0", "Annotation_1.0", "sample"):
    adata.obs[obs_column] = adata.obs[obs_column].astype("category")

In [None]:
# rename the '_index' column of the raw .var table to 'features'
# NOTE(review): presumably needed so the object can be written to h5ad (a column called
# '_index' clashes with the on-disk index) — confirm; this reaches into private AnnData
# attributes and may break with other anndata versions
adata.__dict__['_raw'].__dict__['_var'] = adata.__dict__['_raw'].__dict__['_var'].rename(columns={'_index': 'features'})

In [None]:
# bring the patient and timepoint identifiers to a common format
obs_fixes = (
    ("patient", {"P08": "08"}),
    ("timepoint", {"C2": "C02"}),
)
for obs_column, mapping in obs_fixes:
    adata.obs[obs_column] = adata.obs[obs_column].replace(mapping)

In [None]:
# number of cells per batch
adata.obs[batch_key].value_counts()

In [None]:
# unintegrated UMAP coloured by the Annotation_1.0 labels and by batch
sc.pl.umap(adata, color=["Annotation_1.0", batch_key], wspace=1)

In [None]:
# unintegrated UMAP coloured by the label_key annotation and by batch
sc.pl.umap(adata, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# one unintegrated UMAP per colouring; nothing is shown inline
for obs_column, png_name in (
    ("Annotation_1.0", "Combined_unintegrated_annotation1.0_umap.png"),
    ("Annotation_2.0", "Combined_unintegrated_annotation2.0_umap.png"),
    ("sample", "Combined_unintegrated_sample_id_umap.png"),
):
    sc.pl.umap(adata, color=obs_column, show=False, save=png_name)

In [None]:
# identify highly variable genes per batch
# use the shared batch_key (instead of a second hard-coded "sample") so that HVG selection
# and the integration methods always agree on the batch covariate
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5, batch_key=batch_key)

In [None]:
# tabulate the "highly_variable_nbatches" values across genes and show them as a bar chart
n_batches = adata.var["highly_variable_nbatches"].value_counts()
ax = n_batches.plot.bar()
n_batches

In [None]:
# object with just the HVG
# .copy() makes the subset independent of `adata`
adata_hvg = adata[:, adata.var["highly_variable"]].copy()
adata_hvg

In [None]:
# save adata object
# written so the integration section below can be started from this file
adata_hvg.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_HVG_22-02-24.h5ad"))

In [None]:
# inspect the cell-level metadata of the HVG object
adata_hvg.obs

## Integration Methods

In [None]:
# import adata object
# re-reads the HVG object written in the preprocessing section
adata_hvg = sc.read_h5ad(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_HVG_22-02-24.h5ad"))

### scVI

In [None]:
# create object specific to scVI
# work on a copy so adata_hvg stays unchanged for the other methods
adata_scvi = adata_hvg.copy()

In [None]:
# prepare object
# register the raw-count layer and the batch covariate with scVI
scvi.model.SCVI.setup_anndata(adata_scvi, layer="counts", batch_key=batch_key)
adata_scvi

In [None]:
# create the (still untrained) scVI model
model_scvi = scvi.model.SCVI(adata_scvi)
model_scvi

In [None]:
# visualize model
# prints the fields registered by setup_anndata
model_scvi.view_anndata_setup()

In [None]:
# number of training epochs: 400 for up to 20,000 cells, scaled down proportionally above that
# computed from adata_scvi (the object the model was set up on) so this cell also works when
# the notebook is resumed from the saved HVG file and `adata` is not in memory;
# int(...) yields a plain Python integer rather than a numpy scalar
max_epochs_scvi = int(min(round((20000 / adata_scvi.n_obs) * 400), 400))
max_epochs_scvi

In [None]:
# train scVI with the explicitly computed epoch budget (max_epochs_scvi) instead of leaving
# that value unused; int(...) guards against a numpy scalar being passed through
model_scvi.train(max_epochs=int(max_epochs_scvi))

In [None]:
# extract the embedding
# the scVI latent representation is stored under obsm["X_scVI"]
adata_scvi.obsm["X_scVI"] = model_scvi.get_latent_representation()

In [None]:
# batch-corrected visualization
# neighbour graph and UMAP are computed on the scVI latent space
sc.pp.neighbors(adata_scvi, use_rep="X_scVI")
sc.tl.umap(adata_scvi)
adata_scvi

In [None]:
# scVI-integrated UMAP coloured by the label_key annotation and by batch
sc.pl.umap(adata_scvi, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# one scVI-integrated UMAP per colouring; nothing is shown inline
for obs_column, png_name in (
    ("Annotation_1.0", "Combined_scvi_integrated-sample_annotation1.0_umap.png"),
    ("Annotation_2.0", "Combined_scvi_integrated-sample_annotation2.0_umap.png"),
    ("sample", "Combined_scvi_integrated-sample_sample_id_umap.png"),
):
    sc.pl.umap(adata_scvi, color=obs_column, show=False, save=png_name)

In [None]:
# save adata object
# scVI-integrated object, including obsm["X_scVI"] and the UMAP computed on it
adata_scvi.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_scVI_integrated-sample_22-02-24.h5ad"))

### scANVI (cell label = "Annotation_2.0")

In [None]:
# build the scANVI model from the existing scVI model
# Normally we would need to run scVI first but we have already done that here
# model_scvi = scvi.model.SCVI(adata_scvi) etc.
# labels are read from the label_key column; "unlabelled" is the name given to the
# unlabeled category
model_scanvi = scvi.model.SCANVI.from_scvi_model(
    model_scvi, labels_key=label_key, unlabeled_category="unlabelled"
)
print(model_scanvi)
model_scanvi.view_anndata_setup()

In [None]:
# train the model
# epoch budget: a third of the scVI epochs, clamped to the range [2, 10]
max_epochs_scanvi = int(min(10, max(2, round(max_epochs_scvi / 3.0))))
model_scanvi.train(max_epochs=max_epochs_scanvi)

In [None]:
# extract latent representation from the model and create a new UMAP embedding
# starts from a copy of adata_scvi, so the scVI embedding is carried along unchanged
adata_scanvi = adata_scvi.copy()
adata_scanvi.obsm["X_scANVI"] = model_scanvi.get_latent_representation()
# neighbour graph and UMAP are recomputed on the scANVI latent space
sc.pp.neighbors(adata_scanvi, use_rep="X_scANVI")
sc.tl.umap(adata_scanvi)
sc.pl.umap(adata_scanvi, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# scANVI (trained on Annotation_2.0) UMAPs, one file per colouring; all three file names
# carry the same "anno2.0" tag (the second one previously read "ann02.0")
sc.pl.umap(adata_scanvi, color="Annotation_1.0", show=False, save="Combined_scanvi_integrated-sample_anno2.0_annotation1.0_umap.png")
sc.pl.umap(adata_scanvi, color="Annotation_2.0", show=False, save="Combined_scanvi_integrated-sample_anno2.0_annotation2.0_umap.png")
sc.pl.umap(adata_scanvi, color="sample", show=False, save="Combined_scanvi_integrated-sample_anno2.0_sample_id_umap.png")

In [None]:
# save adata object
# scANVI-integrated object for the model trained with the label_key annotation
adata_scanvi.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_scANVI_anno2.0_integrated-sample_22-02-24.h5ad"))

### scANVI (cell label = "Annotation_1.0")

In [None]:
# build a second scANVI model from the same scVI model, this time using Annotation_1.0 labels
# Normally we would need to run scVI first but we have already done that here
# model_scvi = scvi.model.SCVI(adata_scvi) etc.
model_scanvi_2 = scvi.model.SCANVI.from_scvi_model(
    model_scvi, labels_key="Annotation_1.0", unlabeled_category="unlabelled"
)
print(model_scanvi_2)
model_scanvi_2.view_anndata_setup()

In [None]:
# train the model
# epoch budget: a third of the scVI epochs, clamped to the range [2, 10]
max_epochs_scanvi_2 = int(min(10, max(2, round(max_epochs_scvi / 3.0))))
model_scanvi_2.train(max_epochs=max_epochs_scanvi_2)

In [None]:
# extract latent representation from the model and create a new UMAP embedding
# starts from a copy of adata_scvi and adds the latent space of the Annotation_1.0 model
adata_scanvi_2 = adata_scvi.copy()
adata_scanvi_2.obsm["X_scANVI"] = model_scanvi_2.get_latent_representation()
# neighbour graph and UMAP are recomputed on that latent space
sc.pp.neighbors(adata_scanvi_2, use_rep="X_scANVI")
sc.tl.umap(adata_scanvi_2)
sc.pl.umap(adata_scanvi_2, color=["Annotation_1.0", batch_key], wspace=1)

In [None]:
# save plots
# plot adata_scanvi_2 (the embedding of the Annotation_1.0 scANVI model); the files are
# tagged "anno1.0", so they must not be drawn from the Annotation_2.0 object adata_scanvi
sc.pl.umap(adata_scanvi_2, color="Annotation_1.0", show=False, save="Combined_scanvi_integrated-sample_anno1.0_annotation1.0_umap.png")
sc.pl.umap(adata_scanvi_2, color="Annotation_2.0", show=False, save="Combined_scanvi_integrated-sample_anno1.0_annotation2.0_umap.png")
sc.pl.umap(adata_scanvi_2, color="sample", show=False, save="Combined_scanvi_integrated-sample_anno1.0_sample_id_umap.png")

In [None]:
# save adata object
# scANVI-integrated object for the model trained with the Annotation_1.0 labels
adata_scanvi_2.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_scANVI_anno1.0_integrated-sample_22-02-24.h5ad"))

### BBKNN

In [None]:
# select number of neighbors
# datasets with more than 100,000 cells get 25 neighbours per batch, smaller ones 3
if adata_hvg.n_obs > 100000:
    neighbors_within_batch = 25
else:
    neighbors_within_batch = 3
neighbors_within_batch

In [None]:
# compute pca on log-normalised counts
# X is explicitly reset from the "logcounts" layer before the PCA
adata_bbknn = adata_hvg.copy()
adata_bbknn.X = adata_bbknn.layers["logcounts"].copy()
sc.pp.pca(adata_bbknn)

In [None]:
# run bbknn
# builds the neighbour graph with neighbors_within_batch neighbours taken from every batch
bbknn.bbknn(
    adata_bbknn, batch_key=batch_key, neighbors_within_batch=neighbors_within_batch
)
adata_bbknn

In [None]:
# reconstruct UMAP embedding using this new integrated graph
sc.tl.umap(adata_bbknn)
sc.pl.umap(adata_bbknn, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# one BBKNN-integrated UMAP per colouring; nothing is shown inline
for obs_column, png_name in (
    ("Annotation_1.0", "Combined_bbknn_integrated-sample_annotation1.0_umap.png"),
    ("Annotation_2.0", "Combined_bbknn_integrated-sample_annotation2.0_umap.png"),
    ("sample", "Combined_bbknn_integrated-sample_sample_id_umap.png"),
):
    sc.pl.umap(adata_bbknn, color=obs_column, show=False, save=png_name)

In [None]:
# save adata object
# BBKNN-integrated object with its batch-balanced graph and UMAP
adata_bbknn.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_BBKNN_integrated-sample_22-02-24.h5ad"))

### Harmony

In [None]:
# prepare anndata object
# work on a copy so adata_hvg stays unchanged for the other methods
adata_harmony = adata_hvg.copy()
adata_harmony

In [None]:
# run pca again
# recomputed on the HVG-only object
sc.tl.pca(adata_harmony)

In [None]:
# Harmony correction of the PCA embedding, using batch_key as the batch covariate
sc.external.pp.harmony_integrate(adata_harmony, batch_key)

In [None]:
# inspect the object after the correction
adata_harmony

In [None]:
# neighbour graph and UMAP on the Harmony-corrected embedding obsm["X_pca_harmony"]
# NOTE(review): n_pcs is defined in the preprocessing section; it has to be set again
# when the notebook is resumed from the saved HVG file
sc.pp.neighbors(adata_harmony, n_pcs=n_pcs, use_rep="X_pca_harmony")
sc.tl.umap(adata_harmony)

In [None]:
# Harmony-integrated UMAP coloured by the label_key annotation and by batch
sc.pl.umap(adata_harmony, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# one Harmony-integrated UMAP per colouring; nothing is shown inline
for obs_column, png_name in (
    ("Annotation_1.0", "Combined_harmony_integrated-sample_annotation1.0_umap.png"),
    ("Annotation_2.0", "Combined_harmony_integrated-sample_annotation2.0_umap.png"),
    ("sample", "Combined_harmony_integrated-sample_sample_id_umap.png"),
):
    sc.pl.umap(adata_harmony, color=obs_column, show=False, save=png_name)

In [None]:
# save adata object
# Harmony-integrated object, including obsm["X_pca_harmony"] and the UMAP computed on it
adata_harmony.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_harmony_integrated-sample_22-02-24.h5ad"))

### MNN

In [None]:
# copy of the HVG object used for the MNN correction
# NOTE(review): despite the "seurat" in its name this object is only used for MNN here
adata_seurat = adata_hvg.copy()

In [None]:
# back up the sample column; it is restored from this copy after mnn_correct
# (presumably because the correction overwrites the batch_key column — confirm)
adata_seurat.obs["sample_copy"] = adata_seurat.obs["sample"]

In [None]:
# inspect the cell-level metadata
adata_seurat.obs

In [None]:
# split the object into one AnnData subset per sample, in order of first appearance
adata_seurat_list = []
for sample_id in adata_seurat.obs[batch_key].unique():
    adata_seurat_list.append(adata_seurat[adata_seurat.obs[batch_key] == sample_id])

In [None]:
# mnnpy is imported here, presumably as the backend of sc.external.pp.mnn_correct — confirm
import mnnpy
# run the MNN correction across the per-sample objects
adata_seurat = sc.external.pp.mnn_correct(*adata_seurat_list, batch_key = batch_key) 

In [None]:
# keep the complete return value and continue with its first element
# NOTE(review): element 0 is taken to be the corrected AnnData — verify against the mnn_correct docs
adata_seurat_full = adata_seurat
adata_seurat = adata_seurat_full[0]

In [None]:
# correct sample column
# restore the original sample identifiers from the backup column
adata_seurat.obs["sample"] = adata_seurat.obs["sample_copy"]

In [None]:
# check the restored metadata
adata_seurat.obs

In [None]:
# PCA, neighbour graph and UMAP on the MNN-corrected data
sc.tl.pca(adata_seurat)
sc.pp.neighbors(adata_seurat, n_pcs=n_pcs)
sc.tl.umap(adata_seurat)
sc.pl.umap(adata_seurat, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# one MNN-integrated UMAP per colouring; nothing is shown inline
for obs_column, png_name in (
    ("Annotation_1.0", "Combined_mnn_integrated-sample_annotation1.0_umap.png"),
    ("Annotation_2.0", "Combined_mnn_integrated-sample_annotation2.0_umap.png"),
    ("sample", "Combined_mnn_integrated-sample_sample_id_umap.png"),
):
    sc.pl.umap(adata_seurat, color=obs_column, show=False, save=png_name)

In [None]:
# save adata object
# MNN-corrected object with its PCA, neighbour graph and UMAP
adata_seurat.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_MNN_integrated-sample_22-02-24.h5ad"))

### Scanorama

In [None]:
# scanorama requires the cells to be sorted by batch, so reorder a copy of the HVG object
adata_scanorama = adata_hvg.copy()
idx = adata_scanorama.obs.sort_values(by=batch_key).index
adata_scanorama = adata_scanorama[idx, :]

In [None]:
# one independent AnnData copy per batch, in order of first appearance
adata_scanorama_split = [
    adata_scanorama[adata_scanorama.obs[batch_key] == batch].copy()
    for batch in adata_scanorama.obs[batch_key].unique()
]

In [None]:
import scanorama
# Now we run Scanorama on the split data.
# return_dimred=True requests the low-dimensional embedding as well; it is read from
# obsm["X_scanorama"] below
corrected = scanorama.correct_scanpy(adata_scanorama_split, return_dimred=True)

# Merge the corrected datasets
# NOTE(review): AnnData.concatenate is documented as deprecated in favour of anndata.concat;
# the two differ in defaults (batch column, obs-name suffixes), so verify before switching
adata_scanorama_corr = corrected[0].concatenate(corrected[1:])
adata_scanorama_corr.obs_names_make_unique(join='_')

In [None]:
# neighbour graph and UMAP on the Scanorama embedding, limited to n_pcs dimensions
sc.pp.neighbors(adata_scanorama_corr, n_pcs=n_pcs, use_rep="X_scanorama")
sc.tl.umap(adata_scanorama_corr)
sc.pl.umap(adata_scanorama_corr, color=[label_key, batch_key], wspace=1)

In [None]:
# save plots
# one Scanorama-integrated UMAP per colouring; nothing is shown inline
for obs_column, png_name in (
    ("Annotation_1.0", "Combined_scanorama_integrated-sample_annotation1.0_umap.png"),
    ("Annotation_2.0", "Combined_scanorama_integrated-sample_annotation2.0_umap.png"),
    ("sample", "Combined_scanorama_integrated-sample_sample_id_umap.png"),
):
    sc.pl.umap(adata_scanorama_corr, color=obs_column, show=False, save=png_name)

In [None]:
# save adata object
# Scanorama-corrected object, including obsm["X_scanorama"] and the UMAP computed on it
adata_scanorama_corr.write(os.path.join(work_dir, "data", "outputdata", "combined", "Combined_SCR_CO2_annotated_2.0_TCR_scanorama_integrated-sample_22-02-24.h5ad"))