In [None]:
"""
Example Code for Running scANVI

Code was developed for scvi-tools version 0.9.3,
but should work for versions up to 0.14.6. From
version 0.15.0 onward, the setup_anndata function
has been slightly altered (see scvi-tools tutorials).
"""

In [None]:
#%% scANVI: load important modules

import torch
import scvi
import scanpy
import sys
import csv
import numpy as np
import pandas as pd

In [None]:
#%% scANVI: define paths to data files

"""
variable_genes_file = path_to_variable_genes

A .csv file with two columns corresponding to gene names and
their indices within the dataset (see variable_genes.csv files 
for example). We recommend using variable genes defined by
Seurat's VST algorithm. NOTE: If managing data in both R and
Python, be sure to account for differences in indexing between
the languages (e.g., Gene 1 in R would map to Gene 0 in Python).

dataset_file = path_to_dataset

A .csv file of the expression matrix, with genes as rows and
cells as columns (standard format in Seurat). Gene names and
cell barcodes should be included. If your dataset contains cells
as rows and genes as columns (standard format in Scanpy), then
do not transpose the adata object in the "scANVI: create AnnData"
cell. Alternatively, if your data is in the .h5 or .mtx formats
(common outputs from CellRanger), you should use the appropriate
Scanpy data-reading functions (e.g., scanpy.read_10x_h5 or
scanpy.read_10x_mtx) in the "scANVI: create AnnData" cell.

batch_id_file = path_to_batch_file

A .csv file with a single column corresponding to the batch
variable for each cell. Batch values must be numeric, because
the file is loaded as floating-point numbers. This file should
have neither headers nor row / column names.

scANVI_label_path = path_to_input_labels

A .csv file with a single column corresponding to the identity
of each cell. Cells that are un-annotated should be labeled
"Unknown". This file should have neither headers nor row / column names.
"""

In [None]:
#%% scANVI: variable genes

#retrieve highly variable genes
#each row of the file holds a gene name (column 0) followed by
#that gene's integer index within the dataset (column 1)
var_gene_index = []
var_gene_name = []

#newline="" is what the csv module documentation asks for when a file
#object is handed to csv.reader, so quoted fields and line endings parse correctly
with open(variable_genes_file, newline = "") as csvfile:
    readCSV = csv.reader(csvfile, delimiter = ",")
    for row in readCSV:
        if not row:
            continue #csv.reader yields [] for a blank line; skip it instead of failing on row[1]
        var_gene_index.append(int(row[1]))
        var_gene_name.append(row[0])

print("Number of variable genes: " + str(len(var_gene_name)))

In [None]:
#%% scANVI: batch identities

#read the batch file as floats and store the values as a
#single-column array (one row per entry of the file)
batch_identities = np.loadtxt(batch_id_file, dtype = np.float64, delimiter = ",")
batch_identities = batch_identities.reshape((len(batch_identities), 1))

#list the distinct batch values that were found
print(f"Batch Identities: {np.unique(batch_identities)}")

In [None]:
#%% scANVI: input cell labels

#load the label file (read without a header row, as a single column
#named "labels") and keep the values as a plain Python list
label_table = pd.read_csv(scANVI_label_path, sep = ",", names = ["labels"])
scANVI_input_labels = label_table["labels"].tolist()

#show each distinct input label together with its number of occurrences
label_summary = np.unique(scANVI_input_labels, return_counts = True)
print("scANVI Input Labels: \n" + str(label_summary))

In [None]:
#%% scANVI: output files

directory_path = "./" #folder location where you want to save scANVI outputs

#every output path is the output folder followed by a fixed file (or folder) name
model_save_file = f"{directory_path}scANVI_model"
adata_save_file = f"{directory_path}scANVI_adata.h5ad"
latent_save_file = f"{directory_path}scANVI_latent.csv"
normalized_save_file = f"{directory_path}scANVI_normalized.csv"
imputed_save_file = f"{directory_path}scANVI_imputed.csv"
label_transfer_save_file = f"{directory_path}scANVI_label_transfer.csv"
label_probabilities_save_file = f"{directory_path}scANVI_label_probabilities.csv"

#print each path under its own heading, with a blank line between entries
save_path_report = [
    ("Model Save Path:\n ", model_save_file),
    ("AnnData Save Path:\n ", adata_save_file),
    ("Latent Save Path:\n ", latent_save_file),
    ("Normalized Save Path:\n ", normalized_save_file),
    ("Imputed Save Path:\n ", imputed_save_file),
    ("Label Transfer Save Path:\n ", label_transfer_save_file),
    ("Label Probabilities Save Path:\n ", label_probabilities_save_file),
]
print("\n\n".join(heading + save_path for heading, save_path in save_path_report))

In [None]:
#%% scANVI: create AnnData

#build the Scanpy AnnData object

#scanpy.read_csv loads the expression matrix here; for .h5 or .mtx input,
#scanpy.read_10x_h5 or scanpy.read_10x_mtx can be used instead.
#reading and writing large datasets as .csv files can be very slow,
#so those alternative functions will usually be faster.

adata = scanpy.read_csv(dataset_file, first_column_names=True)
adata = adata.transpose() #do not transpose if cells are rows and genes are columns
adata = adata[:, var_gene_name].copy() #keep only the variable genes

#attach batch identities and scANVI input labels to the AnnData object
adata.obs["batch"] = batch_identities
adata.obs["scANVI_input_labels"] = scANVI_input_labels

print(adata)

#number of cells per batch: first all counts on one line, then one batch per line
print("Batch count:")
batch_column = adata.obs["batch"]
batch_values = set(batch_column)
print(*[sum(batch_column == batch) for batch in batch_values])
for batch in batch_values:
    print(batch, sum(batch_column == batch))

#distinct input labels and how many cells carry each of them
print(np.unique(adata.obs["scANVI_input_labels"], return_counts = True))

In [None]:
#%% scANVI: specify model parameters (unsupervised + semisupervised)

#n_latent (the dimensionality of the latent space) is read below but is not
#assigned anywhere in this notebook; without a value this cell fails with a
#NameError on a fresh kernel. A value the user has already defined is kept;
#otherwise fall back to 10, the scvi-tools default for scvi.model.SCVI.
#NOTE(review): the value used in the paper is not recorded here -- confirm.
if "n_latent" not in globals():
    n_latent = 10

#parameters used in the Worley, Everetts, et al. paper
#can be altered to user's preference
#entries that are not passed to a model in this notebook ("use_cuda", "python",
#"scvi_version", "scANVI_input_labels_path") are kept so that they appear in the
#parameter printout below
scvi_params = {"use_cuda" : torch.cuda.is_available(),
               "n_layers" : 2,
               "n_latent" : n_latent,
               "gene_likelihood" : "nb",
               "scVI_n_epochs_unsupervised" : 400,
               "scANVI_n_epochs_semisupervised" : 50,
               "train_size" : 0.8,
               "python" : sys.executable,
               "scvi_version" : scvi.__version__, #key previously carried a stray trailing colon
               "scANVI_n_samples_per_label" : 150,
               "scANVI_input_labels_path" : scANVI_label_path}

#Notes about parameters used:
#We first train an scVI model to the data without any label information.
#See "scVI_n_epochs_unsupervised" value.
#This allows the model to initially learn a set of unsupervised weights for the data.
#These unsupervised weights are then used to initialize a scANVI model.
#The scANVI model is trained on the dataset with input labels (semisupervised).
#See "scANVI_n_epochs_semisupervised" value.
#If certain labels have a low number of cells, they might be eliminated from scANVI's predictions.
#Decreasing the "scANVI_n_samples_per_label" value can prevent labels from being dropped.

#print every parameter name followed by its value on the next line
for key, val in scvi_params.items():
    print(key, val, sep="\n")

In [None]:
#%% scANVI: create the model (unsupervised + semisupervised)

#scvi.data.setup_anndata is the registration call for scvi-tools versions
#before 0.15; from 0.15 onward, scvi.model.SCVI.setup_anndata should be
#used in its place (see the scvi-tools tutorials)

#register the batch and label columns of adata with scvi-tools
scvi.data.setup_anndata(adata, batch_key = "batch", labels_key = "scANVI_input_labels")

#build the scVI model from the architecture settings stored in scvi_params
scVI_architecture = {name : scvi_params[name]
                     for name in ("n_latent", "n_layers", "gene_likelihood")}
scVI_model = scvi.model.SCVI(adata, **scVI_architecture)

In [None]:
#%% scANVI: train the model (unsupervised + semisupervised)

#train the scVI model without input labels (unsupervised training)
scVI_model.train(max_epochs = scvi_params["scVI_n_epochs_unsupervised"],
                 train_size = scvi_params["train_size"])

#initialize a scANVI model using the scVI model weights;
#"Unknown" is the label that marks un-annotated cells
scANVI_model = scvi.model.SCANVI.from_scvi_model(scVI_model,
                                                 unlabeled_category = "Unknown",
                                                 adata = adata)

#train the scANVI model with input labels (semisupervised training)
scANVI_model.train(max_epochs = scvi_params["scANVI_n_epochs_semisupervised"],
                   train_size = scvi_params["train_size"],
                   n_samples_per_label = scvi_params["scANVI_n_samples_per_label"])

#hard label predictions: computed once and reused for both the AnnData column
#and the label-transfer output, instead of running the same prediction pass
#over the whole dataset twice
model_predictions = scANVI_model.predict(adata, soft = False)
adata.obs["scANVI_predict"] = model_predictions

#remaining model outputs that are written to file afterwards
model_latent = scANVI_model.get_latent_representation()
model_normalized = scANVI_model.get_normalized_expression()
model_imputed = scANVI_model.get_normalized_expression(library_size = "latent")
model_predict_prob = scANVI_model.predict(adata, soft = True) #per-label probabilities

#save all of the output to file
#overwrite=True lets the notebook be re-run: without it, scANVI_model.save raises
#when the model folder already exists, i.e. after training has finished and
#before any of the outputs below are written. Every other output in this cell
#already replaces an existing file of the same name.
scANVI_model.save(dir_path = model_save_file, overwrite = True)
adata.write(filename = adata_save_file)
np.savetxt(latent_save_file, model_latent, fmt='%s', delimiter = ",")
np.savetxt(normalized_save_file, model_normalized, fmt='%s', delimiter = ",")
np.savetxt(imputed_save_file, model_imputed, fmt='%s', delimiter = ",")
np.savetxt(label_transfer_save_file, model_predictions, fmt='%s', delimiter = ",")
np.savetxt(label_probabilities_save_file, model_predict_prob, fmt='%s', delimiter = ",")
#Outputs:
#model_save_file: folder containing trained scANVI model
#adata_save_file: AnnData used for model training, in .h5ad format
#latent_save_file: CSV file containing the latent representation (matrix) of the data
#normalized_save_file: CSV file containing the denoised expression matrix, scaled to 1
#imputed_save_file: CSV file containing the denoised expression matrix, scaled to library size
#label_probabilities_save_file: CSV file containing the prediction probabilities
#for each label on each cell
#label_transfer_save_file: CSV file containing only the predicted labels for each cell
#which corresponds to the label with the maximum prediction probability
#NOTE: the CSV files are written with np.savetxt, which stores values only
#(no header row and no row names)

#save a log of parameters used (one "name<TAB>value" line per parameter)
#the with-statement closes the log file even if one of the writes fails
with open(directory_path + "scANVI_param_log.txt", "w") as output_log:
    for key, val in scvi_params.items():
        output_log.write(key + "\t" + str(val) + "\n")