Following the Scanpy PBMC3k tutorial: https://scanpy-tutorials.readthedocs.io/en/latest/pbmc3k.html

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

In [None]:
# Global scanpy configuration for this notebook.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Log the header with the versions of scanpy and its dependencies.
sc.logging.print_header()
# Figure defaults: 80 dpi on a white background.
sc.settings.set_figure_params(dpi=80, facecolor='white')

In [None]:
# Sample `AA_patient_1`: read the 10x directory that holds the `.mtx` file.
aa1data = sc.read_10x_mtx(
    '../data/Huetal2022/AA_patient_1/',
    var_names='gene_symbols',  # index the variables axis by gene symbol
    cache=True,                # write a cache file so later reads are faster
)


In [None]:
# Make the gene-symbol variable names unique, in place.
aa1data.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

In [None]:
# Display the per-cell (obs) annotation table of the first sample.
aa1data.obs

In [None]:
# Sample `AA_patient_2`: read the 10x directory that holds the `.mtx` file.
aa2data = sc.read_10x_mtx(
    '../data/Huetal2022/AA_patient_2/',
    var_names='gene_symbols',  # index the variables axis by gene symbol
    cache=True,                # write a cache file so later reads are faster
)

# Make the variable names unique (not needed when reading with `var_names='gene_ids'`).
aa2data.var_names_make_unique()

# Display the per-cell (obs) annotation table.
aa2data.obs

In [None]:
# Sample `N_patient_1`: read the 10x directory that holds the `.mtx` file.
n1data = sc.read_10x_mtx(
    '../data/Huetal2022/N_patient_1/',
    var_names='gene_symbols',  # index the variables axis by gene symbol
    cache=True,                # write a cache file so later reads are faster
)

# Make the variable names unique (not needed when reading with `var_names='gene_ids'`).
n1data.var_names_make_unique()

# Display the per-cell (obs) annotation table.
n1data.obs

In [None]:
# Sample `N_patient_2`: read the 10x directory that holds the `.mtx` file.
n2data = sc.read_10x_mtx(
    '../data/Huetal2022/N_patient_2/',
    var_names='gene_symbols',  # index the variables axis by gene symbol
    cache=True,                # write a cache file so later reads are faster
)

# Make the variable names unique (not needed when reading with `var_names='gene_ids'`).
n2data.var_names_make_unique()

# Display the AnnData summary.
n2data

In [None]:
# Stack the four samples into one AnnData. `label="dataset"` adds an obs column that
# identifies the source object of every cell; later cells select samples through its
# values "0" to "3", which follow the order of the list below.
total = ad.concat(
    [aa1data, aa2data, n1data, n2data],
    merge="same",
    label="dataset",
)
total.var_names_make_unique()

# Display the combined per-cell annotation table.
total.obs

In [None]:
# Read the cell-typing results: tab-delimited text, first column used as row names,
# every value kept as a string.
celltypes = sc.read_text(
    '../data/Huetal2022/celltyping_results.txt',
    first_column_names=True,
    delimiter='\t',
    dtype=str,
)
# celltypes.obs

In [None]:
# Join the cell-typing table onto `total` along axis=1 (the variables axis).
# NOTE(review): with ad.concat's defaults (join="inner", merge=None) this presumably keeps
# only the cells present in both objects and may not carry over `total.obs` columns such as
# `dataset`, which later cells rely on — confirm this is the intended way to attach cell types
# (a join on `total.obs` may be what is wanted).
total = ad.concat([total, celltypes], axis=1)

In [None]:
# Basic quality filtering, applied to `total` in place.
# Drop cells with fewer than 200 detected genes.
sc.pp.filter_cells(total, min_genes=200)
# Drop genes detected in fewer than 3 cells.
sc.pp.filter_genes(total, min_cells=3)

In [None]:
# Display the AnnData summary after filtering.
total

In [None]:
# Snapshot the current, pre-normalization matrix as a layer; the scVI setup further down
# reads layer "counts".
total.layers["counts"] = total.X.copy() #this is needed for scvi later

In [None]:
# Normalize every cell to a total of 10,000 counts (target_sum=1e4).
sc.pp.normalize_total(total, target_sum=1e4)

In [None]:
# Log-transform the normalized matrix with log1p, i.e. log(1 + x).
sc.pp.log1p(total)

In [None]:
# Flag highly variable genes using mean and dispersion cut-offs; the flag is read back
# from `total.var.highly_variable` when the matrix is subset below.
sc.pp.highly_variable_genes(total, min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
# Plot the diagnostics of the highly-variable-gene selection.
sc.pl.highly_variable_genes(total)

In [None]:
# Keep the current state (normalized, log-transformed, all genes) in `.raw` before the
# matrix is restricted to highly variable genes.
total.raw = total

In [None]:
# Restrict `total` to the genes flagged as highly variable.
# NOTE(review): the slice is presumably an AnnData view; consider appending `.copy()`
# since in-place operations follow — confirm.
total = total[:, total.var.highly_variable]

In [None]:
# Scale the genes of `total`, clipping values at max_value=10.
# The call must target `total`, the object every other cell works on; no `adata`
# variable is defined in the cells shown, so the previous call raised a NameError.
sc.pp.scale(total, max_value=10)

In [None]:
# Run PCA on `total` with the ARPACK SVD solver.
sc.tl.pca(total, svd_solver='arpack')

In [None]:
# PCA scatter plot coloured by the `dataset` (source sample) column.
sc.pl.pca(total, color='dataset')

In [None]:
# Variance ratio per principal component, on a linear (non-log) scale.
sc.pl.pca_variance_ratio(total, log=False)

In [None]:
# Build the neighbourhood graph with 10 neighbours per cell, using 50 principal components.
sc.pp.neighbors(total, n_neighbors=10, n_pcs=50)

In [None]:
# Compute the UMAP embedding from the neighbourhood graph built above.
sc.tl.umap(total)

In [None]:
# UMAP coloured by the `dataset` (source sample) column.
sc.pl.umap(total, color=['dataset'])

---

scVI on `total`, using the count matrix stored in the `counts` layer

In [None]:
import scvi

In [None]:
# Register `total` with scVI: the model reads the "counts" layer and receives `dataset`
# as a categorical covariate.
scvi.model.SCVI.setup_anndata(total, layer="counts", categorical_covariate_keys=["dataset"])

In [None]:
# Instantiate an scVI model on the registered AnnData, with no arguments beyond the data,
# and display its summary.
vae = scvi.model.SCVI(total)
vae

In [None]:
# Train the model with the default training settings.
# NOTE(review): no random seed is set in the cells shown, so results may differ between runs.
vae.train()

In [None]:
# Display the model summary again, now after training.
vae

In [None]:
# Latent representation from the trained model, stored in `.obsm` under "X_scvi".
# `latent` is reused further down for the batch-corrected visualization.
latent = vae.get_latent_representation()
total.obsm["X_scvi"] = latent

In [None]:
# Rebuild the neighbourhood graph on the scVI latent space instead of PCA,
# then display the stored distance matrix.
sc.pp.neighbors(total, use_rep="X_scvi")
total.obsp["distances"]

In [None]:
# Recompute the UMAP from the scVI-based neighbourhood graph and colour it by `dataset`.
sc.tl.umap(total)
sc.pl.umap(total, color=['dataset'])

In [None]:
# Store the model's normalized expression estimates in `.obsm` and display them.
total.obsm["X_normalized_scVI"] = vae.get_normalized_expression()
total.obsm["X_normalized_scVI"]

Differential expression between individuals

In [None]:
# Differential expression between the individuals recorded in the `dataset` obs column,
# the same column that was registered as a covariate when setting up the model.
# `groupby` is required here: without `groupby` (or explicit `idx1`/`idx2`), scvi-tools
# has no groups to compare and the call fails.
diffexpdf = vae.differential_expression(total, groupby="dataset")
diffexpdf

In [None]:
# Persist the trained model to disk.
# `overwrite=True` keeps the notebook re-runnable from a fresh kernel: by default
# scvi-tools refuses to save into a directory that already exists, so a second
# Restart-and-Run-All would stop here.
vae.save("vae_SCVI_trained_with_donor_as_covariate", overwrite=True)

In [None]:
# Cells labelled "0" and "1" in `dataset` (the first two objects of the original concat),
# re-assembled into a standalone AnnData.
aa_subset = ad.concat(
    [total[total.obs["dataset"] == key] for key in ("0", "1")]
)

# Latent representation of that subset from the trained model.
latent_aa_subset = vae.get_latent_representation(aa_subset)

In [None]:
# Cells labelled "2" and "3" in `dataset` (the last two objects of the original concat),
# re-assembled into a standalone AnnData.
h_subset = ad.concat(
    [total[total.obs["dataset"] == key] for key in ("2", "3")]
)

# Latent representation of that subset from the trained model.
latent_h_subset = vae.get_latent_representation(h_subset)

Demonstrate the presence of nuisance variation

In [None]:
# Run PCA, then rebuild the neighbourhood graph (30 PCs, 20 neighbours) and the UMAP
# (min_dist=0.3). No `use_rep` is passed, so the scVI latent space is not used here.
sc.tl.pca(total)
sc.pp.neighbors(total, n_pcs=30, n_neighbors=20)
sc.tl.umap(total, min_dist=0.3)

In [None]:
# UMAP of the PCA-based embedding, coloured by `dataset`, drawn without a frame.
sc.pl.umap(total, color=["dataset"], frameon=False)

Batch-corrected visualization

In [None]:

# Store the scVI latent representation computed earlier (`latent`) under "X_scVI".
# The same array was already stored under "X_scvi" (lower-case) in an earlier cell.
total.obsm["X_scVI"] = latent

# use scVI latent space for UMAP generation
sc.pp.neighbors(total, use_rep="X_scVI")
sc.tl.umap(total, min_dist=0.3)

In [None]:
# UMAP of the scVI-based embedding, coloured by `dataset`, drawn without a frame.
sc.pl.umap(total, color=["dataset"], frameon=False)

In [None]:
# Display the per-cell annotation table at the end of the analysis.
total.obs