In [3]:
import os
os.chdir("/athena/marchionnilab/scratch/lab_data/Mohamed/pca_TME")
import scanpy as sc
import torch
import scarches as sca
from scarches.dataset.trvae.data_handling import remove_sparsity
import matplotlib.pyplot as plt
import numpy as np
import gdown
import scipy.sparse as sp
import scvi

scvi.settings.seed = 94705
scvi.settings.batch_size = 256
scvi.settings.progress_bar_style = "rich"
scvi.settings.num_threads = 16

sc.set_figure_params(figsize=(4, 4))
sc._settings.ScanpyConfig(max_memory = 200, n_jobs = 16)


%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

Global seed set to 94705


In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [5]:
sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200)
sc.set_figure_params(figsize=(4, 4))
torch.set_printoptions(precision=3, sci_mode=False, edgeitems=7)

## Reference data: mouse


In [6]:
adata_mouse = sc.read_h5ad('outs/h5ads/fapcm_fibroblasts_v6_clean_regulons_5.h5ad', chunk_size=100000)

In [7]:
# filter the na
adata_mouse.obs['cluster'] = adata_mouse.obs['cluster'].astype('str')
adata_mouse = adata_mouse[adata_mouse.obs['cluster'] != 'nan', :]
adata_mouse.obs['cluster'].value_counts()

1    2541
0    1442
5     997
3     903
6     895
7     753
4     733
2     310
Name: cluster, dtype: int64

In [8]:
# raw
adata_mouse_raw = adata_mouse.copy()
adata_mouse_raw = adata_mouse_raw.raw.to_adata()
adata_mouse_raw.X = sp.csr_matrix.todense(adata_mouse_raw.X)
adata_mouse_raw.X = adata_mouse_raw.to_df()


In [9]:
adata_mouse_raw.layers['counts'] = adata_mouse.raw.X

In [10]:
adata_mouse_raw

AnnData object with n_obs × n_vars = 8574 × 23278
    obs: 'batch', 'key', 'model', 'condition', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'n_genes', 'percent_mito', 'S_score', 'G2M_score', 'phase', 'scrubletscores', 'scrubletdoublets', 'leiden', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'endothelial', 'fibroblast', 'myofibroblast', 'dendritic', 'cCDs', 'langherhans_like', 'b', 't_nk', 'myeloid', 'mast', 'luminal', 'basal', 'notluminal', 'macrophages', 'neuroendocrine', 'seminal_vesicle_basal', 'seminal_vesicle_luminal', 'seminal_vesicle_ionocyte', 'Regulon(Arid5a)', 'Regulon(Arid5b)', 'Regulon(Ascl1)', 'Regulon(Ascl2)', 'Regulon(Atf3)', 'Regulon(Bach1)', 'Regulon(Batf)', 'Regulon(Bcl3)', 'Regulon(Cebpa)', 'Regulon(Cebpb)', 'Regulon(Cebpd)', 'Regulon(Creb5)', 'Regulon(Crem)', 'Regulon(Dusp26)', 'Regulon(Egr1)', 'Regulon(Egr2)', 'Regulon(Egr3)', 'Regulon(Egr4)', 'Regulon(Eomes)', 'Regulon(Erg)', 'Regulon(Ets1)', 

In [11]:
adata_mouse_raw.obs.model.value_counts()

MNRP DKO       4510
Fvbn           1087
T-ERG          1085
Hi-MYC          864
B6.129          656
B6              287
Pten$^{KO}$      85
Name: model, dtype: int64

In [12]:
adata_mouse_raw

AnnData object with n_obs × n_vars = 8574 × 23278
    obs: 'batch', 'key', 'model', 'condition', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'n_genes', 'percent_mito', 'S_score', 'G2M_score', 'phase', 'scrubletscores', 'scrubletdoublets', 'leiden', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var', 'endothelial', 'fibroblast', 'myofibroblast', 'dendritic', 'cCDs', 'langherhans_like', 'b', 't_nk', 'myeloid', 'mast', 'luminal', 'basal', 'notluminal', 'macrophages', 'neuroendocrine', 'seminal_vesicle_basal', 'seminal_vesicle_luminal', 'seminal_vesicle_ionocyte', 'Regulon(Arid5a)', 'Regulon(Arid5b)', 'Regulon(Ascl1)', 'Regulon(Ascl2)', 'Regulon(Atf3)', 'Regulon(Bach1)', 'Regulon(Batf)', 'Regulon(Bcl3)', 'Regulon(Cebpa)', 'Regulon(Cebpb)', 'Regulon(Cebpd)', 'Regulon(Creb5)', 'Regulon(Crem)', 'Regulon(Dusp26)', 'Regulon(Egr1)', 'Regulon(Egr2)', 'Regulon(Egr3)', 'Regulon(Egr4)', 'Regulon(Eomes)', 'Regulon(Erg)', 'Regulon(Ets1)', 

In [13]:
adata_mouse_raw.obs.drop( ['_scvi_batch','_scvi_labels','_scvi_local_l_mean','_scvi_local_l_var'], axis=1, inplace=True )


In [14]:
del adata_mouse_raw.uns['_scvi']


In [15]:
del adata_mouse_raw.obsm['X_scVI']

## Target data: human

In [16]:
# load the human data
adata_human = sc.read_h5ad('outs_human/h5ads/erg_fibroblasts_scvi_v6_regulons.h5ad', chunk_size=100000)

In [17]:
# raw
# save the normalized data (not z-scored) for cellchat
adata_human_raw = adata_human.copy()
adata_human_raw = adata_human_raw.raw.to_adata()
adata_human_raw.X = sp.csr_matrix.todense(adata_human_raw.X)
adata_human_raw.X = adata_human_raw.to_df()

In [18]:
# change the var_names to match mouse gene symbols
adata_human_raw.var_names = [gene.title() for gene in adata_human_raw.var_names]

In [19]:
adata_human_raw.var_names

Index(['A1Bg', 'A1Bg-As1', 'A1Cf', 'A2M', 'A2M-As1', 'A2Ml1', 'A2Ml1-As1',
       'A2Ml1-As2', 'A3Galt2', 'A4Galt',
       ...
       'Zw10', 'Zwilch', 'Zwint', 'Zxda', 'Zxdb', 'Zxdc', 'Zyg11A', 'Zyg11B',
       'Zyx', 'Zzef1'],
      dtype='object', length=30631)

## find common genes


In [20]:
var_names = adata_mouse_raw.var_names.intersection(adata_human_raw.var_names)
len(var_names)

12017

In [21]:
# subset
adata_mouse_raw = adata_mouse_raw[:, var_names]
adata_human_raw = adata_human_raw[:, var_names]

## Create SCANVI model and train it on fully labelled reference dataset

In [22]:
adata_mouse_raw = remove_sparsity(adata_mouse_raw)

In [23]:
adata_mouse_raw2 = adata_mouse_raw.copy()

In [24]:
scvi.model.SCVI.setup_anndata(adata_mouse_raw2, batch_key="model", layer="counts")


In [None]:
arches_params = dict(
    use_layer_norm="both",
    use_batch_norm="none",
    encode_covariates=True,
    dropout_rate=0.2,
    n_layers=2,
)

vae_ref = scvi.model.SCVI(
    adata_mouse_raw2,
    **arches_params
)
vae_ref.train()

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
Set SLURM handle signals.


Epoch 400/400: 100%|█████████▉| 399/400 [33:35<00:04,  4.92s/it, loss=3.47e+03, v_num=1]

In [24]:
adata_mouse_raw2.obsm["X_scVI"] = vae_ref.get_latent_representation(batch_size = 128)

In [1]:
sc.pp.neighbors(adata_mouse_raw2, use_rep="X_scVI", random_state = 94705)

NameError: name 'sc' is not defined

In [None]:
sc.tl.umap(adata_mouse_raw2)
sc.pl.umap(
    adata_mouse_raw2,
    color=["model", "cluster"],
    frameon=False,
    ncols=2,
)

In [None]:
## Now we obtain the latent representation, and use Scanpy to visualize with UMAP.

adata_mouse_raw2.obsm["X_scVI"] = vae_ref.get_latent_representation()
sc.pp.neighbors(adata_mouse_raw2, use_rep="X_scVI")
sc.tl.leiden(adata_mouse_raw2)
sc.tl.umap(adata_mouse_raw2)
sc.pl.umap(
    adata_mouse_raw2,
    color=["model", "cluster"],
    frameon=False,
    ncols=2,
)

In [25]:
adata_mouse_raw2

AnnData object with n_obs × n_vars = 8574 × 12017
    obs: 'batch', 'key', 'model', 'condition', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_counts', 'n_genes', 'percent_mito', 'S_score', 'G2M_score', 'phase', 'scrubletscores', 'scrubletdoublets', 'leiden', 'endothelial', 'fibroblast', 'myofibroblast', 'dendritic', 'cCDs', 'langherhans_like', 'b', 't_nk', 'myeloid', 'mast', 'luminal', 'basal', 'notluminal', 'macrophages', 'neuroendocrine', 'seminal_vesicle_basal', 'seminal_vesicle_luminal', 'seminal_vesicle_ionocyte', 'Regulon(Arid5a)', 'Regulon(Arid5b)', 'Regulon(Ascl1)', 'Regulon(Ascl2)', 'Regulon(Atf3)', 'Regulon(Bach1)', 'Regulon(Batf)', 'Regulon(Bcl3)', 'Regulon(Cebpa)', 'Regulon(Cebpb)', 'Regulon(Cebpd)', 'Regulon(Creb5)', 'Regulon(Crem)', 'Regulon(Dusp26)', 'Regulon(Egr1)', 'Regulon(Egr2)', 'Regulon(Egr3)', 'Regulon(Egr4)', 'Regulon(Eomes)', 'Regulon(Erg)', 'Regulon(Ets1)', 'Regulon(Fezf1)', 'Regulon(Fosb)', 'Regulon(Fosl2)', 'Regulon(Foxa1)', 'Re

In [245]:
vae = sca.models.SCVI(
    adata_mouse_raw2,
    n_layers=2,
    encode_covariates=True,
    deeply_inject_covariates=False,
    use_layer_norm="both",
    use_batch_norm="none",
)

ValueError: Please set up your AnnData with SCVI.setup_anndata first. It appears the AnnData object has been setup with a different model.