## MultiVI training

In [1]:
import scvi
scvi.settings.seed = 420
import numpy as np
import scanpy as sc
from scipy.sparse import csr_matrix
from scipy.io import mmread
import pandas as pd
import anndata
import seaborn as sns
import torch
import random
import h5py
import os 
import matplotlib.pyplot as plt

Seed set to 420


In [2]:
# Notebook-wide plotting defaults.
# NOTE(review): both calls below presumably adjust global matplotlib settings, so
# their relative order is left untouched.
sc.set_figure_params(figsize=(6, 6), frameon=False)
sns.set_theme()
# Inline figure rendering: "w" (white) figure facecolor and the "retina" figure format.
%config InlineBackend.print_figure_kwargs={"facecolor": "w"}
%config InlineBackend.figure_format="retina"

In [3]:
# ---- Locations ----
input_path = '../'
save_path = './Results/'
processed_dir = f'{input_path}/0_data/processed_data'
train_data_dir = f'{input_path}/1_XChrom/0_model_train/train_data'

# ---- RNA ----
# Raw RNA object: provides the training data (model input).
rna4intra_raw = sc.read_h5ad(f'{processed_dir}/m_brain_paired_rna_raw.h5ad')
rna4intra_raw.var_names_make_unique()
# Processed RNA object: provides the cell types and the X_pca embedding.
rna4intra = sc.read_h5ad(f'{processed_dir}/m_brain_paired_rna.h5ad')
# NOTE(review): copies X_pca row-for-row; assumes both RNA objects list their cells
# in the same order — confirm.
rna4intra_raw.obsm['X_pca'] = rna4intra.obsm['X_pca']

# ---- ATAC ----
atac4intra_raw = sc.read_h5ad(f'{train_data_dir}/ad.h5ad')
# Training labels.
adata_paired_atac = sc.read_h5ad(f'{train_data_dir}/ad_trainval.h5ad')
# Test labels.
adata_atac = sc.read_h5ad(f'{train_data_dir}/ad_crosscell.h5ad')

# ---- Train/validation vs. test split stored alongside the training data ----
with h5py.File(f'{train_data_dir}/splits.h5', 'r') as split_file:
    trainval_cellid = split_file['trainval_cell'][:]
    test_cellid = split_file['test_cell'][:]
    trainval_peakid = split_file['trainval_peak'][:]

# ---- Apply the split ----
# Training input (RNA of the train/validation cells).
adata_paired_rna = rna4intra_raw[trainval_cellid, :]
# Test input (RNA of the held-out cells).
adata_rna = rna4intra_raw[test_cellid, :]
# Raw ATAC for all cells restricted to the train/validation peaks
# (per the original note: adata_paired_atac + adata_atac).
atac4intra = atac4intra_raw[:, trainval_peakid]

  utils.warn_names_duplicates("var")


In [4]:
# ---- Restrict both RNA objects to the genes they share ----
common_genes = adata_paired_rna.var_names.intersection(adata_rna.var_names)
adata_paired_rna = adata_paired_rna[:, common_genes]
# Materialise this subset: adata_rna.var is written to below, and writing to an
# AnnData view triggers an implicit copy plus a warning.
adata_rna = adata_rna[:, common_genes].copy()

# ---- Build the paired (RNA + ATAC) object ----
# Feature-type labels in concatenation order (genes first, then peaks). The lengths
# are taken from the two objects that are actually concatenated, so the label list
# always matches adata_paired.n_vars.
modality = (
    ['Gene Expression'] * adata_paired_rna.n_vars
    + ['Peaks'] * adata_paired_atac.n_vars
)
adata_paired = anndata.concat([adata_paired_rna, adata_paired_atac], merge="same", axis=1)
adata_paired.var_names_make_unique()
adata_paired.var['modality'] = modality
adata_rna.var['modality'] = ['Gene Expression'] * adata_rna.n_vars

# ---- Combine paired cells with the RNA-only (test) cells ----
# With the default modality_key ('modality'), a column of that name describing each
# cell's source is added to adata_mvi.obs; it is used as the batch key below.
adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired, rna_anndata=adata_rna)
# MultiVI requires the features to be ordered such that genes appear before genomic regions.
adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()
# Drop features detected in fewer than 0.1% of the cells. This runs on the combined
# object, so it applies to peaks as well as genes.
sc.pp.filter_genes(adata_mvi, min_cells=int(adata_mvi.shape[0] * 0.001))

# ---- Set up, train and save the model ----
scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality')
mvi = scvi.model.MULTIVI(
    adata_mvi,
    n_genes=(adata_mvi.var['modality'] == 'Gene Expression').sum(),
    n_regions=(adata_mvi.var['modality'] == 'Peaks').sum(),
)
mvi.view_anndata_setup()
mvi.train(max_epochs=100)  ## early_stopping_monitor="reconstruction_loss_validation"
mvi.save(save_path, overwrite=True)
# To reuse a saved model instead of retraining:
# model = scvi.model.MULTIVI.load(f'{save_path}', adata=adata_mvi)

  adata_rna.var['modality']=['Gene Expression']*adata_rna.shape[1]
  return multi_anndata.concatenate(other, join="outer", batch_key=modality_key)


  mvi.train(max_epochs=100)  ## early_stopping_monitor="reconstruction_loss_validation"
Trainer will use only 1 of 4 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=4)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]
/home/miaoyuanyuan/miniconda3/envs/scvipy311/lib/python3.11/site-packages/lightning/pytorch/trainer/conn

Training:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=100` reached.


In [5]:
def _strip_modality_suffix(cell_names):
    """Return each name with everything from its last underscore onward removed."""
    return [cell_name.rsplit('_', 1)[0] for cell_name in cell_names]


# Accessibility estimates from the trained model for all cells (denoised data).
imputed_accessibility = mvi.get_accessibility_estimates()

# ---- Cross-cell test prediction ----
# Rows past the first adata_paired.n_obs are treated as the test cells.
# NOTE(review): relies on the paired cells coming first in the model's AnnData — confirm.
pred = imputed_accessibility.iloc[adata_paired.n_obs:]
# Strip the trailing suffix so the names can be used to index adata_atac
# (presumably the per-modality suffix added when the AnnDatas were organized).
obs_name = _strip_modality_suffix(pred.index)
# Observed ATAC values for the same cells and peaks, as a dense DataFrame.
true_list = adata_atac[obs_name, imputed_accessibility.columns.tolist()]
true = pd.DataFrame(
    true_list.X.toarray(),
    columns=true_list.var_names,
    index=true_list.obs_names,
)
pred.index = obs_name
# Align predictions to adata_atac's cell and peak order; labels absent from the
# prediction frame are filled with NaN by reindex.
pred_sort = pred.reindex(index=adata_atac.obs_names, columns=adata_atac.var_names)
adata_atacp = adata_atac.copy()
adata_atacp.X = np.array(pred_sort)
adata_atacp.write_h5ad(f'{save_path}/multivi_impute.h5ad')

# ---- Imputation for all cells ----
imp_all = imputed_accessibility.copy()
obs_name = _strip_modality_suffix(imp_all.index)
imp_all.index = obs_name
# Same alignment as above, this time against the full ATAC object.
imp_all_sort = imp_all.reindex(index=atac4intra.obs_names, columns=atac4intra.var_names)
atac4intrap = atac4intra.copy()
atac4intrap.X = np.array(imp_all_sort)
atac4intrap.write_h5ad(f'{save_path}/multivi_imp_all.h5ad')