In [3]:
import scanpy as sc
import scvi
import numpy as np
import anndata as ad
from anndata import AnnData
from scipy import sparse
import pandas as pd

# paths to files and save
#dir_data = "/omics/groups/OE0533/internal/katharina/scDoRI/gastrulation/jupyter_notebooks/data/"
#dir_checkpoint = "/dkfz/cluster/gpu/checkpoints/OE0533/k552k/"

# read rna and atac anndata objects
rna_adata = scvi.data.read_h5ad("anndata_rna.h5ad")
atac_adata = scvi.data.read_h5ad("anndata_atac_peak_matrix.h5ad")

#adata.var_names_make_unique()
rna_adata.var_names_make_unique()
atac_adata.var_names_make_unique()

In [2]:
# this old object is used for filtering, because we will use those gene annotations
old = scvi.data.read_h5ad("old_anndata_rna.h5ad")

rna_adata = rna_adata[:,rna_adata.var.index.isin(old.var.index)]
annotations = pd.merge(old.var, rna_adata.var, on="gene", how = "inner")
annotations.index = old.var.index
annotations = annotations.drop(["gene", "Strand"], axis=1)
annotations = annotations.rename(columns={"Accession":"ID", "End":"end", "Start":"start", "Chromosome":"chr"})
rna_adata.var = annotations

# combine counts and accessibility matrix
matrix = sparse.hstack((rna_adata.X, atac_adata.X))

# create dataframe for rna
rna_df = rna_adata.var
rna_df["modality"] = ["Gene_expression" for i in range(len(rna_df))]

# create dataframe for atac
atac_df = atac_adata.var
atac_df = atac_df.rename(columns={"idx":"ID"})
atac_df["modality"]=["Peaks" for i in range(len(atac_df))]
atac_df = atac_df.drop(["score"], axis = 1)

# combine the two dataframes
df = rna_df.append(atac_df)

# create a combined anndata object
multiome = AnnData(
X = matrix,
obs = atac_adata.obs,
var = df,
uns = rna_adata.uns)

# create a multiome object
adata_mvi = scvi.data.organize_multiome_anndatas(multiome)

# sort the object var
adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()


# filter features which appear in < 1%
sc.pp.filter_genes(adata_mvi, min_cells=int(adata_mvi.shape[0] * 0.01))

MemoryError: Unable to allocate 2.73 GiB for an array with shape (733840916,) and data type float32

In [8]:
adata_mvi = scvi.data.read_h5ad("mvi_default/anndata_object")

In [21]:
adata_mvi.obs.columns

Index(['BlacklistRatio', 'nDiFrags', 'nFrags', 'nMonoFrags', 'nMultiFrags',
       'NucleosomeRatio', 'PassQC', 'PromoterRatio', 'ReadsInBlacklist',
       'ReadsInPromoter', 'ReadsInTSS', 'Sample', 'TSSEnrichment', 'barcode',
       'sample', 'nFeature_RNA', 'nCount_RNA', 'mitochondrial_percent_RNA',
       'ribosomal_percent_RNA', 'stage', 'pass_rnaQC', 'doublet_score',
       'doublet_call', 'celltype.mapped_mnn', 'celltype.score_mnn',
       'closest.cell', 'celltype.mapped_seurat', 'celltype.score_seurat',
       'TSSEnrichment_atac', 'ReadsInTSS_atac', 'PromoterRatio_atac',
       'NucleosomeRatio_atac', 'nFrags_atac', 'BlacklistRatio_atac',
       'ReadsInPeaks', 'FRIP', 'modality', '_scvi_batch', '_scvi_labels'],
      dtype='object')

In [None]:
rna_adata = scvi.data.read_h5ad("anndata_rna.h5ad")

In [16]:
help(scvi.model.MULTIVI.setup_anndata)

Help on method setup_anndata in module scvi.model._multivi:

setup_anndata(adata: anndata._core.anndata.AnnData, layer: Union[str, NoneType] = None, batch_key: Union[str, NoneType] = None, labels_key: Union[str, NoneType] = None, size_factor_key: Union[str, NoneType] = None, categorical_covariate_keys: Union[List[str], NoneType] = None, continuous_covariate_keys: Union[List[str], NoneType] = None, **kwargs) method of scvi.model.base._base_model.BaseModelMetaClass instance
    Sets up the :class:`~anndata.AnnData` object for this model.
        A mapping will be created between data fields used by this model to their respective locations in adata.
    
        None of the data in adata are modified. Only adds fields to adata.
    
    Parameters
    ----------
        layer
            if not `None`, uses this as the key in `adata.layers` for raw count data.
        batch_key
            key in `adata.obs` for batch information. Categories will automatically be converted into integer
  

In [29]:
# setup anndata for the model
scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key='modality', categorical_covariate_keys=["Sample"])

  "Training will be faster when sparse matrix is formatted as CSR. It is safe to cast before model initialization."


In [30]:
mvi = scvi.model.MULTIVI(
    adata_mvi,
    n_genes=(adata_mvi.var['modality']=='Gene_expression').sum(),
    n_regions=(adata_mvi.var['modality']=='Peaks').sum(),
    dropout_rate=0.2,
    n_layers_encoder=2,
    n_layers_decoder=2,
    n_latent=20,
    latent_distribution="normal",
)


In [None]:
mvi.train()


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Epoch 2/500:   0%|          | 1/500 [1:38:50<821:57:51, 5930.00s/it, loss=2.5e+04, v_num=1]

In [None]:
mvi = scvi.model.MULTIVI(
    adata_mvi,
    n_genes=(adata_mvi.var['modality']=='Gene_expression').sum(),
    n_regions=(adata_mvi.var['modality']=='Peaks').sum(),
    dropout_rate=0.2,
    n_layers_encoder=2,
    n_layers_decoder=2,
    n_latent=20,
    latent_distribution="normal",
)


# train the model
mvi.train()




# save model
mvi.save(dir_checkpoint + "multivi_model/")

# extract latent embedding
latent = mvi.get_latent_representation()

# add latent embedding to anndata object
adata_mvi.obsm["X_multivi"] = latent


adata_mvi.write_h5ad(dir_checkpoint + "anndata_object/")
