# Benchmark

In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
# Font override kept as a separate dict; it is only defined here, not applied globally.
custom_rc = {'font.family': 'Arial'}

# Export settings for vector figures: leave SVG text as text ('none') and
# use font type 42 for PDF output.
mpl.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

In [3]:
# --- Run configuration ---
proj_name = "dkd_Wilson"

# Output folder, stamped with the current month and day (e.g. ../save/dkd_Wilson-Jun10/).
save_dir = Path(f"../save/{proj_name}-{time.strftime('%b%d')}/")
save_dir.mkdir(parents=True, exist_ok=True)

# Folder holding the shared input datasets.
# NOTE(review): mkdir creates this folder when it is missing, which can mask an
# unmounted volume until the read step fails - confirm this is intended.
data_dir = Path("/Volumes/T7_ZJ/Datasets/Shared_Public")
data_dir.mkdir(parents=True, exist_ok=True)

# Select CUDA device index 3 when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:3')
else:
    device = torch.device('cpu')
print(device)

# Seed value handed to CONCORD's seeding helper.
seed = 0
ccd.ul.set_seed(seed)

# Month/day-hour/minute stamp for files produced by this run; echoed as the cell output.
file_suffix = time.strftime('%b%d-%H%M')
file_suffix

cpu


'Jun10-2032'

In [4]:
# Read the dkd_Wilson AnnData file from the shared data folder.
h5ad_path = data_dir / "dkd_Wilson.h5ad"
adata = sc.read_h5ad(h5ad_path)

In [None]:
# Disabled step: copy the raw counts (adata.raw.X) into a new layer "counts".
# NOTE(review): the integration pipeline further down is called with
# count_layer="counts", so that layer presumably already exists in the loaded
# file - confirm before running on a fresh copy of the data.
# adata.layers["counts"] = adata.raw.X.copy()

In [5]:
# adata.obs column names used downstream: batch label and cell-state label.
batch_key, state_key = 'donor_id', 'cell_type'

## Preprocess data

In [6]:
# Flag the top 2000 highly variable genes (seurat_v3 flavor), using `batch_key`
# as the batch column.
sc.pp.highly_variable_genes(
    adata,
    batch_key=batch_key,
    flavor="seurat_v3",
    n_top_genes=2000,
)

# 30-component PCA restricted to the genes flagged above.
# NOTE(review): `use_highly_variable` is deprecated in recent scanpy releases in
# favor of `mask_var` - confirm the installed version before switching.
sc.tl.pca(
    adata,
    use_highly_variable=True,
    n_comps=30,
)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


----

In [7]:
# Keep only the genes flagged as highly variable; copy to get a standalone object.
hvg_mask = adata.var["highly_variable"]
adata = adata[:, hvg_mask].copy()

In [8]:
# subset the adata to 2000 cells
n_cells = 2000
if adata.shape[0] > n_cells:
    adata = adata[np.random.choice(adata.shape[0], n_cells, replace=False), :].copy()

In [9]:
# Display the AnnData summary after the gene and cell subsetting steps.
adata

AnnData object with n_obs × n_vars = 2000 × 2000
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'doublet_id', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highl

In [14]:
# Integration methods to run in this pass.
# Currently disabled: "unintegrated", "scanorama", "liger", "harmony",
# "concord", "concord_class", "concord_decoder", "contrastive".
combined_keys = ["scvi", "scanvi"]

In [15]:
# UMAP options passed through to the pipeline for the embeddings it produces.
umap_settings = dict(
    compute_umap=True,
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
)

# Run the selected integration methods on `adata`. The call returns three
# objects, bound here as the time, RAM and VRAM logs.
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    # --- data and annotations ---
    adata=adata,
    methods=combined_keys,        # methods to run
    batch_key=batch_key,          # adata.obs column with batch info
    class_key=state_key,          # adata.obs column with class labels
    count_layer="counts",         # layer name containing raw counts
    # --- model / runtime ---
    latent_dim=30,                # latent dimensionality for the embeddings
    device='cpu',                 # NOTE(review): hard-coded; the `device` variable set earlier is not used here
    seed=42,                      # NOTE(review): differs from the notebook-level `seed` - confirm intended
    return_corrected=False,       # do not store corrected expression matrices
    transform_batch=None,         # no specific batch to transform to
    verbose=True,                 # print progress messages
    # --- UMAP ---
    **umap_settings,
)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 400/400: 100%|██████████| 400/400 [01:09<00:00,  5.85it/s, v_num=1, train_loss_step=684, train_loss_epoch=536]   

`Trainer.fit` stopped: `max_epochs=400` reached.


Epoch 400/400: 100%|██████████| 400/400 [01:09<00:00,  5.75it/s, v_num=1, train_loss_step=684, train_loss_epoch=536]

scvi completed in 69.61 sec.
Running UMAP on scvi...



[34mINFO    [0m Training for [1;36m20[0m epochs.                                                                                   


GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Epoch 20/20: 100%|██████████| 20/20 [00:07<00:00,  2.59it/s, v_num=1, train_loss_step=540, train_loss_epoch=529]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 20/20: 100%|██████████| 20/20 [00:07<00:00,  2.58it/s, v_num=1, train_loss_step=540, train_loss_epoch=529]

scanvi completed in 7.80 sec.
Running UMAP on scanvi...





✅ Selected methods completed.


In [16]:
# Display the AnnData summary again after the integration pipeline cell.
adata

AnnData object with n_obs × n_vars = 2000 × 2000
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'doublet_id', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'concord_class_class_true', 'concord_class_class_pred', 'class_prob_fibroblast', 'class_prob_endothelial cell', 'class_prob_mesangial cell', 'class_prob_podocyte', 'class_prob_leukocyte', 'class_prob_renal beta

In [11]:
# Save the processed AnnData into this run's output folder under a timestamped name.
# By this point `adata` has been reduced to the highly variable genes and a cell
# subsample, so it must not be written over the input file in `data_dir`: doing
# so would replace the full dataset and change what the load cell reads on the
# next Restart & Run All.
adata.write_h5ad(save_dir / f"{proj_name}_{file_suffix}.h5ad")

In [13]:
import scvi.model