# Benchmark

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
custom_rc = {
    'font.family': 'Arial',  # Set the desired font for this plot
}

mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42

  from pkg_resources import get_distribution, DistributionNotFound


In [5]:
proj_name = "cross-tissue_Eraslan"
save_dir = f"../save/{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)

data_dir = f"../data/"
data_dir = Path(data_dir)
data_dir.mkdir(parents=True, exist_ok=True)

device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 0
ccd.ul.set_seed(seed)

file_suffix = f"{time.strftime('%b%d-%H%M')}"
file_suffix

cuda:3


'Jun12-1520'

In [5]:
# load the adata
adata = sc.read_h5ad(data_dir / "cross-tissue_Eraslan/cross-tissue_Eraslan.h5ad")

In [6]:
adata

AnnData object with n_obs × n_vars = 209126 × 32839
    obs: 'Sample ID_prep', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'n_genes', 'fpr', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sample ID', 'donor_id', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'scrublet', 'scrublet_score', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'channel', 'developm

In [None]:
# save the raw counts to adata.layers
# adata.layers["counts"] = adata.raw.X.copy()

In [28]:
# save (overwrite) the adata object with layers
adata.write_h5ad(data_dir / "cross-tissue_Eraslan/cross-tissue_Eraslan.h5ad")


In [19]:
adata.obs['batch'].value_counts().to_dict()

{32: 6466,
 49: 5196,
 62: 4359,
 54: 4245,
 71: 4226,
 58: 4196,
 30: 3961,
 46: 3948,
 59: 3942,
 16: 3892,
 70: 3891,
 57: 3856,
 84: 3851,
 47: 3820,
 40: 3819,
 42: 3786,
 28: 3759,
 44: 3738,
 43: 3699,
 12: 3684,
 76: 3573,
 15: 3545,
 51: 3542,
 34: 3497,
 69: 3491,
 39: 3445,
 38: 3444,
 67: 3434,
 48: 3397,
 27: 3396,
 75: 3296,
 82: 3271,
 14: 3269,
 55: 3003,
 24: 2964,
 83: 2794,
 87: 2711,
 2: 2705,
 81: 2589,
 33: 2577,
 79: 2577,
 36: 2511,
 23: 2497,
 20: 2374,
 73: 2291,
 37: 2273,
 50: 2269,
 56: 2205,
 68: 2111,
 72: 2059,
 31: 2015,
 77: 1993,
 80: 1988,
 19: 1862,
 0: 1777,
 52: 1725,
 74: 1714,
 18: 1674,
 53: 1650,
 86: 1628,
 61: 1607,
 26: 1561,
 41: 1530,
 3: 1507,
 35: 1435,
 25: 1363,
 22: 1211,
 29: 1179,
 13: 1010,
 17: 964,
 78: 940,
 4: 892,
 10: 888,
 7: 713,
 85: 622,
 6: 574,
 45: 561,
 66: 545,
 65: 493,
 60: 439,
 11: 292,
 63: 242,
 64: 222,
 8: 196,
 92: 161,
 5: 145,
 9: 80,
 21: 78,
 91: 74,
 93: 74,
 90: 22,
 94: 17,
 88: 16,
 89: 2,
 1: 1}

In [17]:
adata.obs['cell_type'].value_counts()

cell_type
fibroblast                                        23901
pulmonary alveolar epithelial cell                22305
endothelial cell of vascular tree                 20883
skeletal muscle fiber                             19491
luminal cell of prostate epithelium               11104
cardiac muscle cell                                9619
enteric smooth muscle cell                         9159
basal epithelial cell of tracheobronchial tree     8145
macrophage                                         8004
basal epithelial cell of prostatic duct            7295
skeletal muscle fibroblast                         5913
luminal epithelial cell of mammary gland           5546
alveolar macrophage                                5229
contractile cell                                   5078
smooth muscle cell                                 4594
epithelial cell of prostate                        4587
endothelial cell of lymphatic vessel               4504
T cell                                

In [9]:
batch_key = 'batch'
state_key = 'cell_type'

In [27]:
# adata.obs[batch_key] = adata.obs[batch_key].astype("category")
# print(adata.obs[batch_key].dtype)


# preprocess data

In [6]:
adata = sc.read_h5ad(data_dir / "cross-tissue_Eraslan/cross-tissue_Eraslan.h5ad")

In [10]:
adata

AnnData object with n_obs × n_vars = 209126 × 32839
    obs: 'Sample ID_prep', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'n_genes', 'fpr', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sample ID', 'donor_id', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'scrublet', 'scrublet_score', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'channel', 'developm

In [11]:
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="seurat", batch_key=batch_key)
sc.tl.pca(adata, n_comps=30, use_highly_variable=True)

In [12]:
adata = adata[:, adata.var.highly_variable].copy()

In [None]:
# save the adata
# adata.write_h5ad(data_dir / "cross-tissue_Eraslan/cross-tissue_Eraslan_processed.h5ad")

In [15]:
adata

AnnData object with n_obs × n_vars = 209126 × 2000
    obs: 'Sample ID_prep', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'n_genes', 'fpr', 'prep', 'individual', 'nGenes', 'nUMIs', 'PercentMito', 'PercentRibo', 'Age_bin', 'Sample ID', 'donor_id', 'Sample ID short', 'RIN score from PAXgene tissue Aliquot', 'RIN score from Frozen tissue Aliquot', 'Autolysis Score', 'Sample Ischemic Time (mins)', 'scrublet', 'scrublet_score', 'batch', 'n_counts', 'tissue-individual-prep', 'Broad cell type', 'Granular cell type', 'introns', 'junctions', 'exons', 'sense', 'antisense', 'intergenic', 'exon_ratio', 'intron_ratio', 'junction_ratio', 'log10_nUMIs', 'leiden', 'leiden_tissue', 'Tissue composition', 'Cell types level 2', 'Cell types level 3', 'Broad cell type numbers', 'Broad cell type (numbers)', 'channel', 'developme

----

In [31]:
combined_keys = [
        "unintegrated",
        "scanorama", "liger", "harmony",
        "scvi", "scanvi",
        "concord",
        "concord_class", 
        "concord_decoder", "contrastive"
    ]

In [34]:
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    adata=adata,                          # Your input AnnData object
    methods=combined_keys,            # List of methods to run
    batch_key=batch_key,                    # Column in adata.obs for batch info
    count_layer="counts",                 # Layer name containing raw counts
    class_key=state_key,               # Column in adata.obs for class labels (used in SCANVI and CONCORD variants)
    latent_dim=30,                        # Latent dimensionality for PCA and embeddings
    device=device,                        # Or "cpu", or "mps" for Apple Silicon
    return_corrected=False,                   # Whether to store corrected expression matrices
    transform_batch=None,                 # Optionally specify a batch to transform to in scVI
    seed=42,                              # Random seed for reproducibility
    compute_umap=True,                    # Run UMAP for all output embeddings
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
    verbose=True,                        # Print progress messages
)



FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 3229it [01:11, 44.96it/s, loss=3.69] 
Epoch 1 Training: 100%|██████████| 3229/3229 [01:10<00:00, 46.13it/s, loss=3.54] 
Epoch 2 Training: 100%|██████████| 3229/3229 [01:10<00:00, 46.05it/s, loss=3.48] 
Epoch 3 Training: 100%|██████████| 3229/3229 [01:10<00:00, 45.91it/s, loss=3.68] 
Epoch 4 Training: 100%|██████████| 3229/3229 [01:10<00:00, 45.68it/s, loss=3.69] 
Epoch 5 Training: 100%|██████████| 3229/3229 [01:10<00:00, 45.72it/s, loss=3.41] 
Epoch 6 Training: 100%|██████████| 3229/3229 [01:10<00:00, 45.69it/s, loss=3.55] 
Epoch 7 Training: 100%|██████████| 3229/3229 [01:11<00:00, 45.43it/s, loss=3.58] 
Epoch 8 Training: 100%|██████████| 3229/3229 [01:11<00:00, 45.15it/s, loss=3.58] 
Epoch 9 Training: 100%|██████████| 3229/3229 [01:11<00:00, 45.01it/s, loss=3.42] 




concord completed in 724.71 sec.
Running UMAP on concord...




FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 3229it [01:15, 42.78it/s, loss=4.37]
Epoch 1 Training: 100%|██████████| 3229/3229 [01:10<00:00, 45.67it/s, loss=4.07] 
Epoch 2 Training: 100%|██████████| 3229/3229 [01:09<00:00, 46.44it/s, loss=3.82] 
Epoch 3 Training: 100%|██████████| 3229/3229 [01:09<00:00, 46.52it/s, loss=3.89] 
Epoch 4 Training: 100%|██████████| 3229/3229 [01:09<00:00, 46.60it/s, loss=4.22] 
Epoch 5 Training: 100%|██████████| 3229/3229 [01:09<00:00, 46.52it/s, loss=3.86] 
Epoch 6 Training: 100%|██████████| 3229/3229 [01:14<00:00, 43.17it/s, loss=3.56] 
Epoch 7 Training: 100%|██████████| 3229/3229 [01:18<00:00, 41.30it/s, loss=3.59]
Epoch 8 Training: 100%|██████████| 3229/3229 [01:18<00:00, 41.30it/s, loss=4.01]
Epoch 9 Training: 100%|██████████| 3229/3229 [01:18<00:00, 41.36it/s, loss=3.8] 




concord_class completed in 753.28 sec.
Running UMAP on concord_class...




FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 3229it [01:13, 43.95it/s, loss=3.76] 
Epoch 1 Training: 100%|██████████| 3229/3229 [01:07<00:00, 47.79it/s, loss=3.87] 
Epoch 2 Training: 100%|██████████| 3229/3229 [01:08<00:00, 47.48it/s, loss=3.83] 
Epoch 3 Training: 100%|██████████| 3229/3229 [01:07<00:00, 47.55it/s, loss=3.9]  
Epoch 4 Training: 100%|██████████| 3229/3229 [01:07<00:00, 47.52it/s, loss=3.91] 
Epoch 5 Training: 100%|██████████| 3229/3229 [01:07<00:00, 47.52it/s, loss=4.07] 
Epoch 6 Training: 100%|██████████| 3229/3229 [01:08<00:00, 47.34it/s, loss=3.87] 
Epoch 7 Training: 100%|██████████| 3229/3229 [01:09<00:00, 46.53it/s, loss=3.69] 
Epoch 8 Training: 100%|██████████| 3229/3229 [01:09<00:00, 46.74it/s, loss=3.87] 
Epoch 9 Training: 100%|██████████| 3229/3229 [01:10<00:00, 45.81it/s, loss=4.06] 




concord_decoder completed in 703.88 sec.
Running UMAP on concord_decoder...




FAISS not found. Using sklearn for k-NN computation.
You specified p_intra_domain as 0.95 but you only have one domain. Resetting p_intra_domain to 1.0.


p_intra_knn: 0.3


Epoch 0 Training: 3267it [01:06, 49.04it/s, loss=3.43] 
Epoch 1 Training: 100%|██████████| 3267/3267 [01:06<00:00, 48.93it/s, loss=3.41] 
Epoch 2 Training: 100%|██████████| 3267/3267 [01:06<00:00, 48.96it/s, loss=3.32] 
Epoch 3 Training: 100%|██████████| 3267/3267 [01:05<00:00, 49.71it/s, loss=3.32] 
Epoch 4 Training: 100%|██████████| 3267/3267 [01:05<00:00, 49.77it/s, loss=3.42] 
Epoch 5 Training: 100%|██████████| 3267/3267 [01:05<00:00, 49.60it/s, loss=3.35] 
Epoch 6 Training: 100%|██████████| 3267/3267 [01:06<00:00, 49.37it/s, loss=3.34] 
Epoch 7 Training: 100%|██████████| 3267/3267 [01:06<00:00, 49.27it/s, loss=3.25] 
Epoch 8 Training: 100%|██████████| 3267/3267 [01:06<00:00, 49.33it/s, loss=3.33] 
Epoch 9 Training: 100%|██████████| 3267/3267 [01:06<00:00, 49.07it/s, loss=3.32] 




contrastive completed in 675.58 sec.
Running UMAP on contrastive...
Running UMAP on unintegrated...


: 