# Benchmark 

In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
custom_rc = {
    'font.family': 'Arial',  # Set the desired font for this plot
}

mpl.rcParams['svg.fonttype'] = 'none'
mpl.rcParams['pdf.fonttype'] = 42

  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
proj_name = "pbmc_Darmanis"
save_dir = f"../save/{proj_name}-{time.strftime('%b%d')}/"
save_dir = Path(save_dir)
save_dir.mkdir(parents=True, exist_ok=True)

data_dir = f"../data/{proj_name}/"
data_dir = Path(data_dir)
data_dir.mkdir(parents=True, exist_ok=True)
device = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(device)
seed = 0
ccd.ul.set_seed(seed)

file_suffix = f"{time.strftime('%b%d-%H%M')}"
file_suffix

cpu


'Jun11-1625'

# Download CellxGene AnnData Objects

In [4]:
##USER Defined Anndata Object Directory
##Keep this if you want to use immediately with other notebooks
ANNDATA_OBJECT_DIR='../data/pbmc_Darmanis/'


In [6]:
##Define domain names for cellxgene
domain_name = "cellxgene.cziscience.com"
site_url = f"https://{domain_name}"
api_url_base = f"https://api.{domain_name}"

##Define specific collection ID for this study
collection_id = "398e34a9-8736-4b27-a9a7-31a47a67f446"

##Fetch collection
collection_path = f"/curation/v1/collections/{collection_id}"
collection_url = f"{api_url_base}{collection_path}"
res = requests.get(url=collection_url)
res.raise_for_status()
res_content = res.json()

In [7]:
kits_downloaded = []
for dataset in res_content['datasets']:
    assets = dataset["assets"]
    dataset_id = dataset["dataset_id"]
    kit_name = dataset['title']
    kits_downloaded.append(kit_name)
    for asset in assets:
        if asset['filetype'] == 'H5AD':
            download_filename = os.path.join(ANNDATA_OBJECT_DIR, f'{kit_name}_annotated.h5ad')
            print(f"\nDownloading {kit_name} to {download_filename} ... ")
            with requests.get(asset["url"], stream=True) as res:
                res.raise_for_status()
                filesize = int(res.headers["Content-Length"])
                with open(download_filename, "wb") as df:
                    total_bytes_received = 0
                    for chunk in res.iter_content(chunk_size=1024 * 1024):
                        df.write(chunk)
                        total_bytes_received += len(chunk)
                        percent_of_total_upload = float("{:.1f}".format(total_bytes_received / filesize * 100))
                        color = "\033[38;5;10m" if percent_of_total_upload == 100 else ""
                        print(f"\033[1m{color}{percent_of_total_upload}% downloaded\033[0m\r", end="")
    print("\n\nDone downloading assets")


Downloading Honeycomb-rep2 to ../results/anndata_objects/Honeycomb-rep2_annotated.h5ad ... 
[1m[38;5;10m100.0% downloaded[0m

Done downloading assets

Downloading 10X_FRP-rep1 to ../results/anndata_objects/10X_FRP-rep1_annotated.h5ad ... 
[1m[38;5;10m100.0% downloaded[0m

Done downloading assets

Downloading 10X_3-rep1 to ../results/anndata_objects/10X_3-rep1_annotated.h5ad ... 
[1m[38;5;10m100.0% downloaded[0m

Done downloading assets

Downloading BD-rep1 to ../results/anndata_objects/BD-rep1_annotated.h5ad ... 
[1m[38;5;10m100.0% downloaded[0m

Done downloading assets

Downloading harmony_integrated_data to ../results/anndata_objects/harmony_integrated_data_annotated.h5ad ... 
[1m[38;5;10m100.0% downloaded[0m

Done downloading assets

Downloading 10X_5-rep2 to ../results/anndata_objects/10X_5-rep2_annotated.h5ad ... 
[1m[38;5;10m100.0% downloaded[0m

Done downloading assets

Downloading 10X_3-rep2 to ../results/anndata_objects/10X_3-rep2_annotated.h5ad ... 
[1m[3

In [8]:
kits_downloaded = sorted(kits_downloaded)
print(f'Downloaded Kit Data: {kits_downloaded}')


Downloaded Kit Data: ['10X_3-rep1', '10X_3-rep2', '10X_5-rep1', '10X_5-rep2', '10X_FRP-rep1', '10X_FRP-rep2', 'BD-rep1', 'BD-rep2', 'Fluent-rep1', 'Fluent-rep2', 'Fluent-rep3', 'Honeycomb-rep1', 'Honeycomb-rep2', 'Parse-rep1', 'Scale-rep1', 'Scipio-rep1', 'Scipio-rep2', 'harmony_integrated_data']


----

In [6]:
ANNDATA_OBJECT_DIR='../data/pbmc_Darmanis/'


# concatenate all anndata objects except the harmony_integrated_data_annotated.h5ad
anndata_objects = [f for f in Path(ANNDATA_OBJECT_DIR).glob('*.h5ad') if 'harmony_integrated_data_annotated.h5ad' not in str(f)]
print(f'Anndata Objects: {anndata_objects}')
# concatenate all anndata objects
adata_list = [sc.read_h5ad(f) for f in anndata_objects]
adata = adata_list[0].concatenate(adata_list[1:], batch_key='dataset', batch_categories=[f.stem for f in anndata_objects])
# save the concatenated anndata object
adata.write_h5ad(os.path.join(ANNDATA_OBJECT_DIR, 'pbmc_Darmanis.h5ad'))
print(f'Concatenated Anndata Object saved to {ANNDATA_OBJECT_DIR}/pbmc_Darmanis.h5ad')

Anndata Objects: [PosixPath('../data/pbmc_Darmanis/Honeycomb-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/BD-rep2_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/BD-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Honeycomb-rep2_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/10X_5-rep2_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Fluent-rep2_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/10X_5-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Fluent-rep3_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Fluent-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Parse-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/10X_FRP-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Scale-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/10X_FRP-rep2_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/10X_3-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/Scipio-rep1_annotated.h5ad'), PosixPath('../data/pbmc_Darmanis/10X_3-rep

****

In [None]:
# copy the raw counts to a new layer
# adata.layers['counts'] = adata.raw.X.copy()

In [10]:
batch_key = 'dataset'
state_key = 'cell_type'

In [6]:
combined_keys = [
        # "unintegrated",
        # "scanorama", "liger", "harmony",
        # "scvi", "scanvi",
        "concord",
        "concord_class", 
        "concord_decoder", "contrastive"
    ]

In [7]:
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    adata=adata,                          # Your input AnnData object
    methods=combined_keys,            # List of methods to run
    batch_key=batch_key,                    # Column in adata.obs for batch info
    count_layer="counts",                 # Layer name containing raw counts
    class_key=state_key,               # Column in adata.obs for class labels (used in SCANVI and CONCORD variants)
    latent_dim=30,                        # Latent dimensionality for PCA and embeddings
    device='cpu',                        # Or "cpu", or "mps" for Apple Silicon
    return_corrected=False,                   # Whether to store corrected expression matrices
    transform_batch=None,                 # Optionally specify a batch to transform to in scVI
    seed=42,                              # Random seed for reproducibility
    compute_umap=True,                    # Run UMAP for all output embeddings
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
    verbose=True,                        # Print progress messages
)



FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 119it [00:04, 27.04it/s, loss=3.57]
Epoch 1 Training: 100%|██████████| 119/119 [00:03<00:00, 34.47it/s, loss=3.26]
Epoch 2 Training: 100%|██████████| 119/119 [00:03<00:00, 34.93it/s, loss=3.41]
Epoch 3 Training: 100%|██████████| 119/119 [00:03<00:00, 35.86it/s, loss=3.36]
Epoch 4 Training: 100%|██████████| 119/119 [00:03<00:00, 35.01it/s, loss=3.46]
Epoch 5 Training: 100%|██████████| 119/119 [00:04<00:00, 28.89it/s, loss=3.52]
Epoch 6 Training: 100%|██████████| 119/119 [00:03<00:00, 31.00it/s, loss=3.35]
Epoch 7 Training: 100%|██████████| 119/119 [00:03<00:00, 34.88it/s, loss=3.37]
Epoch 8 Training: 100%|██████████| 119/119 [00:03<00:00, 37.86it/s, loss=3.25]
Epoch 9 Training: 100%|██████████| 119/119 [00:03<00:00, 38.94it/s, loss=3.39]




concord completed in 39.28 sec.
Running UMAP on concord...
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.




FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 119it [00:03, 37.74it/s, loss=4.24]
Epoch 1 Training: 100%|██████████| 119/119 [00:03<00:00, 37.93it/s, loss=3.71]
Epoch 2 Training: 100%|██████████| 119/119 [00:03<00:00, 37.82it/s, loss=3.69]
Epoch 3 Training: 100%|██████████| 119/119 [00:03<00:00, 37.74it/s, loss=3.68]
Epoch 4 Training: 100%|██████████| 119/119 [00:03<00:00, 37.62it/s, loss=3.72]
Epoch 5 Training: 100%|██████████| 119/119 [00:03<00:00, 38.43it/s, loss=3.73]
Epoch 6 Training: 100%|██████████| 119/119 [00:03<00:00, 37.98it/s, loss=3.48]
Epoch 7 Training: 100%|██████████| 119/119 [00:03<00:00, 38.06it/s, loss=3.41]
Epoch 8 Training: 100%|██████████| 119/119 [00:03<00:00, 36.46it/s, loss=3.45]
Epoch 9 Training: 100%|██████████| 119/119 [00:03<00:00, 37.68it/s, loss=3.38]




concord_class completed in 34.01 sec.
Running UMAP on concord_class...




FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 119it [00:05, 20.13it/s, loss=4.48]
Epoch 1 Training: 100%|██████████| 119/119 [00:05<00:00, 22.26it/s, loss=4.42]
Epoch 2 Training: 100%|██████████| 119/119 [00:05<00:00, 20.43it/s, loss=4.41]
Epoch 3 Training: 100%|██████████| 119/119 [00:05<00:00, 22.41it/s, loss=4.59]
Epoch 4 Training: 100%|██████████| 119/119 [00:06<00:00, 18.53it/s, loss=4.24]
Epoch 5 Training: 100%|██████████| 119/119 [00:05<00:00, 21.86it/s, loss=4.69]
Epoch 6 Training: 100%|██████████| 119/119 [00:05<00:00, 21.82it/s, loss=4.31]
Epoch 7 Training: 100%|██████████| 119/119 [00:05<00:00, 21.64it/s, loss=4.49]
Epoch 8 Training: 100%|██████████| 119/119 [00:05<00:00, 21.41it/s, loss=4.31]
Epoch 9 Training: 100%|██████████| 119/119 [00:05<00:00, 21.60it/s, loss=4.54]




concord_decoder completed in 59.73 sec.
Running UMAP on concord_decoder...




FAISS not found. Using sklearn for k-NN computation.
You specified p_intra_domain as 0.95 but you only have one domain. Resetting p_intra_domain to 1.0.


p_intra_knn: 0.3


Epoch 0 Training: 132it [00:03, 34.67it/s, loss=3.48]
Epoch 1 Training: 100%|██████████| 132/132 [00:03<00:00, 36.47it/s, loss=3.43]
Epoch 2 Training: 100%|██████████| 132/132 [00:03<00:00, 37.55it/s, loss=3.38]
Epoch 3 Training: 100%|██████████| 132/132 [00:03<00:00, 36.90it/s, loss=3.29]
Epoch 4 Training: 100%|██████████| 132/132 [00:04<00:00, 30.99it/s, loss=3.29]
Epoch 5 Training: 100%|██████████| 132/132 [00:04<00:00, 31.35it/s, loss=3.25]
Epoch 6 Training: 100%|██████████| 132/132 [00:04<00:00, 31.19it/s, loss=3.38]
Epoch 7 Training: 100%|██████████| 132/132 [00:04<00:00, 31.55it/s, loss=3.3] 
Epoch 8 Training: 100%|██████████| 132/132 [00:04<00:00, 30.32it/s, loss=3.31]
Epoch 9 Training: 100%|██████████| 132/132 [00:04<00:00, 28.09it/s, loss=3.26]




contrastive completed in 43.79 sec.
Running UMAP on contrastive...
✅ Selected methods completed.


----

# downsample the anndata objects

In [5]:
# load the adata object
ANNDATA_OBJECT_DIR='../data/pbmc_Darmanis/'
adata = sc.read_h5ad(os.path.join(ANNDATA_OBJECT_DIR, 'pbmc_Darmanis.h5ad'))

In [6]:
# for each dataset, get 500 cells and save to a new anndata object with all datasets
adata_list = []
for dataset in adata.obs['dataset'].unique():
    adata_subset = adata[adata.obs['dataset'] == dataset].copy()
    if adata_subset.n_obs > 500:
        adata_subset = adata_subset[np.random.choice(adata_subset.n_obs, 500, replace=False)]
    adata_list.append(adata_subset)
# concatenate all anndata objects
adata = adata_list[0].concatenate(adata_list[1:], batch_key='dataset', batch_categories=adata.obs['dataset'].unique())
# save the concatenated anndata object
adata.write_h5ad(os.path.join(ANNDATA_OBJECT_DIR, 'pbmc_Darmanis_subset_9K.h5ad'))

In [7]:
# load the downsampled anndata object
adata = sc.read_h5ad('../data/pbmc_Darmanis/pbmc_Darmanis_subset_9K.h5ad')

In [None]:
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="cell_ranger", batch_key=batch_key)
sc.tl.pca(adata, n_comps=30, use_highly_variable=True)
adata = adata[:, adata.var.highly_variable].copy()

In [17]:
adata

AnnData object with n_obs × n_vars = 8500 × 2000
    obs: 'kit', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type', 'predicted_celltype', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'dataset'
    var: 'feature_is_filtered', 'feature_reference', 'feature_biotype', 'gene_name-10X_3-rep1_annotated', 'highly_deviant-10X_3-rep1_annotated', 'feature_name-10X_3-rep1_annotated', 'feature_length-10X_3-rep1_annotated', 'feature_type-10X_3-rep1_annotated', 'gene_name-10X_3-rep2_annotated', 'highly_deviant-10X_3-rep2_annotated', 'feature_name-10X_3-rep2_annotated', 'feature_length-10X_3-rep2_annotated', 'feature_type-10X_3-rep2_annotated', 'gene_name-10X_5

In [18]:
# save the adata object
adata.write_h5ad('../data/pbmc_Darmanis/pbmc_Darmanis_subset_9K.h5ad')

----

# runtime and memory profiling

Use pbmc_Darmanis_subset_9K to test functions.

In [19]:
adata

AnnData object with n_obs × n_vars = 8500 × 2000
    obs: 'kit', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'tissue_type', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'donor_id', 'suspension_type', 'predicted_celltype', 'is_primary_data', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid', 'dataset'
    var: 'feature_is_filtered', 'feature_reference', 'feature_biotype', 'gene_name-10X_3-rep1_annotated', 'highly_deviant-10X_3-rep1_annotated', 'feature_name-10X_3-rep1_annotated', 'feature_length-10X_3-rep1_annotated', 'feature_type-10X_3-rep1_annotated', 'gene_name-10X_3-rep2_annotated', 'highly_deviant-10X_3-rep2_annotated', 'feature_name-10X_3-rep2_annotated', 'feature_length-10X_3-rep2_annotated', 'feature_type-10X_3-rep2_annotated', 'gene_name-10X_5

In [None]:
combined_keys = [
        # "unintegrated",
        "scanorama", 
        # "liger", 
        "harmony",
        # "scvi", "scanvi",
        "concord",
        # "concord_class", 
        # "concord_decoder", "contrastive"
    ]

In [35]:
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    adata=adata,                          # Your input AnnData object
    methods=combined_keys,            # List of methods to run
    batch_key=batch_key,                   # Column in adata.obs for batch info
    count_layer="counts",                 # Layer name containing raw counts
    class_key=state_key,               # Column in adata.obs for class labels (used in SCANVI and CONCORD variants)
    latent_dim=30,                        # Latent dimensionality for PCA and embeddings
    device=device,                        # Or "cpu", or "mps" for Apple Silicon
    return_corrected=False,                   # Whether to store corrected expression matrices
    transform_batch=None,                 # Optionally specify a batch to transform to in scVI
    seed=42,                              # Random seed for reproducibility
    compute_umap=True,                    # Run UMAP for all output embeddings
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
    verbose=True,                        # Print progress messages
)



FAISS not found. Using sklearn for k-NN computation.


p_intra_knn: 0.3


Epoch 0 Training: 119it [00:04, 24.33it/s, loss=4.03]
Epoch 1 Training: 100%|██████████| 119/119 [00:04<00:00, 24.27it/s, loss=3.74]
Epoch 2 Training: 100%|██████████| 119/119 [00:05<00:00, 22.99it/s, loss=3.81]
Epoch 3 Training: 100%|██████████| 119/119 [00:05<00:00, 23.41it/s, loss=3.77]
Epoch 4 Training: 100%|██████████| 119/119 [00:05<00:00, 22.88it/s, loss=3.6] 
Epoch 5 Training: 100%|██████████| 119/119 [00:04<00:00, 24.37it/s, loss=3.78]
Epoch 6 Training: 100%|██████████| 119/119 [00:04<00:00, 24.48it/s, loss=3.58]
Epoch 7 Training: 100%|██████████| 119/119 [00:05<00:00, 23.72it/s, loss=3.78]
Epoch 8 Training: 100%|██████████| 119/119 [00:04<00:00, 24.52it/s, loss=3.56]
Epoch 9 Training: 100%|██████████| 119/119 [00:04<00:00, 24.24it/s, loss=3.69]




concord completed in 50.98 sec.
Running UMAP on concord...
✅ Selected methods completed.


In [24]:
time_log

{'concord': 55.854514360427856, 'liger': None, 'harmony': 43.668493032455444}

In [25]:
ram_log

{'concord': 0, 'liger': None, 'harmony': 184.62890625}

In [26]:
vram_log

{'concord': 0, 'liger': None, 'harmony': 0}