# Benchmark

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to source files on disk take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
# Font override meant to be applied to individual plots; it is only defined
# here, not applied globally.
custom_rc = {'font.family': 'Arial'}

# Global export settings for vector figures: 'none' keeps SVG text as text
# instead of converting it to paths, and font type 42 embeds TrueType fonts
# in PDF output.
mpl.rcParams.update({'svg.fonttype': 'none', 'pdf.fonttype': 42})

  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
proj_name = "HypoMap_Steuernagel"

# Take a single timestamp so the dated output folder and the file suffix can
# never disagree when this cell happens to run across midnight.
run_time = time.localtime()

# Output directory, one per project and calendar day; created if missing.
save_dir = Path(f"../save/{proj_name}-{time.strftime('%b%d', run_time)}/")
save_dir.mkdir(parents=True, exist_ok=True)

# Shared input-data directory; created if missing.
data_dir = Path("../data/")
data_dir.mkdir(parents=True, exist_ok=True)

# Prefer GPU 3, but fall back to the first GPU when the machine exposes fewer
# than four devices, and to the CPU when CUDA is not available at all.
preferred_gpu = 3
if not torch.cuda.is_available():
    device = torch.device('cpu')
elif torch.cuda.device_count() > preferred_gpu:
    device = torch.device(f'cuda:{preferred_gpu}')
else:
    device = torch.device('cuda:0')
print(device)

# Notebook-wide random seed, handed to CONCORD's seeding helper.
seed = 0
ccd.ul.set_seed(seed)

# Month-day-hour-minute tag for naming files written by this run.
file_suffix = time.strftime('%b%d-%H%M', run_time)
file_suffix

cuda:3


'Jun13-0001'

In [4]:
# Read the HypoMap (Steuernagel) AnnData object from the data directory.
raw_h5ad_path = data_dir / "HypoMap_Steuernagel" / "HypoMap_Steuernagel.h5ad"
adata = sc.read_h5ad(raw_h5ad_path)

In [5]:
# Display the AnnData summary: dimensions plus the names of its annotation fields.
adata

AnnData object with n_obs × n_vars = 384925 × 51676
    obs: 'SRA_ID', 'Sample_ID', 'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'Dataset', 'Batch_ID', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'C7_named', 'C25_named', 'C66_named', 'C185_named', 'C286_named', 'C465_named', 'Author_Class_Curated', 'Author_CellType', 'Region_summarized', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'features', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'batch_condition', 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap'

In [None]:
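# Save the raw counts to adata.layers["counts"].
# NOTE(review): apparently a one-time step that was left disabled; the file re-loaded further down already lists a 'counts' layer.
# adata.layers["counts"] = adata.raw.X.copy()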

In [9]:
# Write the AnnData object, layers included, back to the same h5ad file it was
# loaded from. The existing file is overwritten in place.
source_h5ad_path = data_dir / "HypoMap_Steuernagel" / "HypoMap_Steuernagel.h5ad"
adata.write_h5ad(source_h5ad_path)


In [10]:
# Count observations per 'Batch_ID' value, most frequent first.
adata.obs['Batch_ID'].value_counts()

Batch_ID
Anderson10x_batch_1         60756
Kim10x_batch_1              40599
Rupp10x_batch_1             33502
Dowsett10xnuc_batch_1       24759
wen10x_batch_1              17483
wenDropseq_batch_1          16181
CampbellDropseq_batch_1     15641
Moffit10x_batch_3           14958
Morris10x_batch_1           13660
ChenDropseq_batch_1         13329
Morris10x_batch_3           11798
Mousebrainorg10x_batch_2    11747
Dowsett10xnuc_batch_2       11186
Mousebrainorg10x_batch_1     9674
RossiDropseq_batch_1         9292
Moffit10x_batch_2            8892
Affinati10x_batch_1          8477
Mickelsen10x_batch_1         8441
Affinati10x_batch_4          8404
Flynn10x_batch_1             7489
kimDev10x_batch_1            6912
Affinati10x_batch_3          6056
Flynn10x_batch_2             5864
Morris10x_batch_2            5492
CampbellDropseq_batch_2      5248
Moffit10x_batch_1            4185
LeeDropseq_batch_1           2212
RomanovDev10x_batch_1        2152
Affinati10x_batch_2           536
Name: count, dtype: int64

In [11]:
# Count observations per 'cell_type' label, most frequent first.
adata.obs['cell_type'].value_counts()

cell_type
neuron                            219360
astrocyte                          52186
oligodendrocyte                    48817
oligodendrocyte precursor cell     19865
microglial cell                    14304
endothelial cell                   10261
tanycyte                            9737
ependymal cell                      4425
mural cell                          3776
fibroblast                          1372
pituitary gland cell                 729
hypendymal cell                       52
erythrocyte                           41
Name: count, dtype: int64

In [7]:
# adata.obs column names used throughout: the batch label and the cell-state label.
batch_key, state_key = 'Batch_ID', 'cell_type'

In [27]:
# adata.obs[batch_key] = adata.obs[batch_key].astype("category")
# print(adata.obs[batch_key].dtype)


# Preprocess data

In [4]:
# Load the saved HypoMap AnnData object from the data directory.
hypomap_h5ad_path = data_dir / "HypoMap_Steuernagel" / "HypoMap_Steuernagel.h5ad"
adata = sc.read_h5ad(hypomap_h5ad_path)

In [5]:
# Display the AnnData summary to check which fields and layers the loaded file contains.
adata

AnnData object with n_obs × n_vars = 384925 × 51676
    obs: 'SRA_ID', 'Sample_ID', 'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'Dataset', 'Batch_ID', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'C7_named', 'C25_named', 'C66_named', 'C185_named', 'C286_named', 'C465_named', 'Author_Class_Curated', 'Author_CellType', 'Region_summarized', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'features', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'batch_condition', 'citation', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_umap'
    layers: 'counts'

In [8]:
# Flag the top 2,000 highly variable genes, with the selection computed per
# batch via `batch_key`.
# NOTE(review): the "seurat" flavor expects log-normalized values in adata.X - confirm.
sc.pp.highly_variable_genes(
    adata,
    flavor="seurat",
    n_top_genes=2000,
    batch_key=batch_key,
)

# 30-component PCA restricted to the genes flagged above.
sc.tl.pca(
    adata,
    use_highly_variable=True,
    n_comps=30,
)

In [9]:
# Tally how many genes were flagged as highly variable versus not.
adata.var.highly_variable.value_counts()

highly_variable
False    49676
True      2000
Name: count, dtype: int64

In [10]:
# Keep only the genes flagged as highly variable. The .copy() turns the
# sliced view into an independent AnnData object.
hvg_mask = adata.var["highly_variable"]
adata = adata[:, hvg_mask].copy()

In [11]:
# Persist the HVG-subset AnnData object under a separate "_processed" file name.
processed_h5ad_path = data_dir / "HypoMap_Steuernagel" / "HypoMap_Steuernagel_processed.h5ad"
adata.write_h5ad(processed_h5ad_path)

In [12]:
# Display the AnnData summary after subsetting to the highly variable genes.
adata

AnnData object with n_obs × n_vars = 384925 × 2000
    obs: 'SRA_ID', 'Sample_ID', 'organism_ontology_term_id', 'donor_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'suspension_type', 'Dataset', 'Batch_ID', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'C7_named', 'C25_named', 'C66_named', 'C185_named', 'C286_named', 'C465_named', 'Author_Class_Curated', 'Author_CellType', 'Region_summarized', 'is_primary_data', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'features', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns:

----

In [31]:
# Method keys handed to the benchmarking pipeline, in run order. The list is
# assembled from groups purely for readability; the result is one flat list.
combined_keys = (
    ["unintegrated"]
    + ["scanorama", "liger", "harmony"]
    + ["scvi", "scanvi"]
    + ["concord", "concord_class", "concord_decoder", "contrastive"]
)

In [None]:
# Run every method listed in `combined_keys` through CONCORD's integration
# pipeline. The call returns three objects, unpacked here as the time, RAM and
# VRAM logs (presumably per-method resource usage - confirm against the library).
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    adata=adata,                 # AnnData object to integrate
    methods=combined_keys,       # method keys to run
    batch_key=batch_key,         # adata.obs column holding the batch label
    count_layer="counts",        # layer name containing raw counts
    class_key=state_key,         # adata.obs column with class labels (used in SCANVI and CONCORD variants)
    latent_dim=30,               # latent dimensionality for PCA and embeddings
    device=device,               # torch device chosen in the setup cell
    return_corrected=False,      # do not store corrected expression matrices
    transform_batch=None,        # no specific batch to transform to in scVI
    seed=seed,                   # reuse the notebook-wide seed rather than a second hard-coded value
    compute_umap=True,           # run UMAP for all output embeddings
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
    verbose=True,                # print progress messages
)