# Benchmark

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
# Per-plot rc overrides (font family only); defined here, not applied in this cell.
custom_rc = {'font.family': 'Arial'}

# Global export settings: keep SVG text as text ('none') and use font type 42 for PDF output.
mpl.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
# Project-level configuration: output/input directories, compute device, seed, file suffix.
proj_name = "immune_DominguezConde"

# Outputs go into a folder stamped with the current month/day (e.g. "Jun13").
save_dir = Path(f"../save/{proj_name}-{time.strftime('%b%d')}/")
save_dir.mkdir(parents=True, exist_ok=True)

data_dir = Path("../data/")
data_dir.mkdir(parents=True, exist_ok=True)

# Prefer GPU index 3, but only if that index actually exists on this machine;
# otherwise fall back to the first GPU, or to the CPU when CUDA is unavailable.
# A hard-coded 'cuda:3' would name a non-existent device on hosts with fewer than 4 GPUs.
preferred_gpu = 3
if torch.cuda.is_available():
    gpu_index = preferred_gpu if torch.cuda.device_count() > preferred_gpu else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
print(device)
seed = 0
ccd.ul.set_seed(seed)

# Timestamp suffix (month/day-hour/minute) for distinguishing files written by this run.
file_suffix = time.strftime('%b%d-%H%M')
file_suffix

cuda:3


'Jun13-0038'

In [4]:
# Load the project's AnnData object from the h5ad file under `data_dir`.
adata = sc.read_h5ad(data_dir / "immune_DominguezConde/immune_DominguezConde.h5ad")

In [13]:
# Display the AnnData summary (dimensions and the obs/var/uns/obsm/layers keys).
adata

AnnData object with n_obs × n_vars = 329762 × 36398
    obs: 'donor_id', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'sex_ontology_term_id_colors', 'title'
    obsm: 'X_umap'
    layers: 'counts'

In [None]:
# save the raw counts to adata.layers
# adata.layers["counts"] = adata.raw.X.copy()

In [8]:
# Save the AnnData object back to the same path it was loaded from, overwriting that file.
# NOTE(review): the assignment of `adata.layers["counts"]` in the cell above is commented
# out, so this write appears to only re-serialize the loaded object — confirm it is still
# needed before re-running, because it replaces the source file in place.
adata.write_h5ad(data_dir / "immune_DominguezConde/immune_DominguezConde.h5ad")


In [10]:
# Number of observations per donor (`donor_id` is the column later used as the batch key).
adata.obs['donor_id'].value_counts()

donor_id
D496    88057
D503    79004
640C    35527
637C    25843
A36     24105
A29     17327
A31     12446
582C    11590
A35     11105
621B    10632
A37      9806
A52      4320
Name: count, dtype: int64

In [9]:
# Number of observations per annotated cell type (`cell_type` is later used as the state key).
adata.obs['cell_type'].value_counts()

cell_type
naive thymus-derived CD4-positive, alpha-beta T cell                          37613
memory B cell                                                                 30124
CD8-positive, alpha-beta memory T cell                                        25519
classical monocyte                                                            21847
CD16-positive, CD56-dim natural killer cell, human                            20591
effector memory CD4-positive, alpha-beta T cell                               19869
alveolar macrophage                                                           17238
CD4-positive helper T cell                                                    16099
T follicular helper cell                                                      15293
effector memory CD8-positive, alpha-beta T cell, terminally differentiated    14612
naive B cell                                                                  13998
CD8-positive, alpha-beta memory T cell, CD45RO-positive           

In [3]:
# adata.obs columns used downstream: the batch label and the cell-state label.
batch_key, state_key = 'donor_id', 'cell_type'

In [None]:
# adata.obs[batch_key] = adata.obs[batch_key].astype("category")
# print(adata.obs[batch_key].dtype)


# Preprocess data

In [2]:
import scanpy as sc
from pathlib import Path
# Root directory for input data, relative to the notebook's location.
data_dir = Path("../data/")

In [4]:
# Load the full AnnData object as the input to preprocessing.
adata = sc.read_h5ad(data_dir / "immune_DominguezConde/immune_DominguezConde.h5ad")

In [5]:
# Display the AnnData summary before any preprocessing is applied.
adata

AnnData object with n_obs × n_vars = 329762 × 36398
    obs: 'donor_id', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'sex_ontology_term_id_colors', 'title'
    obsm: 'X_umap'
    layers: 'counts'

In [6]:
# Flag the top 2000 highly variable genes, with selection computed per batch (`batch_key`).
# NOTE(review): scanpy documents flavor="seurat" as expecting log-normalized values in
# adata.X — confirm that X in this file is log-normalized rather than raw counts.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="seurat", batch_key=batch_key)
# 30-component PCA restricted to the genes flagged above. `mask_var="highly_variable"`
# replaces the deprecated `use_highly_variable=True`, which emitted a warning on every run.
sc.tl.pca(adata, n_comps=30, mask_var="highly_variable")

  mask_var_param, mask_var = _handle_mask_var(adata, mask_var, use_highly_variable)


In [7]:
# Sanity check: how many genes were flagged as highly variable vs. not.
adata.var.highly_variable.value_counts()

highly_variable
False    34398
True      2000
Name: count, dtype: int64

In [8]:
# Restrict the variables to those flagged `highly_variable` and rebind `adata` to a copy of that subset.
adata = adata[:, adata.var["highly_variable"]].copy()

In [9]:
# Save the HVG-subset AnnData to a separate "*_processed.h5ad" file, leaving the
# original input file untouched.
adata.write_h5ad(data_dir / "immune_DominguezConde/immune_DominguezConde_processed.h5ad")

In [10]:
# Display the AnnData summary after subsetting to the highly variable genes.
adata

AnnData object with n_obs × n_vars = 329762 × 2000
    obs: 'donor_id', 'Predicted_labels_CellTypist', 'Majority_voting_CellTypist', 'Majority_voting_CellTypist_high', 'Manually_curated_celltype', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'sex_ontology_term_id', 'tissue_ontology_term_id', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'gene_symbols', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_versio

----

In [None]:
# Names of the methods handed to the integration pipeline (passed as `methods=` below).
combined_keys = [
    "unintegrated",
    "scanorama",
    "liger",
    "harmony",
    "scvi",
    "scanvi",
    "concord",
    "concord_class",
    "concord_decoder",
    "contrastive",
]

In [None]:
# Run every method listed in `combined_keys` through one pipeline call. The three returned
# objects are unpacked as time_log, ram_log and vram_log (presumably runtime, RAM and VRAM
# usage per method — confirm against the concord utility's documentation).
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    # --- data and annotations ---
    adata=adata,                  # input AnnData object
    count_layer="counts",         # layer holding the raw counts
    batch_key=batch_key,          # adata.obs column with batch labels
    class_key=state_key,          # adata.obs column with class labels (SCANVI / CONCORD variants)
    # --- what to run, and where ---
    methods=combined_keys,        # list of methods to run
    device=device,                # torch device chosen in the config cell ("cpu"/"mps" also possible)
    # NOTE(review): the notebook-level `seed` set in the config cell is 0, while this call
    # passes 42 — confirm the mismatch is intentional.
    seed=42,                      # random seed for reproducibility
    # --- embedding / output options ---
    latent_dim=30,                # latent dimensionality for PCA and embeddings
    return_corrected=False,       # do not store corrected expression matrices
    transform_batch=None,         # no specific batch to transform to in scVI
    # --- UMAP on each output embedding ---
    compute_umap=True,
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
    verbose=True,                 # print progress messages
)