# Benchmark

In [9]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [10]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
# Font override collected in a dict; this cell only defines it and does not
# apply it to any figure.
custom_rc = {'font.family': 'Arial'}

# Font handling for SVG and PDF export, set in a single update.
# NOTE(review): presumably chosen so text stays editable in vector output --
# confirm against the matplotlib rcParams documentation.
mpl.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

In [11]:
# Project identifier; it names the dated output directory below.
proj_name = "pancreatic_islet_Hrovatin"

# Dated output directory, e.g. ../save/pancreatic_islet_Hrovatin-Jun13/
# (created together with any missing parents).
save_dir = Path(f"../save/{proj_name}-{time.strftime('%b%d')}/")
save_dir.mkdir(parents=True, exist_ok=True)

# Shared input-data directory (created if it does not exist yet).
data_dir = Path("../data/")
data_dir.mkdir(parents=True, exist_ok=True)

# Device selection: GPU 3 is the preferred card. Hosts that expose fewer than
# four GPUs fall back to GPU 0 instead of producing an invalid 'cuda:3' device,
# and hosts without CUDA use the CPU.
if torch.cuda.is_available():
    gpu_index = 3 if torch.cuda.device_count() > 3 else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
print(device)

# Notebook-level random seed, handed to concord's seeding helper.
seed = 0
ccd.ul.set_seed(seed)

# Timestamp suffix (month-day-hourminute) for files written by this run;
# left as the last expression so the cell displays it.
file_suffix = time.strftime('%b%d-%H%M')
file_suffix

cpu


'Jun13-0318'

In [12]:
# Load the pancreatic-islet (Hrovatin) AnnData object from the project's
# sub-folder of the data directory.
adata = sc.read_h5ad(data_dir / "pancreatic_islet_Hrovatin/pancreatic_islet_Hrovatin.h5ad")

In [13]:
# Display the AnnData summary (dimensions and annotation columns) right after loading.
adata

AnnData object with n_obs × n_vars = 301796 × 31202
    obs: 'batch_integration', 'dataset', 'design', 'cell_cycle_phase', 'ins_high', 'gcg_high', 'sst_high', 'ppy_high', 'cell_filtering', 'strain', 'age', 'cell_type_originalDataset_unified', 'cell_type_originalDataset', 'cell_type_reannotatedIntegrated', 'n_genes', 'mt_frac', 'doublet_score', 'log10_n_counts', 'age_approxDays', 'dataset__design__sample', 'cell_subtype_immune_reannotatedIntegrated', 'cell_subtype_endothelial_reannotatedIntegrated', 'emptyDrops_LogProb_scaled', 'cell_subtype_beta_coarse_reannotatedIntegrated', 'cell_subtype_beta_fine_reannotatedIntegrated', 'GP_1', 'GP_2', 'GP_3', 'GP_4', 'GP_5', 'GP_6', 'GP_7', 'GP_8', 'GP_9', 'GP_10', 'GP_11', 'GP_12', 'GP_13', 'GP_14', 'GP_15', 'GP_16', 'GP_17', 'GP_18', 'GP_19', 'GP_20', 'GP_21', 'GP_22', 'GP_23', 'GP_24', 'GP_25', 'GP_26', 'GP_27', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'diabetes_model', 'chemical_stress', 'GEO

In [14]:
# Copy the matrix held in adata.raw into a "counts" layer so that it can be
# referenced by layer name later in the notebook.
# NOTE(review): assumes adata.raw is populated, holds unnormalized counts and
# covers the same genes as adata.X -- confirm for this dataset.
adata.layers["counts"] = adata.raw.X.copy()

In [15]:
# Write the object, now including the "counts" layer, back to disk.
# NOTE(review): this is the same path the data was loaded from, so the original
# input file is replaced in place -- keep a backup if the pristine download matters.
adata.write_h5ad(data_dir / "pancreatic_islet_Hrovatin/pancreatic_islet_Hrovatin.h5ad")


In [16]:
# Number of observations in each integration batch, largest first.
adata.obs['batch_integration'].value_counts()

batch_integration
VSG_MUC13632                  19923
embryo_E15_5                  11303
embryo_E12_5                  11090
embryo_E14_5                  10089
STZ_G6                         9863
VSG_MUC13639                   9442
Fltp_P16_145_mGFP              9221
STZ_G2                         9184
VSG_MUC13641                   8788
VSG_MUC13640                   8787
VSG_MUC13642                   8439
STZ_G1                         8388
NOD_elimination_SRR7610298     8372
NOD_elimination_SRR7610301     7564
Fltp_2y_MUC13974               7534
STZ_G5                         7280
Fltp_P16_146_mRFP              7172
STZ_G4                         6292
NOD_elimination_SRR7610299     6271
spikein_drug_SRR10751502       6164
Fltp_2y_MUC13976               6081
NOD_elimination_SRR7610302     5988
NOD_elimination_SRR7610303     5972
NOD_elimination_SRR7610300     5765
VSG_MUC13634                   5633
spikein_drug_SRR10751506       5562
VSG_MUC13633                   5522
Fltp_adult

In [17]:
# Number of observations per annotated cell type, largest first.
adata.obs['cell_type'].value_counts()

cell_type
type B pancreatic cell       102143
pancreatic A cell             40935
unknown                       36925
hematopoietic cell            31703
pancreatic D cell             24775
pancreatic stellate cell      23302
endothelial cell              13469
pancreatic endocrine cell     10853
pancreatic ductal cell         8742
pancreatic PP cell             6999
cell                            853
Schwann cell                    617
pancreatic acinar cell          480
Name: count, dtype: int64

In [18]:
# Names of the adata.obs columns used downstream: batch labels and cell-state labels.
batch_key, state_key = 'batch_integration', 'cell_type'

In [None]:
# adata.obs[batch_key] = adata.obs[batch_key].astype("category")
# print(adata.obs[batch_key].dtype)


# Preprocess data

In [19]:
# Display the AnnData summary again before feature selection.
adata

AnnData object with n_obs × n_vars = 301796 × 31202
    obs: 'batch_integration', 'dataset', 'design', 'cell_cycle_phase', 'ins_high', 'gcg_high', 'sst_high', 'ppy_high', 'cell_filtering', 'strain', 'age', 'cell_type_originalDataset_unified', 'cell_type_originalDataset', 'cell_type_reannotatedIntegrated', 'n_genes', 'mt_frac', 'doublet_score', 'log10_n_counts', 'age_approxDays', 'dataset__design__sample', 'cell_subtype_immune_reannotatedIntegrated', 'cell_subtype_endothelial_reannotatedIntegrated', 'emptyDrops_LogProb_scaled', 'cell_subtype_beta_coarse_reannotatedIntegrated', 'cell_subtype_beta_fine_reannotatedIntegrated', 'GP_1', 'GP_2', 'GP_3', 'GP_4', 'GP_5', 'GP_6', 'GP_7', 'GP_8', 'GP_9', 'GP_10', 'GP_11', 'GP_12', 'GP_13', 'GP_14', 'GP_15', 'GP_16', 'GP_17', 'GP_18', 'GP_19', 'GP_20', 'GP_21', 'GP_22', 'GP_23', 'GP_24', 'GP_25', 'GP_26', 'GP_27', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'diabetes_model', 'chemical_stress', 'GEO

In [20]:
# Flag the top 2000 highly variable genes with the "seurat" flavor, passing the
# batch column so that selection takes batches into account.
sc.pp.highly_variable_genes(
    adata,
    batch_key=batch_key,
    flavor="seurat",
    n_top_genes=2000,
)

# 30-component PCA restricted to the genes flagged above.
# NOTE(review): newer scanpy releases reportedly deprecate `use_highly_variable`
# in favour of `mask_var` -- confirm against the installed version.
sc.tl.pca(
    adata,
    use_highly_variable=True,
    n_comps=30,
)

In [21]:
# Check how many genes were flagged as highly variable versus not.
adata.var.highly_variable.value_counts()

highly_variable
False    29202
True      2000
Name: count, dtype: int64

In [22]:
# Keep only the genes flagged as highly variable; .copy() makes the subset a
# standalone object before it replaces `adata`.
hvg_mask = adata.var["highly_variable"]
adata = adata[:, hvg_mask].copy()

In [23]:
# Save the gene-subsetted object under a separate "_processed" filename, next
# to the file it was loaded from.
adata.write_h5ad(data_dir / "pancreatic_islet_Hrovatin/pancreatic_islet_Hrovatin_processed.h5ad")

In [3]:
# Display the AnnData summary after subsetting to the selected genes.
adata

AnnData object with n_obs × n_vars = 301796 × 2000
    obs: 'batch_integration', 'dataset', 'design', 'cell_cycle_phase', 'ins_high', 'gcg_high', 'sst_high', 'ppy_high', 'cell_filtering', 'strain', 'age', 'cell_type_originalDataset_unified', 'cell_type_originalDataset', 'cell_type_reannotatedIntegrated', 'n_genes', 'mt_frac', 'doublet_score', 'log10_n_counts', 'age_approxDays', 'dataset__design__sample', 'cell_subtype_immune_reannotatedIntegrated', 'cell_subtype_endothelial_reannotatedIntegrated', 'emptyDrops_LogProb_scaled', 'cell_subtype_beta_coarse_reannotatedIntegrated', 'cell_subtype_beta_fine_reannotatedIntegrated', 'GP_1', 'GP_2', 'GP_3', 'GP_4', 'GP_5', 'GP_6', 'GP_7', 'GP_8', 'GP_9', 'GP_10', 'GP_11', 'GP_12', 'GP_13', 'GP_14', 'GP_15', 'GP_16', 'GP_17', 'GP_18', 'GP_19', 'GP_20', 'GP_21', 'GP_22', 'GP_23', 'GP_24', 'GP_25', 'GP_26', 'GP_27', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'diabetes_model', 'chemical_stress', 'GEO_

----

In [None]:
# Integration methods to benchmark, assembled group by group; the resulting
# list keeps the same order as before.
combined_keys = (
    ["unintegrated"]
    + ["scanorama", "liger", "harmony"]
    + ["scvi", "scanvi"]
    + ["concord", "concord_class"]
    + ["concord_decoder", "contrastive"]
)

In [None]:
# UMAP options forwarded to the pipeline: compute a 2-component UMAP for the
# output embeddings with 30 neighbours and min_dist 0.5.
umap_settings = dict(
    compute_umap=True,
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
)

# Run every method listed in `combined_keys` through concord's integration
# pipeline. The three returned objects are presumably per-method runtime, RAM
# and VRAM logs (going by their names) -- confirm against the concord docs.
# NOTE(review): the pipeline is seeded with 42 while the notebook-level `seed`
# variable is 0; left unchanged so results stay identical -- confirm which
# value is intended.
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    # --- data and annotation columns ---
    adata=adata,                  # input AnnData object
    methods=combined_keys,        # methods to run
    batch_key=batch_key,          # adata.obs column with batch labels
    class_key=state_key,          # adata.obs column with class labels
    count_layer="counts",         # layer holding the raw counts
    # --- model / runtime settings ---
    latent_dim=30,                # latent dimensionality of the embeddings
    device=device,                # torch device chosen in the setup cell
    return_corrected=False,       # do not store corrected expression matrices
    transform_batch=None,         # no specific batch to transform to
    seed=42,                      # random seed used by the pipeline
    verbose=True,                 # print progress messages
    **umap_settings,
)