# Benchmark

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
# rc overrides kept in a dict for individual plots; nothing in this cell
# applies them globally.
custom_rc = {'font.family': 'Arial'}

# Keep text as editable text in SVG output and embed TrueType (Type 42) fonts
# in PDF output.
mpl.rcParams.update({'svg.fonttype': 'none', 'pdf.fonttype': 42})

  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
# Run configuration: project name, output/data folders, compute device, seed.
proj_name = "endothelium_subset_TabulaSapiens"

# Per-project output folder, tagged with the current month and day (e.g. "Jun13").
save_dir = Path(f"../save/{proj_name}-{time.strftime('%b%d')}/")
save_dir.mkdir(parents=True, exist_ok=True)

# Input data folder (created if it does not exist yet).
data_dir = Path("../data/")
data_dir.mkdir(parents=True, exist_ok=True)

# Prefer GPU 3 when it exists. On machines with fewer GPUs 'cuda:3' is an
# invalid device ordinal and would only fail later, when tensors are moved to
# it, so fall back to the first GPU; use the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:3' if torch.cuda.device_count() > 3 else 'cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Notebook-level random seed.
seed = 0
ccd.ul.set_seed(seed)

# Timestamp string (month/day-hour/minute, e.g. 'Jun13-0330'); echoed as the cell output.
file_suffix = time.strftime('%b%d-%H%M')
file_suffix

cpu


'Jun13-0330'

In [4]:
# Load the AnnData object. The path is derived from proj_name instead of
# repeating the project name as a literal, so it cannot drift from the
# configuration cell.
adata = sc.read_h5ad(data_dir / proj_name / f"{proj_name}.h5ad")

In [5]:
# Display the AnnData summary (dimensions and annotation fields) right after loading.
adata

AnnData object with n_obs × n_vars = 73195 × 61759
    obs: 'donor_id', 'tissue_in_publication', 'anatomical_position', 'method', 'cdna_plate', 'library_plate', 'notes', 'cdna_well', 'assay_ontology_term_id', 'sample_id', 'replicate', '10X_run', 'ambient_removal', 'donor_method', 'donor_assay', 'donor_tissue', 'donor_tissue_assay', 'cell_type_ontology_term_id', 'compartment', 'broad_cell_class', 'free_annotation', 'manually_annotated', 'published_2022', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ercc', 'pct_counts_ercc', '_scvi_batch', '_scvi_labels', 'scvi_leiden_donorassay_full', 'ethnicity_original', 'scvi_leiden_res05_compartment', 'sample_number', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'disease_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', 

In [6]:
# Save the raw counts to adata.layers["counts"] (the layer later passed as
# count_layer to the integration pipeline).
# Fail early with a clear message when there is no raw matrix, or when its
# genes do not line up with adata.var: a layer shares adata's var axis, so a
# different gene order would silently misalign the counts.
if adata.raw is None:
    raise ValueError("adata.raw is missing; cannot populate layers['counts'] with raw counts.")
if not adata.raw.var_names.equals(adata.var_names):
    raise ValueError("adata.raw and adata have different genes; layers['counts'] would be misaligned.")
adata.layers["counts"] = adata.raw.X.copy()

In [7]:
# Save (overwrite) the source .h5ad so the file on disk also contains the layers.
# NOTE: this replaces the input file in place. The path is derived from
# proj_name rather than repeating the project name as a literal.
adata.write_h5ad(data_dir / proj_name / f"{proj_name}.h5ad")


In [8]:
# Number of cells contributed by each donor (donor_id is used as the batch key below).
adata.obs['donor_id'].value_counts()

donor_id
TSP2     13938
TSP14    11560
TSP25    10552
TSP21    10370
TSP27     5786
TSP4      3934
TSP12     3324
TSP10     3225
TSP1      2744
TSP9      2029
TSP30     1063
TSP6       902
TSP8       822
TSP7       791
TSP19      719
TSP15      624
TSP28      434
TSP17      191
TSP26       98
TSP3        56
TSP5        33
Name: count, dtype: int64

In [9]:
# Number of cells per annotated cell type (cell_type is used as the state/class key below).
adata.obs['cell_type'].value_counts()

cell_type
endothelial cell                         33700
capillary endothelial cell               15298
cardiac endothelial cell                 10092
vein endothelial cell                     4698
endothelial cell of artery                3471
endothelial cell of lymphatic vessel      2438
retinal blood vessel endothelial cell     2091
endothelial cell of vascular tree          875
colon endothelial cell                     348
endothelial cell of arteriole              132
endothelial cell of venule                  52
Name: count, dtype: int64

In [10]:
# obs columns used by the later cells: donor as the batch variable,
# annotated cell type as the state/class label.
batch_key, state_key = 'donor_id', 'cell_type'

In [None]:
# adata.obs[batch_key] = adata.obs[batch_key].astype("category")
# print(adata.obs[batch_key].dtype)


# Preprocess data

In [None]:
# adata = sc.read_h5ad(data_dir / "endothelium_subset_TabulaSapiens/endothelium_subset_TabulaSapiens.h5ad")

In [None]:
# Inspect the AnnData object before feature selection.
adata

In [11]:
# Flag the top 2000 highly variable genes, computed with donor batches taken
# into account via batch_key.
# NOTE(review): flavor="seurat" expects log-normalized values in adata.X —
# confirm that is what the loaded file stores.
sc.pp.highly_variable_genes(
    adata,
    flavor="seurat",
    n_top_genes=2000,
    batch_key=batch_key,
)

# 30-component PCA restricted to the genes flagged above.
sc.tl.pca(
    adata,
    use_highly_variable=True,
    n_comps=30,
)

In [12]:
# Count how many genes were flagged as highly variable.
adata.var.highly_variable.value_counts()

highly_variable
False    59759
True      2000
Name: count, dtype: int64

In [13]:
# Keep only the genes flagged as highly variable; .copy() turns the sliced
# view into a standalone AnnData object.
adata = adata[:, adata.var["highly_variable"]].copy()

In [14]:
# Save the highly-variable-gene subset next to the source file. The path is
# derived from proj_name rather than repeating the project name as a literal.
adata.write_h5ad(data_dir / proj_name / f"{proj_name}_processed.h5ad")

In [15]:
# Summary of the AnnData object after subsetting to highly variable genes.
adata

AnnData object with n_obs × n_vars = 73195 × 2000
    obs: 'donor_id', 'tissue_in_publication', 'anatomical_position', 'method', 'cdna_plate', 'library_plate', 'notes', 'cdna_well', 'assay_ontology_term_id', 'sample_id', 'replicate', '10X_run', 'ambient_removal', 'donor_method', 'donor_assay', 'donor_tissue', 'donor_tissue_assay', 'cell_type_ontology_term_id', 'compartment', 'broad_cell_class', 'free_annotation', 'manually_annotated', 'published_2022', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ercc', 'pct_counts_ercc', '_scvi_batch', '_scvi_labels', 'scvi_leiden_donorassay_full', 'ethnicity_original', 'scvi_leiden_res05_compartment', 'sample_number', 'organism_ontology_term_id', 'suspension_type', 'tissue_type', 'disease_ontology_term_id', 'is_primary_data', 'tissue_ontology_term_id', 'sex_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'cell_type', 'assay', 'disease', 'organism', 'sex', '

----

In [None]:
# Integration methods to benchmark, in the order they are handed to the pipeline.
combined_keys = [
    "unintegrated",
    "scanorama",
    "liger",
    "harmony",
    "scvi",
    "scanvi",
    "concord",
    "concord_class",
    "concord_decoder",
    "contrastive",
]

In [None]:
# Run every method in combined_keys through the integration pipeline and keep
# the three logs it returns (bound here as time_log, ram_log, vram_log).
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(
    adata=adata,                # input AnnData object
    methods=combined_keys,      # list of methods to run
    batch_key=batch_key,        # column in adata.obs for batch info
    count_layer="counts",       # layer name containing raw counts
    class_key=state_key,        # column in adata.obs for class labels (used in SCANVI and CONCORD variants)
    latent_dim=30,              # latent dimensionality for PCA and embeddings
    device=device,              # torch device chosen in the configuration cell
    return_corrected=False,     # whether to store corrected expression matrices
    transform_batch=None,       # optionally specify a batch to transform to in scVI
    seed=seed,                  # reuse the notebook-level seed rather than a second hard-coded value (was 42)
    compute_umap=True,          # run UMAP for all output embeddings
    umap_n_components=2,
    umap_n_neighbors=30,
    umap_min_dist=0.5,
    verbose=True,               # print progress messages
)