# Benchmark: batch-integration methods on the dkd_Wilson dataset

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell runs, so edits to local packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import scanpy as sc
import time
from pathlib import Path
import torch
import concord as ccd
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import matplotlib as mpl

from matplotlib import font_manager, rcParams
# Font override intended for individual figures.
custom_rc = {'font.family': 'Arial'}

# Keep text editable in exported vector graphics: SVG text is written as text
# (not paths) and PDF output embeds Type 42 (TrueType) fonts.
mpl.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

  from pkg_resources import get_distribution, DistributionNotFound


In [3]:
# Project identifier; the dataset lives in a sub-directory of data_dir with this name.
proj_name = "dkd_Wilson"

# save_dir = f"../save/{proj_name}-{time.strftime('%b%d')}/"
# save_dir = Path(save_dir)
# save_dir.mkdir(parents=True, exist_ok=True)

# Root directory for input data (created if it does not exist yet).
data_dir = Path("../data/")
data_dir.mkdir(parents=True, exist_ok=True)

# Prefer GPU 3, but only when that index actually exists on this machine.
# A hard-coded 'cuda:3' is invalid on hosts with CUDA but fewer than four GPUs,
# so fall back to GPU 0 there and to the CPU when CUDA is unavailable.
preferred_gpu = 3
if torch.cuda.is_available():
    gpu_index = preferred_gpu if preferred_gpu < torch.cuda.device_count() else 0
    device = torch.device(f'cuda:{gpu_index}')
else:
    device = torch.device('cpu')
print(device)

# Global seed applied through concord's seeding helper.
seed = 0
ccd.ul.set_seed(seed)

# Timestamp suffix (e.g. 'Jun15-0058') for naming files produced by this run.
file_suffix = time.strftime('%b%d-%H%M')
file_suffix

cpu


'Jun15-0058'

In [4]:
# Load the dataset from <data_dir>/<proj_name>/<proj_name>.h5ad.
# The path is derived from proj_name so the project name is defined in one place only.
adata = sc.read_h5ad(data_dir / proj_name / f"{proj_name}.h5ad")

In [6]:
# Inspect the loaded AnnData object (dimensions and available annotations).
adata

AnnData object with n_obs × n_vars = 39176 × 36398
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'doublet_id', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'


In [7]:
# Keep a copy of the raw matrix in adata.layers["counts"] so later steps can
# request it by layer name.
# Fail with an explicit message when the file carries no .raw slot, instead of
# the opaque AttributeError raised by `None.X`.
if adata.raw is None:
    raise ValueError(
        "adata.raw is not set; cannot populate adata.layers['counts'] from adata.raw.X"
    )
adata.layers["counts"] = adata.raw.X.copy()

In [8]:
# save (overwrite) the adata object with layers
# adata.write_h5ad(data_dir / "dkd_Wilson/dkd_Wilson.h5ad")


In [9]:
# Number of observations contributed by each donor.
adata.obs['donor_id'].value_counts()

donor_id
healthy_6     8706
control_3     5278
healthy_5     3819
healthy_4     3707
control_1     3605
diabetic_1    2996
diabetic_3    2587
diabetic_4    2574
control_2     2552
diabetic_2    2468
diabetic_5     884
Name: count, dtype: int64

In [10]:
# Number of observations per annotated cell type.
adata.obs['cell_type'].value_counts()

cell_type
epithelial cell of proximal tubule                           11478
kidney distal convoluted tubule epithelial cell               8469
kidney loop of Henle thick ascending limb epithelial cell     7749
renal principal cell                                          3295
renal alpha-intercalated cell                                 2029
endothelial cell                                              1702
parietal epithelial cell                                      1103
podocyte                                                      1069
kidney loop of Henle thin ascending limb epithelial cell       679
renal beta-intercalated cell                                   670
mesangial cell                                                 385
fibroblast                                                     338
leukocyte                                                      210
Name: count, dtype: int64

In [11]:
# adata.obs columns used downstream: the batch variable and the class labels.
batch_key, state_key = 'donor_id', 'cell_type'

In [None]:
# adata.obs[batch_key] = adata.obs[batch_key].astype("category")
# print(adata.obs[batch_key].dtype)


# Preprocess data

In [None]:
# import scanpy as sc
# from pathlib import Path
# data_dir = Path(f"../data/")

In [12]:
# Show the AnnData summary again before highly-variable-gene selection.
adata

AnnData object with n_obs × n_vars = 39176 × 36398
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'doublet_id', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'


In [13]:
# Flag the top 2000 highly variable genes, with batch_key passed so selection
# accounts for the batch column.
# NOTE(review): flavor="seurat" expects log-normalized values in adata.X —
# confirm that X (as opposed to the raw counts copied into adata.layers["counts"])
# is log-normalized for this dataset.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, flavor="seurat", batch_key=batch_key)
# 30-component PCA restricted to the genes flagged in adata.var["highly_variable"].
# NOTE(review): `use_highly_variable` is deprecated in scanpy >= 1.10 in favor of
# mask_var="highly_variable"; switch once the minimum supported scanpy version is confirmed.
sc.tl.pca(adata, n_comps=30, use_highly_variable=True)

In [14]:
# Check how many genes were flagged as highly variable.
adata.var.highly_variable.value_counts()

highly_variable
False    34398
True      2000
Name: count, dtype: int64

In [15]:
# Keep only the genes flagged as highly variable; .copy() turns the sliced
# view into an independent AnnData object.
hvg_mask = adata.var["highly_variable"]
adata = adata[:, hvg_mask].copy()

In [16]:
# Save the processed object as <data_dir>/<proj_name>/<proj_name>_processed.h5ad.
# The path is derived from proj_name so the project name is defined in one place only.
adata.write_h5ad(data_dir / proj_name / f"{proj_name}_processed.h5ad")

In [17]:
# Confirm the object now holds only the selected highly variable genes.
adata

AnnData object with n_obs × n_vars = 39176 × 2000
    obs: 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'organism_ontology_term_id', 'sample_uuid', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'suspension_uuid', 'suspension_type', 'library_uuid', 'assay_ontology_term_id', 'mapped_reference_annotation', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'disease_ontology_term_id', 'reported_diseases', 'sex_ontology_term_id', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'percent.rpl', 'percent.rps', 'doublet_id', 'nCount_SCT', 'nFeature_SCT', 'seurat_clusters', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches

----

In [None]:
# Integration methods to run, in benchmark order (one entry per line).
combined_keys = [
    "unintegrated",
    "scanorama",
    "liger",
    "harmony",
    "scvi",
    "scanvi",
    "concord",
    "concord_class",
    "concord_decoder",
    "contrastive",
]

In [None]:
# Run every method in `combined_keys` on the HVG-subset AnnData and collect the
# three logs returned by the pipeline.
# NOTE(review): the pipeline receives seed=42 while the notebook-level `seed`
# variable is 0 — confirm the two different seeds are intentional.
pipeline_kwargs = {
    "adata": adata,                 # input AnnData object
    "methods": combined_keys,       # list of methods to run
    "batch_key": batch_key,         # adata.obs column holding batch info
    "count_layer": "counts",        # layer name containing raw counts
    "class_key": state_key,         # adata.obs column holding class labels
    "latent_dim": 30,               # latent dimensionality for the embeddings
    "device": device,               # torch device selected in the config cell
    "return_corrected": False,      # do not store corrected expression matrices
    "transform_batch": None,        # no target batch specified
    "seed": 42,                     # random seed passed to the pipeline
    "compute_umap": True,           # run UMAP on the output embeddings
    "umap_n_components": 2,
    "umap_n_neighbors": 30,
    "umap_min_dist": 0.5,
    "verbose": True,                # print progress messages
}
time_log, ram_log, vram_log = ccd.ul.run_integration_methods_pipeline(**pipeline_kwargs)