In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell runs so that edits to source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Basic setup

In [2]:
import concord as ccd
import scanpy as sc
import torch
import warnings
from pathlib import Path
import time  # stdlib; needed below for the date-stamped folder and file names

# Silence all warnings for the session. NOTE: this also hides deprecation and
# data-quality warnings emitted by the libraries used later in the notebook.
warnings.filterwarnings('ignore')

# --- Project identifiers ------------------------------------------------------
proj_name = "CBCE"   # used in folder names and generated job names
file_name = "CBCE"   # stem of the preprocessed .h5ad file
seed = 0

# --- Directories (created on demand) ------------------------------------------
data_dir = Path('../data/CBCE')
data_dir.mkdir(parents=True, exist_ok=True)

# Read the clock once so that the folder date and the file suffix cannot
# disagree when this cell happens to run across midnight.
_run_time = time.localtime()
save_dir = Path(f"../save/dev_{proj_name}-{time.strftime('%b%d', _run_time)}/")
save_dir.mkdir(parents=True, exist_ok=True)
file_suffix = time.strftime('%b%d-%H%M', _run_time)

# --- Compute device -----------------------------------------------------------
# GPU index 3 is preferred, but the index is clamped to the GPUs that actually
# exist so the device stays usable on machines with fewer than four cards.
if torch.cuda.is_available():
    _gpu_index = min(3, torch.cuda.device_count() - 1)
    device = torch.device(f'cuda:{_gpu_index}')
else:
    device = torch.device('cpu')

In [3]:
# Load the input AnnData object from a fixed, date-stamped file on disk.
raw_h5ad_path = '../data/CE_CB/adata_cbce_Jan30-1028.h5ad'
adata = sc.read_h5ad(raw_h5ad_path)

In [4]:
# Keep both batch granularities: `batch_fine` is the original `batch` column,
# `batch_broad` is the `dataset3` column cast to str.
adata.obs['batch_fine'] = adata.obs['batch'].copy()
adata.obs['batch_broad'] = adata.obs['dataset3'].astype(str).copy()

# Restart X from the `counts` layer, compute QC metrics on it, then
# normalize per cell and log1p-transform X in place.
adata.X = adata.layers["counts"].copy()
sc.pp.calculate_qc_metrics(adata, inplace=True)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# From here on `batch` holds the broad grouping (the fine one stays in `batch_fine`).
adata.obs['batch'] = adata.obs['batch_broad'].copy()

# flavor='seurat_v3' expects raw counts, but X is log-normalized at this point,
# so the selection reads from the untouched `counts` layer instead of X.
sc.pp.highly_variable_genes(
    adata,
    flavor='seurat_v3',
    n_top_genes=10000,
    layer='counts',
    subset=False,
)
# NOTE(review): newer scanpy releases deprecate `use_highly_variable` in favour
# of `mask_var` — confirm against the installed version before changing it.
sc.tl.pca(adata, n_comps=300, svd_solver='arpack', use_highly_variable=True)

# Save the full object (all genes, HVG flags, PCA); build the path only once so
# the written file and the printed message cannot drift apart.
processed_path = data_dir / f"{proj_name}_{file_suffix}.h5ad"
adata.write_h5ad(processed_path)
print(f"Saved adata to {processed_path}")

Saved adata to ../data/CBCE/CBCE_Jul16-1955.h5ad


### Run pipeline

In [None]:
# Reload a previously saved snapshot from a hard-coded, time-stamped path.
snapshot_h5ad_path = "../data/CBCE/CBCE_Jul16-1955.h5ad"
adata = sc.read_h5ad(snapshot_h5ad_path)

In [None]:
# Keep only the genes flagged as highly variable, then persist the reduced
# object under the "<file_name>_preprocessed.h5ad" name inside data_dir.
preprocessed_path = data_dir / f"{file_name}_preprocessed.h5ad"
hvg_mask = adata.var.highly_variable
adata = adata[:, hvg_mask].copy()
adata.write_h5ad(preprocessed_path)
print(f"✅ Preprocessed data saved to {preprocessed_path}")

✅ Preprocessed data saved to ../data/CBCE/CBCE_preprocessed.h5ad


: 

In [None]:
# Export the current AnnData object through concord's VisCello helper into a
# project-specific sub-folder of data_dir.
viscello_dir = data_dir / f"viscello_{proj_name}"
ccd.ul.anndata_to_viscello(
    adata,
    output_dir=viscello_dir,
    project_name=proj_name,
    organism='cel',
)

#### Create jobs

In [3]:
# Read back the preprocessed file. It is written and handed to the job
# generator under the `file_name` stem, so the same variable is used here
# (previously `proj_name`, which only worked because both are "CBCE").
adata = sc.read_h5ad(data_dir / f"{file_name}_preprocessed.h5ad")

In [4]:
# Sanity check: list every cell whose 'batch' label is missing (prints nothing
# when the column has no NaN).
missing_batch = adata.obs['batch'].isna()
if missing_batch.any():
    print("Rows with NaN in 'batch':")
    print(adata.obs[missing_batch])

In [9]:
# Keyword arguments for the CONCORD runs; key order is kept as-is.
concord_args = dict(
    # Element-level masking probability (recommended range: 0.2-0.5).
    element_mask_prob=0.4,
    # Feature-level masking probability (recommended range: 0.2-0.5).
    feature_mask_prob=0.2,
    # Temperature of the contrastive loss (recommended range: 0.1-0.5).
    clr_temperature=0.4,
    # Number of training epochs; adjust as needed.
    n_epochs=15,
    # Where the model and results are saved. The extra '../' prefix presumably
    # compensates for jobs running one directory deeper than this notebook —
    # TODO confirm against the generated job scripts.
    save_dir=f"../{save_dir}",
)

In [10]:
import json
import subprocess

# Integration methods for which Python job scripts are generated.
py_methods = ["scvi", "harmony", "scanorama", "liger", "unintegrated", "concord_hcl", "concord_knn", "contrastive"]
output_dir = '../jobs'
# NOTE: this rebinds `device` (a torch.device in the setup cell) to a plain
# string, because every value below is passed on the command line.
device = 'auto'
conda_env = 'cellpath'   # alternative environment: 'concord'
batch_key = 'batch'
state_key = 'None'       # the literal string "None", not the Python None object
latent_dim = '300'       # passed as a string because it becomes a CLI argument

generate_py_jobs_cmd = [
    "python", "./generate_py_jobs.py",
    "--proj_name", proj_name,
    "--adata_filename", f"{file_name}_preprocessed.h5ad",
    "--methods", *py_methods,
    "--batch_key", batch_key,
    "--state_key", state_key,
    "--latent_dim", latent_dim,
    "--output_dir", output_dir,
    "--device", device,
    "--conda_env", conda_env,
    "--mem", "32G",  # Adjust memory as needed
    "--runtime", "1:00:00",
    "--concord_kwargs", json.dumps(concord_args),
    "--mode", "wynton",
]

# check=True raises CalledProcessError when the generator exits non-zero, so a
# failed generation cannot go unnoticed and leave stale or missing job files.
# The call stays the last expression so the CompletedProcess is still displayed.
subprocess.run(generate_py_jobs_cmd, check=True)


✅ Generated: benchmark_CBCE/benchmark_CBCE_scvi.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_scvi.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_harmony.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_harmony.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_scanorama.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_scanorama.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_liger.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_liger.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_unintegrated.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_unintegrated.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_concord_hcl.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_concord_hcl.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_concord_knn.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_concord_knn.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_contrastive.py
✅ Generated: benchmark_CBCE/benchmark_CBCE_contrastive.sh



CompletedProcess(args=['python', './generate_py_jobs.py', '--proj_name', 'CBCE', '--adata_filename', 'CBCE_preprocessed.h5ad', '--methods', 'scvi', 'harmony', 'scanorama', 'liger', 'unintegrated', 'concord_hcl', 'concord_knn', 'contrastive', '--batch_key', 'batch', '--state_key', 'None', '--latent_dim', '300', '--output_dir', '../jobs', '--device', 'auto', '--conda_env', 'cellpath', '--mem', '32G', '--runtime', '1:00:00', '--concord_kwargs', '{"element_mask_prob": 0.4, "feature_mask_prob": 0.2, "clr_temperature": 0.4, "n_epochs": 15, "save_dir": "../../save/dev_CBCE-Jul16"}', '--mode', 'wynton'], returncode=0)

In [11]:
# Write submit_all_<proj>.sh: a helper that qsub-submits every generated
# benchmark_<proj>_*.sh script found in the project's job folder.
proj_folder = Path(output_dir) / f"benchmark_{proj_name}"   # ../jobs/benchmark_<proj>
# parents=True makes the defensive mkdir work even when ../jobs itself is missing.
proj_folder.mkdir(parents=True, exist_ok=True)

submit_all = proj_folder / f"submit_all_{proj_name}.sh"

# Only scripts that exist right now are listed. NOTE(review): the Seurat job
# scripts are generated by a later cell, so re-run this cell afterwards if they
# should be submitted by this helper as well.
job_scripts = sorted(proj_folder.glob(f"benchmark_{proj_name}_*.sh"))

# Explicit UTF-8: the header contains a non-ASCII dash, which a non-UTF-8
# locale default encoding could fail to encode.
with submit_all.open("w", encoding="utf-8") as f:
    f.write("#!/bin/bash\n")
    f.write("# Auto-generated — submits every job for this project\n")
    f.write("# Run from this folder, or let the script cd into it.\n\n")
    f.write('cd "$(dirname "$0")"\n\n')          # ensures we're in the right dir
    f.writelines(f'qsub "{sh_file.name}"\n' for sh_file in job_scripts)

submit_all.chmod(0o755)
print(f"📌  Run “{submit_all}” to queue every job.")

📌  Run “../jobs/benchmark_CBCE/submit_all_CBCE.sh” to queue every job.


In [12]:
# Generate the R job scripts for the Seurat integration methods.
import subprocess

r_methods = ["seurat_cca", "seurat_rpca"]
output_dir = '../jobs'
device = 'auto'          # plain string: passed straight to the command line
conda_env = 'cellpath'
batch_key = 'batch'
state_key = 'None'       # the literal string "None", not the Python None object
latent_dim = '300'       # passed as a string because it becomes a CLI argument

generate_seurat_cmd = [
    "python", "./generate_seurat_script.py",
    "--proj_name", proj_name,
    "--eset_dir", '../' + str(data_dir / f"viscello_{proj_name}"),   # <- folder w/ eset.rds
    "--methods", *r_methods,
    "--batch_key", batch_key,
    "--state_key", state_key,
    "--latent_dim", latent_dim,
    "--mem", "250G",  # Adjust memory as needed
    "--runtime", "72:00:00",
    "--output_dir", output_dir,
    "--device", device,
    "--conda_env", conda_env,
]

# check=True raises CalledProcessError when the generator exits non-zero, so a
# failed generation cannot go unnoticed. The call stays the last expression so
# the CompletedProcess is still displayed as the cell output.
subprocess.run(generate_seurat_cmd, check=True)

✅ Generated: benchmark_CBCE/benchmark_CBCE_seurat_cca.R
✅ Generated: benchmark_CBCE/benchmark_CBCE_seurat_cca.sh

✅ Generated: benchmark_CBCE/benchmark_CBCE_seurat_rpca.R
✅ Generated: benchmark_CBCE/benchmark_CBCE_seurat_rpca.sh



CompletedProcess(args=['python', './generate_seurat_script.py', '--proj_name', 'CBCE', '--eset_dir', '../../data/CBCE/viscello_CBCE', '--methods', 'seurat_cca', 'seurat_rpca', '--batch_key', 'batch', '--state_key', 'None', '--latent_dim', '300', '--mem', '250G', '--runtime', '72:00:00', '--output_dir', '../jobs', '--device', 'auto', '--conda_env', 'cellpath'], returncode=0)

In [9]:
# ------------------------------------------------------------------
# create submit_sequential_<proj>.sh  (runs each *.py job in order)
# ------------------------------------------------------------------
# Requires `proj_folder` and `proj_name` from earlier cells. The generated
# script runs every benchmark_<proj>_*.py in the job folder one after another,
# appends output to <job>.log, skips jobs whose log already contains
# "finished OK", and exits non-zero if any job failed in this run.
sequential_submit = proj_folder / f"submit_sequential_{proj_name}.sh"

# f-string: `{proj_name}` is substituted by Python; doubled braces (`${{...}}`)
# produce literal bash parameter expansions in the written file.
sequential_template = f"""#!/usr/bin/env bash
set -euo pipefail
cd "$(dirname "$0")"            # work inside the folder that holds this script
shopt -s nullglob

status=0                        # set to 1 as soon as any job fails

for job in benchmark_{proj_name}_*.py; do
  base=${{job%.py}}
  log="${{base}}.log"

  # ───────────────────────────────────────────────────────────────
  # skip if a previous run finished successfully
  # ───────────────────────────────────────────────────────────────
  if [[ -f "$log" ]] && grep -q "finished OK" "$log"; then
      echo ">>> SKIP $job  — already completed"
      continue
  fi

  echo ">>> $job   $(date)" | tee -a "$log"
  if python "$job" >>"$log" 2>&1; then
      echo ">>> finished OK" | tee -a "$log"
  else
      echo ">>> FAILED"      | tee -a "$log"
      status=1
  fi
done

# report failures to the caller instead of always exiting 0
exit "$status"
"""

# Explicit UTF-8: the template contains box-drawing characters and a long dash,
# which a non-UTF-8 locale default encoding could fail to encode.
sequential_submit.write_text(sequential_template, encoding="utf-8")
sequential_submit.chmod(0o755)
print(f"📌  Run “{sequential_submit}” to queue jobs sequentially.")


📌  Run “../jobs/benchmark_CBCEcombineN2/submit_sequential_CBCEcombineN2.sh” to queue jobs sequentially.


### Collect results