In [2]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Basic setup

In [3]:
import concord as ccd
import scanpy as sc
import torch
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')
import time
from pathlib import Path
# Use the first CUDA GPU when one is visible to torch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Seed value kept in one place for downstream use.
seed = 0

In [4]:
import time
from pathlib import Path

# Project identifiers: every input/output path below is derived from proj_name.
proj_name = "pancreasv1_5k"
file_name = proj_name
# Timestamp string built at cell-execution time (month abbreviation, day, hour, minute).
file_suffix = time.strftime('%b%d-%H%M')
seed = 0

# Per-project save and data directories; both are created if missing (parents included).
save_dir = Path(f"../save/{proj_name}")
data_dir = Path(f"../data/{proj_name}")
for project_dir in (save_dir, data_dir):
    project_dir.mkdir(parents=True, exist_ok=True)


In [5]:
# Read the project's input AnnData file from the data directory.
dataset_path = data_dir / "dataset.h5ad"
adata = sc.read(dataset_path)

In [6]:
# Rebuild X from a copy of the "counts" layer (presumably raw counts — confirm) so that
# re-running this cell never normalizes already-normalized values.
stored_counts = adata.layers["counts"]
adata.X = stored_counts.copy()

# Library-size normalization with target_sum=1e4, followed by a log1p transform, both in place.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [7]:
# Find highly variable genes (flags them in adata.var; subset=False keeps all genes for now).
#
# flavor='seurat_v3' expects raw count data, but adata.X has already been normalized and
# log-transformed by the time this cell runs. Point the call at the "counts" layer so the
# ranking is computed on counts rather than on log-normalized values. Warnings are globally
# silenced in the setup cell, so a scanpy warning about non-count input would not be shown.
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=5000,
    flavor='seurat_v3',
    layer='counts',
    subset=False,
)

In [8]:
# Keep only the genes flagged as highly variable, then write the subset next to the raw data.
# The file name written here is the one passed to the job generator as --adata_filename.
hvg_path = data_dir / f"{file_name}_preprocessed_HVG.h5ad"
hvg_mask = adata.var.highly_variable
adata = adata[:, hvg_mask].copy()  # .copy() turns the sliced view into a standalone object
adata.write_h5ad(hvg_path)
print(f"✅ Preprocessed data saved to {hvg_path}")

✅ Preprocessed data saved to ../data/pancreasv1_5k/pancreasv1_5k_preprocessed_HVG.h5ad


In [5]:
# Load the preprocessed HVG file from disk (same path the preprocessing cell writes to).
hvg_path = data_dir / f"{file_name}_preprocessed_HVG.h5ad"
adata = sc.read_h5ad(hvg_path)

In [9]:
import subprocess, json

# Generate the benchmark job files for the Python-based integration methods by calling
# ./generate_py_jobs.py. Per its printed output, it emits one .py and one .sh per method.
py_methods = ["scvi", "harmony", "scanorama", "liger", "unintegrated", "concord_hcl", "concord_knn", "contrastive"]
output_dir = '../jobs'
# NOTE: this rebinds `device` from the torch.device created in the setup cell to a plain
# string that is forwarded on the command line.
device = 'auto'
conda_env = 'cellpath'
batch_key = "batch"
state_key = 'None'  # forwarded as the literal string "None"
latent_dim = '50'  # Adjust as needed, but should match the encoder_dims in concord_args
# check=True: raise CalledProcessError if the generator exits non-zero, instead of silently
# continuing with missing or partial job files.
subprocess.run([
    "python", "./generate_py_jobs.py",
    "--proj_name", proj_name,
    "--adata_filename", f"{file_name}_preprocessed_HVG.h5ad",
    "--methods", *py_methods,
    "--batch_key", batch_key,
    "--state_key", state_key,
    "--latent_dim", latent_dim,
    "--output_dir", output_dir,
    "--device", device,
    "--mem", "32G",  # Adjust memory as needed
    "--conda_env", conda_env,
    "--runtime", "1:30:00",
    "--mode", "wynton"
], check=True)


✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_scvi.py
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_scvi.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_harmony.py
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_harmony.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_scanorama.py
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_scanorama.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_liger.py
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_liger.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_unintegrated.py
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_unintegrated.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_concord_hcl.py
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_concord_hcl.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_concord_knn.py
✅ Generated: benchmark_pancreasv1_5k/bench

CompletedProcess(args=['python', './generate_py_jobs.py', '--proj_name', 'pancreasv1_5k', '--adata_filename', 'pancreasv1_5k_preprocessed_HVG.h5ad', '--methods', 'scvi', 'harmony', 'scanorama', 'liger', 'unintegrated', 'concord_hcl', 'concord_knn', 'contrastive', '--batch_key', 'batch', '--state_key', 'None', '--latent_dim', '50', '--output_dir', '../jobs', '--device', 'auto', '--mem', '32G', '--conda_env', 'cellpath', '--runtime', '1:30:00', '--mode', 'wynton'], returncode=0)

In [7]:
# Export adata as a VisCello project inside the data directory. The Seurat job generator
# is later pointed at this folder via --eset_dir.
viscello_dir = data_dir / f"viscello_{proj_name}"
# NOTE(review): organism is set to 'mmu' — confirm this matches the species of this dataset.
ccd.ul.anndata_to_viscello(
    adata,
    output_dir=viscello_dir,
    project_name=proj_name,
    organism='mmu',
)

VisCello project created at ../data/pancreasv1_5k/viscello_pancreasv1_5k


In [10]:
# Generate script for Seurat
# Calls ./generate_seurat_script.py, which per its printed output emits one .R and one .sh
# per method. Relies on batch_key, state_key, latent_dim, conda_env, proj_name and data_dir
# already being defined by earlier cells.
import subprocess
r_methods = ["seurat_cca", "seurat_rpca"]
output_dir = '../jobs'
device = 'auto'
# check=True: raise CalledProcessError if the generator exits non-zero, instead of silently
# continuing with missing or partial job files.
subprocess.run([
    "python", "./generate_seurat_script.py",
    "--proj_name", proj_name,
    # data_dir is relative to this notebook; the extra '../' makes the path one level deeper
    "--eset_dir", '../'+ str(data_dir / f"viscello_{proj_name}"),   # <- folder w/ eset.rds
    "--methods", *r_methods,
    "--batch_key", batch_key,
    "--state_key", state_key,
    "--latent_dim", latent_dim,
    "--mem", "80G",  # Adjust memory as needed
    "--runtime", "12:00:00",
    "--output_dir", output_dir,
    "--device", device,
    "--conda_env", conda_env
], check=True)

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_seurat_cca.R
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_seurat_cca.sh

✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_seurat_rpca.R
✅ Generated: benchmark_pancreasv1_5k/benchmark_pancreasv1_5k_seurat_rpca.sh



CompletedProcess(args=['python', './generate_seurat_script.py', '--proj_name', 'pancreasv1_5k', '--eset_dir', '../../data/pancreasv1_5k/viscello_pancreasv1_5k', '--methods', 'seurat_cca', 'seurat_rpca', '--batch_key', 'batch', '--state_key', 'None', '--latent_dim', '50', '--mem', '80G', '--runtime', '12:00:00', '--output_dir', '../jobs', '--device', 'auto', '--conda_env', 'cellpath'], returncode=0)

In [11]:
# Write an executable helper script that qsub-submits every generated job script
# (benchmark_<proj>_*.sh) found in the project's job folder.
proj_folder = Path(output_dir) / f"benchmark_{proj_name}"   # ../jobs/benchmark_<proj>
# parents=True so the defensive mkdir also succeeds when the parent jobs folder is missing
proj_folder.mkdir(parents=True, exist_ok=True)

submit_all = proj_folder / f"submit_all_{proj_name}.sh"
# Explicit UTF-8: the header comment written below contains a non-ASCII dash, which must not
# depend on the platform's default text encoding.
with submit_all.open("w", encoding="utf-8") as f:
    f.write("#!/bin/bash\n")
    f.write("# Auto-generated — submits every job for this project\n")
    f.write("# Run from this folder, or let the script cd into it.\n\n")
    f.write('cd "$(dirname "$0")"\n\n')          # ensures we're in the right dir
    # sorted() gives a stable submission order; the glob pattern does not match submit_all itself
    for sh_file in sorted(proj_folder.glob(f"benchmark_{proj_name}_*.sh")):
        f.write(f'qsub "{sh_file.name}"\n')

submit_all.chmod(0o755)  # make the script directly executable
print(f"📌  Run “{submit_all}” to queue every job.")

📌  Run “../jobs/benchmark_pancreasv1_5k/submit_all_pancreasv1_5k.sh” to queue every job.
