In [1]:
# IPython autoreload in mode 2: modified modules are re-imported before each cell
# runs, so edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Basic setup

In [2]:
import concord as ccd
import scanpy as sc
import torch
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

import time

# Project name and input/output folders for the combined CBCE + N2 data.
proj_name = "CBCEcombineN2"
data_dir = Path('../data/CBCEcombineN2')
data_dir.mkdir(parents=True, exist_ok=True)

# Take a single timestamp so the dated save folder and the file suffix can never
# disagree (two separate strftime calls could straddle a minute/day boundary).
run_time = time.localtime()
save_dir = Path(f"../save/dev_{proj_name}-{time.strftime('%b%d', run_time)}/")
save_dir.mkdir(parents=True, exist_ok=True)
file_suffix = time.strftime('%b%d-%H%M', run_time)

# GPU index 3 is the preferred device. Only select it when that index actually
# exists; otherwise fall back to the default CUDA device, or to the CPU when no
# GPU is visible at all.
preferred_gpu = 3
if torch.cuda.is_available() and torch.cuda.device_count() > preferred_gpu:
    device = torch.device(f'cuda:{preferred_gpu}')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

seed = 0

In [3]:
# Display the installed concord version so the notebook records which release was used.
ccd.__version__

'1.0.6'

In [22]:
# Locations of the two input AnnData files (N2 and CBCE).
n2_path = '../data/celegans_binyamin/N2_outs/concord_celN2_Jun12-1457.h5ad'
cbce_path = '../data/CE_CB/adata_cbce_Jan30-1028.h5ad'

# Read both datasets into memory.
n2_adata = sc.read_h5ad(n2_path)
cbce_adata = sc.read_h5ad(cbce_path)

In [None]:
# Set the obs columns on the N2 data (time, batch labels, species) before merging.
n2_adata.obs['embryo.time'] = n2_adata.obs['raw.embryo.time']
n2_adata.obs['batch'] = 'BZ_N2'
n2_adata.obs['batch_fine'] = 'BZ_N2'
n2_adata.obs['batch_broad'] = 'BZ_N2'
n2_adata.obs['species'] = 'C.elegans'
# Append a dataset tag to every N2 cell name.
n2_adata.obs_names = [f"{name}-BZ_N2" for name in n2_adata.obs_names]

# Keep the original CBCE batch as the fine label and use 'dataset3' as the broad one.
cbce_adata.obs['batch_fine'] = cbce_adata.obs['batch'].copy()
cbce_adata.obs['batch_broad'] = cbce_adata.obs['dataset3'].astype(str).copy()

# Merge both datasets; the new 'lab' column records which input each cell came from.
adata = cbce_adata.concatenate(n2_adata, batch_key='lab', batch_categories=['Murray_CBCE','Gartner_BZ'])
# Reset X to the stored counts layer before normalizing.
adata.X = adata.layers["counts"].copy()
# Compute basic statistics
sc.pp.calculate_qc_metrics(adata, inplace=True)
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

adata.obs['batch'] = adata.obs['batch_broad'].copy()
# flavor='seurat_v3' is documented to expect count data, but X has already been
# normalized and log-transformed above, so run the selection on the 'counts' layer
# (which normalize_total/log1p did not modify).
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=10000, subset=False, layer='counts')
sc.tl.pca(adata, n_comps=300, svd_solver='arpack', use_highly_variable=True)

# Build the output path once so the file written and the message always match.
combined_path = data_dir / f"{proj_name}_{file_suffix}.h5ad"
adata.write_h5ad(combined_path)  # Save the merged object with QC metrics, HVG flags and PCA
print(f"Saved adata to {combined_path}")

Saved adata to ../data/CBCEcombineN2/CBCEcombineN2_Jun28-1823.h5ad


In [None]:
# Save Packer dataset + N2 dataset separately
unique_batches = adata.obs['batch_broad'].unique()
print(f"Unique batches: {unique_batches}")

# Keep every batch whose name mentions Waterston, Murray or BZ_N2.
keep_tags = ('Waterston', 'Murray', 'BZ_N2')
filtered_batches = [name for name in unique_batches if any(tag in name for tag in keep_tags)]
print(f"Filtered batches: {filtered_batches}")

# Subset the cells belonging to those batches and report the resulting shape.
in_subset = adata.obs['batch_broad'].isin(filtered_batches)
adata_celsub = adata[in_subset].copy()
print(f"adata_celsub shape: {adata_celsub.shape}")

# Write the subset next to the combined file, using the same timestamp suffix.
celsub_path = data_dir / f"adata_celsub_{file_suffix}.h5ad"
adata_celsub.write_h5ad(celsub_path)
print(f"Saved adata_celsub to {celsub_path}")

Unique batches: ['Waterston_300_minutes', 'Waterston_400_minutes', 'Waterston_500_1_minutes', 'Waterston_500_2_minutes', 'Ce_M03D44_300_minutes', ..., 'batch_300', 'batch_360', 'batch_400', 'batch_500', 'BZ_N2']
Length: 22
Categories (22, object): ['BZ_N2', 'Ce_M03D44_300_minutes', 'Ce_M03D44_500_minutes', 'Ce_ceh9_300_minutes', ..., 'batch_300', 'batch_360', 'batch_400', 'batch_500']
Filtered batches: ['Waterston_300_minutes', 'Waterston_400_minutes', 'Waterston_500_1_minutes', 'Waterston_500_2_minutes', 'Murray_b01', 'Murray_b02', 'Murray_r17', 'BZ_N2']
adata_celsub shape: (94276, 13405)


### Run pipeline

In [4]:
# Reload the combined AnnData from a fixed, dated file; the path matches the
# "Saved adata to ..." message printed by the preprocessing cell's recorded run.
# NOTE(review): the date is hard-coded, so update it after re-running preprocessing.
adata = sc.read_h5ad("../data/CBCEcombineN2/CBCEcombineN2_Jun28-1823.h5ad")

In [5]:
# Project identifiers and run settings for the benchmarking pipeline.
proj_name = file_name = "CBCEcombineN2"
seed = 0
file_suffix = time.strftime('%b%d-%H%M')

# Per-project save and data folders, created if they do not exist yet.
save_dir = Path("../save") / proj_name
data_dir = Path("../data") / proj_name
for folder in (save_dir, data_dir):
    folder.mkdir(parents=True, exist_ok=True)


In [22]:
# Keep only the genes flagged as highly variable.
hvg_mask = adata.var['highly_variable']
adata = adata[:, hvg_mask].copy()

# Write the reduced object under a fixed (non-dated) name.
preprocessed_path = data_dir / f"{file_name}_preprocessed.h5ad"
adata.write_h5ad(preprocessed_path)
print(f"✅ Preprocessed data saved to {preprocessed_path}")

✅ Preprocessed data saved to ../data/CBCEcombineN2/CBCEcombineN2_preprocessed.h5ad


In [28]:
# Export the AnnData object as a VisCello project inside the project's data folder.
viscello_dir = data_dir / f"viscello_{proj_name}"
ccd.ul.anndata_to_viscello(
    adata,
    output_dir=viscello_dir,
    project_name=proj_name,
    organism='cel',
)

VisCello project created at ../data/CBCEcombineN2/viscello_CBCEcombineN2


In [5]:
# Reload the HVG-subset AnnData. The name is built from file_name, the same variable
# used when this file is written and when its name is passed to the job generator,
# so the three stay in sync even if proj_name and file_name ever differ.
adata = sc.read_h5ad(data_dir / f"{file_name}_preprocessed.h5ad")

#### Create jobs

In [6]:
# Sanity check: list any cells whose 'batch' label is missing.
missing_batch = adata.obs['batch'].isna()
if missing_batch.any():
    print("Rows with NaN in 'batch':")
    print(adata.obs[missing_batch])

In [7]:
# Keyword arguments for the Concord runs; tune the values below as needed.
concord_args = dict(
    batch_size=256,          # training batch size
    encoder_dims=[1000],     # encoder layer sizes; recommended to be larger than latent_dim
    element_mask_prob=0.4,   # masking probability; recommended range 0.2-0.5
    feature_mask_prob=0.2,   # masking probability; recommended range 0.2-0.5
                             # (presumably per-feature rather than per-element; confirm in Concord docs)
    clr_temperature=0.4,     # contrastive-loss temperature; recommended range 0.1-0.5
    sampler_knn=1000,        # neighborhood size for intra-neighborhood sampling
    n_epochs=15,             # number of training epochs
    save_dir=f"../{save_dir}",  # model/results folder, prefixed with one extra '../'
)

In [8]:
import json
import subprocess

# Generate one Python script plus one submission script per integration method
# by calling the external generator script.
py_methods = ["scvi", "harmony", "scanorama", "liger", "unintegrated", "concord_hcl", "concord_knn", "contrastive"]
output_dir = '../jobs'
device = 'auto'
conda_env = 'cellpath'
batch_key = 'batch'
state_key = 'None'
# Kept as a string because it is passed on the command line.
# NOTE(review): the earlier comment said this should match encoder_dims, but
# encoder_dims is [1000] in concord_args; confirm which value it is meant to track.
latent_dim = '300'

# check=True makes a failing generator raise CalledProcessError instead of being
# silently ignored, so no stale or partial job folder gets submitted by accident.
subprocess.run(
    [
        "python", "./generate_py_jobs.py",
        "--proj_name", proj_name,
        "--adata_filename", f"{file_name}_preprocessed.h5ad",
        "--methods", *py_methods,
        "--batch_key", batch_key,
        "--state_key", state_key,
        "--latent_dim", latent_dim,
        "--output_dir", output_dir,
        "--device", device,
        "--conda_env", conda_env,
        "--mem", "32G",  # Adjust memory as needed
        "--runtime", "1:00:00",
        "--concord_kwargs", json.dumps(concord_args),
        "--mode", "wynton",
    ],
    check=True,
)


✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_scvi.py
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_scvi.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_harmony.py
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_harmony.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_scanorama.py
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_scanorama.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_liger.py
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_liger.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_unintegrated.py
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_unintegrated.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_concord_hcl.py
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_concord_hcl.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_concord_knn.py
✅ Generated: benchmark_CBCEcombineN2/bench

CompletedProcess(args=['python', './generate_py_jobs.py', '--proj_name', 'CBCEcombineN2', '--adata_filename', 'CBCEcombineN2_preprocessed.h5ad', '--methods', 'scvi', 'harmony', 'scanorama', 'liger', 'unintegrated', 'concord_hcl', 'concord_knn', 'contrastive', '--batch_key', 'batch', '--state_key', 'None', '--latent_dim', '300', '--output_dir', '../jobs', '--device', 'auto', '--conda_env', 'cellpath', '--mem', '32G', '--runtime', '1:00:00', '--concord_kwargs', '{"batch_size": 256, "encoder_dims": [1000], "element_mask_prob": 0.4, "feature_mask_prob": 0.2, "clr_temperature": 0.4, "sampler_knn": 1000, "n_epochs": 15, "save_dir": "../../save/CBCEcombineN2"}', '--mode', 'wynton'], returncode=0)

In [27]:
# Write a helper shell script that submits every generated job script with qsub.
proj_folder = Path(output_dir) / f"benchmark_{proj_name}"   # ../jobs/benchmark_<proj>
# parents=True so this also succeeds when the jobs folder itself does not exist yet.
proj_folder.mkdir(parents=True, exist_ok=True)

submit_all = proj_folder / f"submit_all_{proj_name}.sh"
# The header below contains a non-ASCII dash; pin the encoding so writing does not
# depend on (or fail under) the platform's default locale encoding.
with submit_all.open("w", encoding="utf-8") as f:
    f.write("#!/bin/bash\n")
    f.write("# Auto-generated — submits every job for this project\n")
    f.write("# Run from this folder, or let the script cd into it.\n\n")
    f.write('cd "$(dirname "$0")"\n\n')          # run from the script's own folder
    # One qsub line per job script, in sorted order; the submit script's own name
    # does not match this pattern, so it is never listed.
    for sh_file in sorted(proj_folder.glob(f"benchmark_{proj_name}_*.sh")):
        f.write(f'qsub "{sh_file.name}"\n')

# Make the script executable.
submit_all.chmod(0o755)
print(f"📌  Run “{submit_all}” to queue every job.")

📌  Run “../jobs/benchmark_CBCEcombineN2/submit_all_CBCEcombineN2.sh” to queue every job.


In [None]:
# Generate script for Seurat
import subprocess

r_methods = ["seurat_cca", "seurat_rpca"]
output_dir = '../jobs'
device = 'auto'
conda_env = 'cellpath'
batch_key = 'batch'
state_key = 'None'
latent_dim = '300'  # kept as a string because it is passed on the command line

# check=True makes a failing generator raise CalledProcessError instead of being
# silently ignored.
subprocess.run(
    [
        "python", "./generate_seurat_script.py",
        "--proj_name", proj_name,
        "--eset_dir", '../'+ str(data_dir / f"viscello_{proj_name}"),   # <- folder w/ eset.rds
        "--methods", *r_methods,
        "--batch_key", batch_key,
        "--state_key", state_key,
        "--latent_dim", latent_dim,
        "--mem", "250G",  # Adjust memory as needed
        "--runtime", "72:00:00",
        "--output_dir", output_dir,
        "--device", device,
        "--conda_env", conda_env,
    ],
    check=True,
)

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_seurat_cca.R
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_seurat_cca.sh

✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_seurat_rpca.R
✅ Generated: benchmark_CBCEcombineN2/benchmark_CBCEcombineN2_seurat_rpca.sh



CompletedProcess(args=['python', './generate_seurat_script.py', '--proj_name', 'CBCEcombineN2', '--eset_dir', '../../data/CBCEcombineN2/viscello_CBCEcombineN2', '--methods', 'seurat_cca', 'seurat_rpca', '--batch_key', 'batch', '--state_key', 'None', '--latent_dim', '300', '--mem', '250G', '--runtime', '72:00:00', '--output_dir', '../jobs', '--device', 'auto', '--conda_env', 'cellpath'], returncode=0)

### Collect results