## PBMC atlas

Data source (see the paper's "Data availability" section): https://www.nature.com/articles/s41587-023-01881-x#data-availability

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell runs, so edits to local source files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports and session-wide settings used by the cells below.
# Standard library
import time
import warnings
from pathlib import Path

# Third-party
import concord as ccd
import scanpy as sc
import torch

# Suppress every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from pandas/scanpy.
warnings.filterwarnings('ignore')

# Use the first CUDA GPU when torch reports one, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
seed = 0

In [3]:
import time
from pathlib import Path

# Project identifiers: the project name doubles as the stem for every file written below.
proj_name = "pbmc_atac"
file_name = proj_name
seed = 0
# Timestamp such as 'Jun07-1959' (month abbreviation, day, hour, minute).
file_suffix = time.strftime('%b%d-%H%M')

# Per-project folders, one level above the notebook's working directory.
save_dir = Path("../save") / proj_name
data_dir = Path("../data") / proj_name

# Create both folders (and any missing parents); already-existing folders are fine.
save_dir.mkdir(parents=True, exist_ok=True)
data_dir.mkdir(parents=True, exist_ok=True)


In [4]:
# Load the processed fixed-cells consensus AnnData from the project data directory.
adata = sc.read_h5ad(data_dir / "adata_fixedcells_consensus_processed_Jun07-1959.h5ad")
# Display (n_obs, n_vars) as a quick sanity check of what was loaded.
adata.shape

(169227, 198421)

In [5]:
# Attach genotype-based sample calls to the matching cells in `adata.obs`.
import numpy as np
import pandas as pd

# Load genotype txt file (tab-separated).
genotype_res = pd.read_csv(
    data_dir / "genotype_concordance_unified.fixedcells_2_cistopic.txt", sep="\t"
)

# Extract the technology label: drop anything after 'FIXEDCELLS' in each index entry.
# NOTE(review): relies on the table's index holding string labels (not a default
# integer index) — confirm against the file layout.
genotype_res['technology'] = genotype_res.index.str.replace(r'FIXEDCELLS.*', 'FIXEDCELLS', regex=True)
# Build the key that is matched against adata.obs_names: "<BARCODE>___<technology>".
genotype_res['cell_name'] = genotype_res['BARCODE'] + "___" + genotype_res['technology']

# Check overlap between adata and genotype_res.
overlap = adata.obs_names.intersection(genotype_res['cell_name'])
print(f"Number of overlapping cells: {len(overlap)}")

# Assign called sample to adata.obs.
# The column is created with object dtype up front: starting from a float NaN column
# and then writing strings into it is an incompatible-dtype assignment that pandas
# has deprecated (and the notebook's blanket warning filter would hide the notice).
adata.obs['called_sample'] = pd.Series(np.nan, index=adata.obs_names, dtype=object)
adata.obs.loc[overlap, 'called_sample'] = genotype_res.set_index('cell_name').loc[overlap, 'sample'].values

# Cells without a genotype call stay NaN and are excluded from the counts shown here.
adata.obs['called_sample'].value_counts()

Number of overlapping cells: 107176


called_sample
sampleA    54256
sampleB    52920
Name: count, dtype: int64

In [6]:
# Flag the 10,000 most variable features (Seurat v3 flavor); `subset=False` only
# annotates adata.var and leaves `adata` itself at full width.
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=10000, subset=False)

# Work on a copy restricted to the flagged features, then keep only cells that pass
# the min_genes=10 threshold.
adata_filtered = ccd.ul.filter_cells_min_genes(
    adata[:, adata.var['highly_variable']].copy(),
    min_genes=10,
)
adata_filtered.write_h5ad(data_dir / f"{file_name}_preprocessed_HVG.h5ad")
print(f"✅ Preprocessed data saved to {data_dir / f'{file_name}_preprocessed_HVG.h5ad'}")

# Align the full-feature object to the cells that survived filtering and save it too.
# Note: this rebinds `adata`, so re-running the cell starts from the already-filtered object.
adata = adata[adata_filtered.obs_names, :].copy()
adata.write_h5ad(data_dir / f"{file_name}_preprocessed.h5ad")


ℹ️  Keeping cells with ≥10 expressed genes (168817/169227 kept, 410 dropped).
✅ Preprocessed data saved to ../data/pbmc_atac/pbmc_atac_preprocessed_HVG.h5ad


### Create jobs

In [4]:
# Reload the HVG-restricted file written by the preprocessing cell; this rebinds `adata`,
# so every cell below works on the HVG subset rather than the full-feature object.
adata = sc.read_h5ad(data_dir / f"{file_name}_preprocessed_HVG.h5ad")

In [9]:
import json
import subprocess

# Integration methods for which a Python job script (.py) and a submit script (.sh)
# are generated by ./generate_py_jobs.py.
py_methods = ["scvi", "harmony", "scanorama", "liger", "unintegrated", "concord_hcl", "concord_knn", "contrastive"]

# Every value is passed on the command line, hence strings throughout.
output_dir = '../jobs'
device = 'auto'  # rebinds `device` (a torch.device earlier in the notebook) to the CLI string
conda_env = 'cellpath'
batch_key = 'dataset'
state_key = 'None'
latent_dim = '50'  # Adjust as needed, but should match the encoder_dims in concord_args

subprocess.run(
    [
        "python", "./generate_py_jobs.py",
        "--proj_name", proj_name,
        "--adata_filename", f"{file_name}_preprocessed_HVG.h5ad",
        "--methods", *py_methods,
        "--batch_key", batch_key,
        "--state_key", state_key,
        "--latent_dim", latent_dim,
        "--output_dir", output_dir,
        "--device", device,
        "--mem", "32G",  # Adjust memory as needed
        "--conda_env", conda_env,
        "--runtime", "3:00:00",
        "--mode", "wynton",
    ],
    # Raise CalledProcessError on a non-zero exit instead of silently continuing;
    # otherwise the submit script below would be built from an incomplete set of jobs.
    check=True,
)


✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_scvi.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_scvi.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_harmony.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_harmony.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_scanorama.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_scanorama.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_liger.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_liger.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_unintegrated.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_unintegrated.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_concord_hcl.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_concord_hcl.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_concord_knn.py
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_concord_knn.sh

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_contrastive.py
✅ Generat

CompletedProcess(args=['python', './generate_py_jobs.py', '--proj_name', 'pbmc_atac', '--adata_filename', 'pbmc_atac_preprocessed_HVG.h5ad', '--methods', 'scvi', 'harmony', 'scanorama', 'liger', 'unintegrated', 'concord_hcl', 'concord_knn', 'contrastive', '--batch_key', 'dataset', '--state_key', 'None', '--latent_dim', '50', '--output_dir', '../jobs', '--device', 'auto', '--mem', '32G', '--conda_env', 'cellpath', '--runtime', '3:00:00', '--mode', 'wynton'], returncode=0)

In [6]:
# Write a single launcher script that submits every generated job for this project.
proj_folder = Path(output_dir) / f"benchmark_{proj_name}"   # ../jobs/benchmark_<proj>
# Defensive: also create missing parent folders, so this works even if ../jobs is absent.
proj_folder.mkdir(parents=True, exist_ok=True)

submit_all = proj_folder / f"submit_all_{proj_name}.sh"
# The header comment contains a non-ASCII dash, so the encoding is fixed to UTF-8
# rather than left to the platform/locale default.
with submit_all.open("w", encoding="utf-8") as f:
    f.write("#!/bin/bash\n")
    f.write("# Auto-generated — submits every job for this project\n")
    f.write("# Run from this folder, or let the script cd into it.\n\n")
    f.write('cd "$(dirname "$0")"\n\n')          # ensures we’re in the right dir
    # One qsub line per job script, in sorted order; the pattern does not match
    # the submit_all script itself.
    for sh_file in sorted(proj_folder.glob(f"benchmark_{proj_name}_*.sh")):
        f.write(f'qsub "{sh_file.name}"\n')

# Make the launcher executable (rwxr-xr-x).
submit_all.chmod(0o755)
print(f"📌  Run “{submit_all}” to queue every job.")

📌  Run “../jobs/benchmark_pbmc_atac/submit_all_pbmc_atac.sh” to queue every job.


In [8]:
# Export `adata` as a VisCello project under data_dir; the Seurat script generator
# further down is pointed at this folder through its --eset_dir argument.
# NOTE(review): 'hsa' presumably selects human as the organism — confirm in concord's docs.
ccd.ul.anndata_to_viscello(adata,
                        output_dir=data_dir / f"viscello_{proj_name}",
                        project_name=proj_name,
                        organism='hsa')

VisCello project created at ../data/pbmc_atac/viscello_pbmc_atac


In [10]:
# Generate script for Seurat
import subprocess

r_methods = ["seurat_rpca"]
output_dir = '../jobs'
device = 'auto'

subprocess.run(
    [
        "python", "./generate_seurat_script.py",
        "--proj_name", proj_name,
        # The path gets one extra '../' compared with the notebook-relative data_dir.
        # NOTE(review): presumably because the generated script runs from a subfolder
        # of the jobs directory — confirm against generate_seurat_script.py.
        "--eset_dir", '../' + str(data_dir / f"viscello_{proj_name}"),   # <- folder w/ eset.rds
        "--methods", *r_methods,
        "--batch_key", batch_key,
        "--state_key", state_key,
        "--latent_dim", latent_dim,
        "--mem", "200G",  # Adjust memory as needed
        "--runtime", "72:00:00",
        "--output_dir", output_dir,
        "--device", device,
        "--conda_env", conda_env,
    ],
    # Raise CalledProcessError on a non-zero exit instead of silently continuing
    # with a missing or partial Seurat job script.
    check=True,
)

✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_seurat_rpca.R
✅ Generated: benchmark_pbmc_atac/benchmark_pbmc_atac_seurat_rpca.sh



CompletedProcess(args=['python', './generate_seurat_script.py', '--proj_name', 'pbmc_atac', '--eset_dir', '../../data/pbmc_atac/viscello_pbmc_atac', '--methods', 'seurat_rpca', '--batch_key', 'dataset', '--state_key', 'None', '--latent_dim', '50', '--mem', '200G', '--runtime', '72:00:00', '--output_dir', '../jobs', '--device', 'auto', '--conda_env', 'cellpath'], returncode=0)

In [11]:
# Rewrite the launcher script so it lists every job script currently in the project
# folder, including any generated since the launcher was last written.
proj_folder = Path(output_dir) / f"benchmark_{proj_name}"   # ../jobs/benchmark_<proj>
# Defensive: also create missing parent folders, so this works even if ../jobs is absent.
proj_folder.mkdir(parents=True, exist_ok=True)

submit_all = proj_folder / f"submit_all_{proj_name}.sh"
# The header comment contains a non-ASCII dash, so the encoding is fixed to UTF-8
# rather than left to the platform/locale default.
with submit_all.open("w", encoding="utf-8") as f:
    f.write("#!/bin/bash\n")
    f.write("# Auto-generated — submits every job for this project\n")
    f.write("# Run from this folder, or let the script cd into it.\n\n")
    f.write('cd "$(dirname "$0")"\n\n')          # ensures we’re in the right dir
    # One qsub line per job script, in sorted order; the pattern does not match
    # the submit_all script itself.
    for sh_file in sorted(proj_folder.glob(f"benchmark_{proj_name}_*.sh")):
        f.write(f'qsub "{sh_file.name}"\n')

# Make the launcher executable (rwxr-xr-x).
submit_all.chmod(0o755)
print(f"📌  Run “{submit_all}” to queue every job.")

📌  Run “../jobs/benchmark_pbmc_atac/submit_all_pbmc_atac.sh” to queue every job.
