In [1]:
import os
import shlex
import subprocess
from datetime import datetime
from pathlib import Path

import anndata as ad
import scanpy as sc

In [2]:
# ---- inputs ----
# All input files live below the relative "data" directory.
data_path = Path("data")
stereo_seq_file = data_path.joinpath("StereoSeq", "Mouse_brain_Adult_GEM_bin1.tsv.gz")
training_data = data_path.joinpath("Yao_subsampled.h5ad")
# TopACT script, expected next to the notebook (relative to the working directory).
script_file = Path(".").joinpath("TopACT.py")

# number of processes
n_proc = 8

# ---- outputs ----
# Root directory for everything this notebook writes.
topact_path = Path("TopACT")
training_path = topact_path.joinpath("training.h5ad")

In [3]:
# Create the output directory (including missing parents); an already
# existing directory is not an error, so the cell can be re-run safely.
topact_path.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the single-cell reference that will be used as training data.
sc_adata = ad.read_h5ad(training_data)

# Cap every "subclass_label" category at this many cells.
target_cells = 1_000

labels = sc_adata.obs["subclass_label"]
subsampled = []
for ct in labels.cat.categories:
    # All cells carrying the current label.
    subset = sc_adata[labels == ct]
    if subset.n_obs > target_cells:
        # Modifies `subset` in place (the return value is not used); the fixed
        # seed keeps the selection reproducible across runs.
        sc.pp.subsample(subset, n_obs=target_cells, random_state=42)
    subsampled.append(subset)

# Stitch the per-label subsets back into a single object and persist it
# for the TopACT script.
sc_adata = ad.concat(subsampled)
sc_adata.write_h5ad(training_path)

print(sc_adata)

AnnData object with n_obs × n_vars = 37980 × 31053
    obs: 'supertype_label', 'subclass_label'


In [4]:
# Conda environment to activate, and the location of conda's `activate` script.
conda_path = "~/miniconda3/bin/activate"
conda_env = "topact"

# Shell snippet that activates the environment: `source <activate> <env>`.
conda_cmd = " ".join(["source", conda_path, conda_env])

In [5]:
# Partition option (`-p <name>`) that is added to the sbatch invocation below.
partition = "-p compute-96cpu-700GB-RAM"

In [7]:
# Submit the TopACT run as a single Slurm batch job. Results and the job log
# are written below `out_path`.
out_path = topact_path / "analysis"
out_path.mkdir(parents=True, exist_ok=True)

# Command line executed inside the job. Each path is shell-quoted because the
# `--wrap` text is run by a shell on the compute node; unquoted paths with
# spaces or metacharacters would be split or interpreted there.
cmd = " ".join(
    [
        shlex.quote(str(script_file.resolve())),
        shlex.quote(str(training_path)),
        shlex.quote(str(stereo_seq_file)),
        shlex.quote(str(out_path)),
        "--n_processes",
        str(n_proc),
    ]
)

# sbatch is invoked with an argument list (no intermediate shell), so nothing
# here is subject to word splitting or expansion at submission time.
sbatch_args = [
    "sbatch",
    "-J", "TopACT",
    "--mem=128G",
    "-n", str(n_proc),
    "-N", "1",
    "-o", str(out_path / "log.txt"),
    "--time=7-00:00:00",
    "--exclusive",
    # `partition` holds both the flag and its value in one string.
    *shlex.split(partition),
    # Activate the conda environment first, then start the script.
    f"--wrap={conda_cmd} && {cmd}",
]

submission = subprocess.run(
    sbatch_args,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
)

# Fail loudly instead of printing an empty string when the submission is
# rejected; sbatch reports the reason on stderr.
if submission.returncode != 0:
    raise RuntimeError(
        f"sbatch failed with exit code {submission.returncode}: "
        f"{submission.stderr.strip()}"
    )

# sbatch's stdout, e.g. "Submitted batch job <id>".
id_string = submission.stdout

print(id_string)

Submitted batch job 3791297

