# Imports and files

In [8]:
import os
from datetime import datetime
from pathlib import Path

import pandas as pd

In [9]:
# Directories (all relative to the notebook's working directory).
data_path = Path("data")
analysis_path = Path("benchmark")
result_dir = Path("results")

# Input files read by the notebook.
stereo_seq_file = data_path.joinpath("StereoSeq", "Mouse_brain_Adult_GEM_bin1.tsv.gz")
de_genes_file = data_path.joinpath("ABC_atlas", "DE_genes.xlsx")
ficture_file = Path("FICTURE/minimal_nthread1/analysis/nF42.d_15").joinpath(
    "nF42.d_15.posterior.count.tsv.gz"
)

# Signature table and benchmark script, both next to the notebook.
signature_file = Path(".").joinpath("yao_brain_signatures_log.tsv")
script_file = Path(".").joinpath("sainsc_benchmark.py")

# Final table with the collected performance metrics.
performance_metric_file = result_dir.joinpath(
    "benchmark_sainsc_genes_vs_signatures.tsv"
)

In [10]:
# Create the benchmark working directory (including missing parents) if needed.
analysis_path.mkdir(parents=True, exist_ok=True)

In [11]:
# Thread count handed to the benchmark script via `--n_threads`.
n_threads = 8

In [12]:
# Name of the conda environment activated inside each submitted job.
conda_env = "sainsc"
# Path to conda's `activate` script; it is `source`d in the job command.
conda_path = "~/miniconda3/bin/activate"

In [13]:
# Shell snippet that activates the conda environment inside a job.
conda_cmd = f"source {conda_path} {conda_env}"

# Base invocation of the benchmark script; the script path is made absolute,
# and the per-run `--genes` option is appended by the cells below.
cmd = f"{script_file.resolve()} {stereo_seq_file} {signature_file} --n_threads {n_threads}"

In [8]:
# Benchmark restricted to the genes listed in the "DE_gene_list" sheet:
# write them one per line, then submit a single Slurm job using that list.
gene_file = analysis_path / "genes_8k.txt"

pd.read_excel(
    de_genes_file, sheet_name="DE_gene_list", header=None, names=["gene"]
).loc[:, "gene"].to_csv(gene_file, header=False, index=False)

cmd_subset = cmd + f" --genes {gene_file}"

# Request as many tasks as threads are passed to the script (`--n_threads`),
# so the allocation cannot silently drift from `n_threads`.
id_string = os.popen(
    f"sbatch -J sainsc_8k --mem=128G -n {n_threads} -N 1 "
    f"-o {analysis_path/'8k_genes_log.txt'} "
    "--time=48:00:00 "
    f'--wrap="{conda_cmd} && {cmd_subset}" '
).read()

# sbatch's stdout (the "Submitted batch job <id>" line).
print(id_string)

Submitted batch job 3769989



In [14]:
# Benchmark restricted to the genes in the "gene" column of the FICTURE
# output: write them one per line, then submit a single Slurm job.
gene_file = analysis_path / "genes_ficture.txt"

pd.read_table(ficture_file, usecols=["gene"]).to_csv(
    gene_file, header=False, index=False
)

cmd_subset = cmd + f" --genes {gene_file}"

# Request as many tasks as threads are passed to the script (`--n_threads`),
# so the allocation cannot silently drift from `n_threads`.
id_string = os.popen(
    f"sbatch -J sainsc_ficture --mem=128G -n {n_threads} -N 1 "
    f"-o {analysis_path/'ficture_genes_log.txt'} "
    "--time=48:00:00 "
    f'--wrap="{conda_cmd} && {cmd_subset}" '
).read()

# sbatch's stdout (the "Submitted batch job <id>" line).
print(id_string)

Submitted batch job 3774459



## How many genes from the signatures are used?

In [3]:
from sainsc import read_StereoSeq

In [4]:
# Load the Stereo-seq GEM file with sainsc's reader.
brain = read_StereoSeq(stereo_seq_file)

In [5]:
# Row index (first column) of the signature table, i.e. the signature genes.
signature_genes = pd.read_table(signature_file, index_col=0).index

In [6]:
# 8k ABC atlas DE genes: number of signature genes contained in the DE list
abc_de_genes = pd.read_excel(
    de_genes_file, sheet_name="DE_gene_list", header=None, names=["gene"]
)["gene"]
signature_genes.isin(abc_de_genes).sum()

7972

In [7]:
# Genes used by Ficture: number of signature genes contained in its gene column
ficture_genes = pd.read_table(ficture_file, usecols=["gene"])["gene"]
signature_genes.isin(ficture_genes).sum()

19708

# Scaling with #genes / #signatures

In [None]:
import random

from sainsc import read_StereoSeq

In [None]:
# Fix the seed so the subsets drawn with `random.sample` below are reproducible.
random.seed(42)

In [5]:
# Full signature table; the first column becomes the row index.
signatures = pd.read_table(signature_file, index_col=0)

In [7]:
# Load the Stereo-seq GEM file; `brain.genes` is used below to filter the signatures.
brain = read_StereoSeq(stereo_seq_file)

In [9]:
# Keep only the signature rows whose index entry occurs in `brain.genes`.
in_brain = signatures.index.isin(brain.genes)
signatures = signatures.loc[in_brain]

In [16]:
# sample gene sets: 250, 500, ..., 16000 genes drawn from the signature index,
# each written one gene per line to `genes_n<N>.txt`
gene_pool = signatures.index.tolist()
for n_genes in (250 * 2**exponent for exponent in range(7)):
    sampled_genes = random.sample(gene_pool, n_genes)
    (analysis_path / f"genes_n{n_genes}.txt").write_text("\n".join(sampled_genes))

In [17]:
# sample signatures: draw random column subsets of the signature table and
# store each subset as a tab-separated `sig_n<N>.tsv`
all_celltypes = signatures.columns.tolist()
for n_sig in (10, 15, 20, 30, 40):
    chosen_celltypes = random.sample(all_celltypes, n_sig)
    subset_file = analysis_path / f"sig_n{len(chosen_celltypes)}.tsv"
    signatures.loc[:, chosen_celltypes].to_csv(subset_file, sep="\t")

In [18]:
# Thread count handed to the benchmark script via `--n_threads`
# (repeats the value set near the top of the notebook).
n_threads = 8

In [22]:
# Conda environment and `activate` script used inside each submitted job
# (repeats the values set near the top of the notebook).
conda_env = "sainsc"
conda_path = "~/miniconda3/bin/activate"

In [23]:
# Shell snippet that sources conda's activate script for `conda_env`.
conda_cmd = f"source {conda_path} {conda_env}"

In [27]:
# Submit one Slurm job per (gene-set size, signature count) combination and
# remember sbatch's reply so the job ids can be parsed afterwards.
jobs = []

for i in range(7):
    n_genes = 250 * 2**i
    gene_file = analysis_path / f"genes_n{n_genes}.txt"

    for n_sig in [10, 15, 20, 30, 40]:
        # Loop-local names: the notebook-level `signature_file` and `cmd` must
        # not be overwritten, otherwise re-running earlier cells would pick up
        # the last subsampled signature file / command.
        sig_subset_file = analysis_path / f"sig_n{n_sig}.tsv"

        benchmark_cmd = (
            f"{script_file.resolve()} {stereo_seq_file} {sig_subset_file} "
            f"--n_threads {n_threads} "
            f"--genes {gene_file}"
        )

        # Request as many tasks as threads are passed to the script.
        id_string = os.popen(
            f"sbatch -J sainsc_benchmark --mem=64G -n {n_threads} -N 1 "
            f"-o {analysis_path/f'{n_genes}genes_{n_sig}sigs_log.txt'} "
            "--time=6:00:00 "
            f'--wrap="{conda_cmd} && {benchmark_cmd}" '
        ).read()

        jobs.append((id_string, n_genes, n_sig))

In [36]:
# Turn the collected (sbatch reply, n_genes, n_signatures) tuples into a table
# and parse the numeric job id from the end of each sbatch reply.
# Raw string: "\d" / "\s" are invalid escapes in a normal string literal.
# expand=False returns a Series, so a single column is assigned directly.
jobs = pd.DataFrame(jobs, columns=["jobid", "n_genes", "n_signatures"]).assign(
    jobid=lambda df: df["jobid"].str.extract(r"(\d+)\s$", expand=False).astype(int)
)

In [38]:
# Persist the job table (tab-separated, without the row index) in the benchmark directory.
jobs.to_csv(analysis_path / "sainsc_benchmark_jobs.tsv", sep="\t", index=False)

Wait until all jobs have finished successfully.

In [57]:
from io import StringIO

In [61]:
# Index the job table by Slurm job id so it can be joined with the sacct
# statistics. Guarded so that re-running the cell does not raise a KeyError
# once "jobid" has already been moved into the index.
if "jobid" in jobs.columns:
    jobs = jobs.set_index("jobid")

In [153]:
# Query Slurm accounting for all benchmark jobs; `-P` with a tab delimiter
# yields a table that pandas can parse directly.
sacct_output = os.popen(
    (
        "sacct "
        f"-j {','.join(jobs.index.astype(str))} "
        # Date and time must be separated by "T" (YYYY-MM-DDTHH:MM:SS).
        "--starttime 2024-07-10T15:30:00 "
        "--format='JobID,Jobname%50,TotalCPU,ElapsedRaw,MaxRSS' "
        "-P "
        # Python turns "\t" into a literal tab; plain single quotes pass it on
        # in any POSIX shell, whereas $'...' is a bash extension and os.popen
        # runs the command through /bin/sh.
        "--delimiter='\t' "
        "--units=M "
    )
).read()

# Read JobID as text: step rows look like "<id>.batch", and the cells below
# use `.str` accessors on this column regardless of which rows were returned.
job_stats = pd.read_table(StringIO(sacct_output), dtype={"JobID": str})

In [154]:
def _slurm_duration_to_seconds(durations: pd.Series) -> pd.Series:
    """Convert sacct duration strings to seconds.

    Accepts ``MM:SS[.fff]``, ``HH:MM:SS[.fff]`` and ``D-HH:MM:SS[.fff]``.
    The previous regex kept only the trailing ``HH:MM:SS`` part and therefore
    silently dropped the day component of CPU times above 24 hours.

    Parameters
    ----------
    durations
        String Series of durations as printed by sacct.

    Returns
    -------
    pd.Series
        Float seconds; NaN where the value is missing or does not match.
    """
    parts = durations.str.extract(
        r"^(?:(?P<days>\d+)-)?(?:(?P<hours>\d+):)?"
        r"(?P<minutes>\d+):(?P<seconds>\d+(?:\.\d+)?)$"
    ).astype(float)
    # Days and hours are optional; minutes and seconds must be present.
    return (
        parts["days"].fillna(0) * 86_400
        + parts["hours"].fillna(0) * 3_600
        + parts["minutes"] * 60
        + parts["seconds"]
    )


# CPU and wall time come from the job-level rows, i.e. JobIDs without a
# ".<step>" suffix.
cpu_stats = (
    job_stats.loc[
        lambda df: ~df["JobID"].str.contains(".", regex=False),
        ["JobID", "JobName", "TotalCPU", "ElapsedRaw"],
    ]
    .assign(
        JobID=lambda df: df["JobID"].astype(int),
        TotalCPU=lambda df: _slurm_duration_to_seconds(df["TotalCPU"]),
    )
    .set_index("JobID")
    .rename(columns={"TotalCPU": "CPU time [s]", "ElapsedRaw": "wall time [s]"})
)

In [155]:
# Peak memory is taken from the ".batch" step rows; the numeric part of MaxRSS
# is kept (sacct is called with --units=M, hence the MB column label).
# Raw strings avoid invalid "\d" escapes; expand=False yields a Series so a
# single column is assigned directly.
memory_stats = (
    job_stats.loc[
        lambda df: df["JobID"].str.contains(".batch", regex=False), ["JobID", "MaxRSS"]
    ]
    .assign(
        JobID=lambda df: df["JobID"].str.extract(r"(\d+)", expand=False).astype(int),
        MaxRSS=lambda df: df["MaxRSS"]
        .str.extract(r"([\d.]+)", expand=False)
        .astype(float),
    )
    .set_index("JobID")
    .rename(columns={"MaxRSS": "max memory [MB]"})
)

In [156]:
# Combine timing and memory per job id, then attach the benchmark parameters;
# the inner join keeps only jobs listed in `jobs`. JobName is not needed.
timing_and_memory = cpu_stats.drop(columns=["JobName"]).join(memory_stats)
stats = timing_and_memory.join(jobs, how="inner")

In [157]:
# `result_dir` is not created anywhere else in the notebook; make sure the
# output directory exists so the write does not fail on a fresh checkout.
performance_metric_file.parent.mkdir(parents=True, exist_ok=True)

# The JobID index is not written (index=False); only the metric columns and
# n_genes / n_signatures end up in the file.
stats.to_csv(performance_metric_file, sep="\t", index=False)