# Slurm Job Submission

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>).
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 20.03.2023
- **Date of Last Modification:** 13.06.2023

## 1. Setup

### 1.1 Import Libraries

In [1]:
import os

### 1.2 Define Parameters

In [2]:
conda_env_name = "nichecompass"

### 1.3 Define Functions

In [3]:
def submit_python_script(
        job_name_prefix,
        job_id,
        job_folder_path,
        conda_env_name,
        script_folder_path,
        script_name,
        script_args,
        nice=10000):
    """
    Write a Slurm batch file for a Python script and submit it with `sbatch`.

    The batch file requests a fixed set of resources (partition `gpu_p`,
    QOS `gpu`, 1 GPU, 6 CPUs, 64GB memory, 48 hours), activates the given
    conda environment, changes into `script_folder_path` and runs
    `python ../<script_name><script_args>`.

    Parameters
    ----------
    job_name_prefix:
        Prefix of the job name; the job name is `<job_name_prefix>_<job_id>`.
    job_id:
        Identifier appended to the prefix to form the job name.
    job_folder_path:
        Folder for the job files. Logs go to `<job_folder_path>/logs`; the
        batch file itself is written to this path with every `/aih`
        occurrence removed.
    conda_env_name:
        Name of the conda environment activated inside the job.
    script_folder_path:
        Folder the job changes into before running the script.
    script_name:
        File name of the script; it is invoked as `../<script_name>`, i.e.
        relative to the parent of `script_folder_path`.
    script_args:
        Argument string appended verbatim (no separator is added) after the
        script name, so it is expected to start with a space.
    nice:
        Value written to the `#SBATCH --nice` directive.
    """
    import shlex  # function-scope import so the cell does not need new imports

    job_name = f"{job_name_prefix}_{job_id}"
    # Account for fact that submit node has different home path than compute node
    job_file_path = f"{job_folder_path.replace('/aih', '')}/job_{job_name}.cmd"
    out_file_path = f"{job_folder_path}/logs/out_{job_name}.txt"
    err_file_path = f"{job_folder_path}/logs/err_{job_name}.txt"

    os.makedirs(job_folder_path + "/logs", exist_ok=True)
    # The batch file lives under the '/aih'-stripped path, which is a different
    # directory whenever the replacement changed something; make sure it exists
    # so that opening the file below cannot fail with a missing directory.
    os.makedirs(os.path.dirname(job_file_path), exist_ok=True)

    job_lines = [
        "#!/bin/bash\n",
        f"#SBATCH -J {job_name}\n",
        f"#SBATCH -o {out_file_path}\n",
        f"#SBATCH -e {err_file_path}\n",
        "#SBATCH -t 48:00:00\n",
        "#SBATCH -p gpu_p\n",
        "#SBATCH -c 6\n",
        "#SBATCH --gres=gpu:1\n",
        "#SBATCH --qos=gpu\n",
        "#SBATCH --mem=64GB\n",
        f"#SBATCH --nice={nice}\n",
        "source $HOME/.bashrc\n",
        f"conda activate {conda_env_name}\n",
        "cd /\n",
        f"cd {script_folder_path}\n",
        f"python ../{script_name}{script_args}\n",
    ]
    with open(job_file_path, "w") as handle:
        handle.writelines(job_lines)

    # Quote the path so whitespace or shell metacharacters in it cannot split
    # or alter the submitted command.
    os.system(f"sbatch {shlex.quote(job_file_path)}")

## 2. NicheCompass Reference Model Training

### 2.1 seqFISH Mouse Organogenesis Imputed

In [None]:
# Job configuration for the seqFISH mouse organogenesis (imputed) dataset.
# NOTE(review): this cell passes `--mebocost_species` and
# `--contrastive_logits_ratio`, while other cells in this notebook pass
# `--species` and `--contrastive_logits_pos_ratio` /
# `--contrastive_logits_neg_ratio` to the same script; confirm which flag
# names the script currently accepts.
task = "reference"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
job_id = 1

# Values interpolated into the argument string below.
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
n_neighbors = 12
n_hvg = 4000
node_label_method = "one-hop-norm"
edge_batch_size = "1024"

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --filter_genes"
    f" --n_hvg {n_hvg}"
    " --nichenet_keep_target_genes_ratio 0.01"
    " --nichenet_max_n_target_genes_per_gp 1000"
    " --include_mebocost_gps"
    f" --mebocost_species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    " --cond_embed_injection encoder gene_expr_decoder"
    " --n_cond_embed 20"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    " --lambda_cond_contrastive 0."
    " --contrastive_logits_ratio 0."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 5."
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 2.2 STARmap PLUS Mouse Central Nervous System

In [52]:
# Job configuration for the STARmap PLUS mouse CNS dataset (job_id 1).
task = "reference"
dataset = "starmap_plus_mouse_cns"
species = "mouse"
job_id = 1

# Full list of 20 batches ...
reference_batches = (
    "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
    " batch9 batch10 batch11 batch12 batch13 batch14 batch15 batch16 batch17 batch18"
    " batch19 batch20"
)
# ... which is immediately overridden: only these three batches are passed on.
reference_batches = "batch1 batch2 batch3"

# Values interpolated into the argument string below.
n_neighbors = 12 # 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 2048 # 2048
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 3 # 3, 20, None
lambda_cond_contrastive = 0. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0. # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cond_embed_injection {cond_embed_injection}"
    f" --n_cond_embed {n_cond_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12208376


In [36]:
# Job configuration for the STARmap PLUS mouse CNS dataset (job_id 3),
# passing all 20 batches as reference batches.
task = "reference"
dataset = "starmap_plus_mouse_cns"
species = "mouse"
job_id = 3

reference_batches = (
    "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
    " batch9 batch10 batch11 batch12 batch13 batch14 batch15 batch16 batch17 batch18"
    " batch19 batch20"
)

# Values interpolated into the argument string below.
n_neighbors = 12 # 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 1024 # 2048
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 20 # 3, 20, None
lambda_cond_contrastive = 250000. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.03125 # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cond_embed_injection {cond_embed_injection}"
    f" --n_cond_embed {n_cond_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12225823


### 2.3 NanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

In [11]:
# Job configuration for the NanoString CosMx human NSCLC dataset, including
# the `fov` categorical covariate.
# NOTE(review): other cells in this notebook use the same dataset,
# node_label_method, task and job_id = 1, which produces identical job and
# log file names; confirm that overwriting them is intended.
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
job_id = 1

# Full list of 8 batches ...
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
# ... which is immediately overridden: only these three batches are passed on.
reference_batches = "batch1 batch2 batch3"

# Values interpolated into the argument string below.
n_neighbors = 4 # 4, 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 4096 # 4096, 2048, 512
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 3 # 3, 8
cat_covariates_keys = "fov"
num_cat_covariates_embed = "20"
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    f" --cat_covariates_keys {cat_covariates_keys}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cond_embed_injection {cond_embed_injection}"
    f" --n_cond_embed {n_cond_embed}"
    f" --num_cat_covariates_embed {num_cat_covariates_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12223274


In [10]:
# Job configuration for the NanoString CosMx human NSCLC dataset with all
# 8 batches and a non-zero conditional contrastive weight.
# NOTE(review): other cells in this notebook use the same dataset,
# node_label_method, task and job_id = 1, which produces identical job and
# log file names; confirm that overwriting them is intended.
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
job_id = 1

# Values interpolated into the argument string below.
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
n_neighbors = 4 # 4, 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 4096 # 4096, 2048, 512
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 8
lambda_cond_contrastive = 100000.
contrastive_logits_pos_ratio = 0.125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cond_embed_injection {cond_embed_injection}"
    f" --n_cond_embed {n_cond_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12223181


In [None]:
# Job configuration for the NanoString CosMx human NSCLC dataset with
# 12 neighbors and all 8 batches.
# NOTE(review): this cell passes `--mebocost_species` and
# `--contrastive_logits_ratio`, while other cells in this notebook pass
# `--species` and `--contrastive_logits_pos_ratio` /
# `--contrastive_logits_neg_ratio` to the same script; confirm which flag
# names the script currently accepts.
# NOTE(review): other cells in this notebook use the same dataset,
# node_label_method, task and job_id = 1, which produces identical job and
# log file names; confirm that overwriting them is intended.
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
job_id = 1

# Values interpolated into the argument string below.
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"
n_neighbors = 12
node_label_method = "one-hop-norm"
edge_batch_size = "1024"

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 0.01"
    " --nichenet_max_n_target_genes_per_gp 1000"
    " --include_mebocost_gps"
    f" --mebocost_species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    " --cond_embed_injection encoder gene_expr_decoder"
    " --n_cond_embed 8"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    " --lambda_cond_contrastive 0."
    " --contrastive_logits_ratio 0."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 5."
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 2.4 Vizgen MERFISH Human Ovarian Cancer

In [4]:
# Job configuration for the Vizgen MERFISH human ovarian cancer dataset
# (job_id 3).
task = "reference"
dataset = "vizgen_merfish_human_ovarian_cancer"
species = "human"
job_id = 3

# Values interpolated into the argument string below.
reference_batches = "batch1 batch2 batch3 batch4"
n_neighbors = 4 # 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 4096 # 2048
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 4 # 3, 20, None
lambda_cond_contrastive = 250000. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.125 # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cond_embed_injection {cond_embed_injection}"
    f" --n_cond_embed {n_cond_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12222172


### 2.4 Vizgen MERFISH Human Lung Cancer

In [5]:
# Job configuration for the Vizgen MERFISH human lung cancer dataset
# (job_id 1).
task = "reference"
dataset = "vizgen_merfish_human_lung_cancer"
species = "human"
job_id = 1

# Values interpolated into the argument string below.
reference_batches = "batch1 batch2"
n_neighbors = 4 # 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 4096 # 2048
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 2 # 3, 20, None
lambda_cond_contrastive = 250000. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.125 # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cond_embed_injection {cond_embed_injection}"
    f" --n_cond_embed {n_cond_embed}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12222718


### 2.5 Spatial ATAC-RNA-Seq Mouse Embryo & Brain

In [None]:
# Job configuration for the spatial ATAC-RNA-seq mouse brain dataset; the
# string "None" is passed as the reference batches value.
# NOTE(review): this cell passes `--mebocost_species` and
# `--contrastive_logits_ratio`, while other cells in this notebook pass
# `--species` and `--contrastive_logits_pos_ratio` /
# `--contrastive_logits_neg_ratio` to the same script; confirm which flag
# names the script currently accepts.
task = "reference"
dataset = "spatial_atac_rna_seq_mouse_brain"
species = "mouse"
job_id = 1

# Values interpolated into the argument string below.
reference_batches = "None"
n_neighbors = 12
node_label_method = "one-hop-norm"
edge_batch_size = "4096"

# Job name and file locations.
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# One argument string; every piece starts with a space because the string is
# appended directly after the script name.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --filter_genes"
    " --n_hvg 3000"
    " --nichenet_keep_target_genes_ratio 0.01"
    " --nichenet_max_n_target_genes_per_gp 1000"
    " --include_mebocost_gps"
    f" --mebocost_species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    " --include_atac_modality"
    " --filter_peaks"
    " --min_cell_peak_thresh_ratio 0.0005"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    " --cond_embed_injection encoder gene_expr_decoder"
    " --n_cond_embed 4"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    " --lambda_chrom_access_recon 100."
    " --lambda_cond_contrastive 0."
    " --contrastive_logits_ratio 0."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 5."
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Write the batch file and hand it to sbatch.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 2.6 Xenium Human Breast Cancer

In [9]:
# Reference-model training job for the Xenium human breast cancer dataset.
task = "reference"
dataset = "xenium_human_breast_cancer"
species = "human"
reference_batches = "batch1 batch2"
n_neighbors = 4 # 4, 8, 12
node_label_method = "one-hop-attention"
edge_batch_size = 4096 # 4096 (4, 8), 1024 (12)
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 2
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0. # 0., 5., 10.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cond_embed_injection {cond_embed_injection}",
    f" --n_cond_embed {n_cond_embed}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12222799


## 3. NicheCompass Query Mapping on Reference Model

### 3.1 seqFISH Mouse Organogenesis Imputed

In [22]:
# Reference-model training job for the imputed seqFISH mouse organogenesis dataset.
task = "reference"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
reference_batches = "batch1 batch2 batch3 batch4"
n_neighbors = 12
n_hvg = 3000
node_label_method = "one-hop-attention"
edge_batch_size = 1024
cond_embed_injection = "encoder gene_expr_decoder"
n_cond_embed = 4
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --filter_genes",
    f" --n_hvg {n_hvg}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cond_embed_injection {cond_embed_injection}",
    f" --n_cond_embed {n_cond_embed}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12224333


In [None]:
# Query-mapping job: runs map_query_on_nichecompass_reference_model.py for the
# NanoString CosMx human liver dataset.
job_id = 1
job_name_prefix = "nichecompass_nanostring_cosmx_human_liver_query"
job_folder_path = "/home/aih/sebastian.birk/workspace/projects/nichecompass-repro-new/slurm_jobs"
script_folder_path = "/home/aih/sebastian.birk/workspace/projects/nichecompass-repro-new/scripts"
script_name = "map_query_on_nichecompass_reference_model.py"

# NOTE(review): this rebinds the notebook-wide `conda_env_name`, so any cell run
# afterwards also submits with "nichecompass_hpc" - confirm this is intended.
conda_env_name = "nichecompass_hpc"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    " --dataset nanostring_cosmx_human_liver",
    " --query_batches sample2",
    " --reference_batch sample1",
    " --load_timestamp 10032023_145839",
    " --nichenet_max_n_target_genes_per_gp=20000",
    " --n_epochs=40",
    " --n_epochs_all_gps=0",
    " --lambda_group_lasso=0.",
    " --lambda_l1_masked=0.",
    " --edge_batch_size=256",
    " --node_batch_size=32",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

## 4. NicheCompass Single Sample Method Benchmarking Models Training

In [3]:
import scanpy as sc

In [13]:
# Load the h5ad results file named below into an AnnData object for inspection.
adata = sc.read_h5ad("../datasets/srt_data/results/seqfish_mouse_organogenesis_batch2_nichecompass_one-hop-norm_single_sample_method_benchmarking.h5ad")

In [14]:
# Bare expression so the notebook displays the AnnData summary as cell output.
adata

AnnData object with n_obs × n_vars = 7656 × 351
    obs: 'cell_type', 'batch'
    uns: 'nichecompass_latent_run10_umap', 'nichecompass_latent_run1_umap', 'nichecompass_latent_run2_umap', 'nichecompass_latent_run3_umap', 'nichecompass_latent_run4_umap', 'nichecompass_latent_run5_umap', 'nichecompass_latent_run6_umap', 'nichecompass_latent_run7_umap', 'nichecompass_latent_run8_umap', 'nichecompass_latent_run9_umap', 'nichecompass_model_training_duration_run1', 'nichecompass_model_training_duration_run10', 'nichecompass_model_training_duration_run2', 'nichecompass_model_training_duration_run3', 'nichecompass_model_training_duration_run4', 'nichecompass_model_training_duration_run5', 'nichecompass_model_training_duration_run6', 'nichecompass_model_training_duration_run7', 'nichecompass_model_training_duration_run8', 'nichecompass_model_training_duration_run9'
    obsm: 'nichecompass_latent_run1', 'nichecompass_latent_run10', 'nichecompass_latent_run10_X_umap', 'nichecompass_latent_run1_X_u

### 4.1 seqFISH Mouse Organogenesis

#### 4.1.1 Spatial Transcriptomics Data

In [51]:
# Single-sample benchmarking job for seqFISH mouse organogenesis (embryo 2).
# The list-valued flags below each carry ten entries, one per run index.
task = "single_sample_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_embryo2"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12227832


#### 4.1.2 Spatial Transcriptomics Data Subsamples

In [53]:
# Single-sample benchmarking jobs for the seqFISH mouse organogenesis
# subsamples: one Slurm job is submitted per subsample percentage.

# Settings shared by every subsample job (loop-invariant, so set once).
task = "single_sample_method_benchmarking"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
# Defined in this cell so it no longer depends on values left behind by a
# previously executed cell (on a fresh kernel the f-strings below would
# otherwise raise NameError). 0. matches the other benchmarking cells.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # Only the dataset name (and with it the job name) changes per iteration.
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct_embryo2"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Every fragment starts with a space because submit_python_script writes
    # the joined string into the job file directly after the script name.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None None None",
        " --seeds 0 1 2 3 4 5 6 7 8 9",
        " --run_index 1 2 3 4 5 6 7 8 9 10",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --condition_key batch",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {node_label_method}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.05",
        " --gene_expr_recon_dist nb",
        " --cond_embed_injection encoder gene_expr_decoder",
        " --n_cond_embed None",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cond_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 500000.",
        " --lambda_gene_expr_recon 300.",
        f" --lambda_cond_contrastive {lambda_cond_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
    ])

    # Write the Slurm job file for this subsample and hand it to the scheduler.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 12228081
Submitted batch job 12228082
Submitted batch job 12228083
Submitted batch job 12228084
Submitted batch job 12228085


### 4.2 STARmap PLUS Mouse Central Nervous System

#### 4.2.1 Spatial Transcriptomics Data

In [19]:
# Single-sample benchmarking job for STARmap PLUS mouse CNS (batch 1).
# The list-valued flags below each carry ten entries, one per run index.
task = "single_sample_method_benchmarking"
dataset = "starmap_plus_mouse_cns_batch1"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12084714


In [9]:
# Scratch arithmetic: doubles 32768; the notebook shows the result as cell output.
32768 * 2

65536

In [17]:
# Single benchmarking run for STARmap PLUS mouse CNS (batch 1): one seed, one
# run index, n_neighbors 4 and an edge batch size of 32768.
task = "single_sample_method_benchmarking"
dataset = "starmap_plus_mouse_cns_batch1"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "32768" # 16384 is full dataset
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
# NOTE(review): prefix and job_id match the ten-run STARmap PLUS cell, so the
# generated job script and log file names coincide - confirm this is intended.
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 0",
    " --run_index 1",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12234475


In [18]:
# Single benchmarking run for STARmap PLUS mouse CNS (batch 1): one seed, one
# run index, n_neighbors 8 and an edge batch size of 32768.
task = "single_sample_method_benchmarking"
dataset = "starmap_plus_mouse_cns_batch1"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "32768" # 16384 is full dataset
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 8",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 0",
    " --run_index 1",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12234479


In [16]:
# Single benchmarking run for STARmap PLUS mouse CNS (batch 1): one seed, one
# run index, n_neighbors 12 and an edge batch size of 32768.
task = "single_sample_method_benchmarking"
dataset = "starmap_plus_mouse_cns_batch1"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "32768" # 16384 is full dataset
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm bookkeeping: job name parts, job-file folder and the script to run.
job_id = 3
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment starts with a space because submit_python_script writes the
# joined string into the job file directly after the script name.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 12",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 0",
    " --run_index 1",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

# Write the Slurm job file for this configuration and hand it to the scheduler.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12233880


#### 4.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Submit one NicheCompass single-sample benchmarking job per subsampled
# STARmap PLUS mouse CNS dataset (batch 1). Every setting that is identical
# for all subsamples is defined once before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "Main_molecular_cell_type"
species = "mouse"
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
# Defined in this cell so that it neither raises a NameError on a fresh kernel
# nor silently inherits values from whichever other cell was executed last.
# 0. is the value used by the full-dataset cells of this notebook.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # Only the dataset name (and everything derived from it) changes per job.
    dataset = f"starmap_plus_mouse_cns_subsample_{subsample_pct}pct_batch1"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Adjacent string literals are concatenated; each piece starts with a
    # space so that the result can be appended directly to the script name.
    script_args = (
        " --adata_new_name None"
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None None None"
        " --seeds 0 1 2 3 4 5 6 7 8 9"
        " --run_index 1 2 3 4 5 6 7 8 9 10"
        f" --cell_type_key {cell_type_key}"
        " --nichenet_keep_target_genes_ratio 1."
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --condition_key batch"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        f" --model_label {node_label_method}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --active_gp_thresh_ratio 0.05"
        " --gene_expr_recon_dist nb"
        " --cond_embed_injection encoder gene_expr_decoder"
        " --n_cond_embed None"
        " --log_variational"
        f" --node_label_method {node_label_method}"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        " --conv_layer_encoder gcnconv"
        " --n_epochs 100"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cond_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 500000."
        " --lambda_gene_expr_recon 300."
        f" --lambda_cond_contrastive {lambda_cond_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

### 4.3 Slide-seqV2 Mouse Hippocampus

#### 4.3.1 Spatial Transcriptomics Data

In [None]:
# Slide-seqV2 mouse hippocampus (full dataset): assemble the command line for
# the NicheCompass benchmarking training script and submit it as a Slurm job.
task = "single_sample_method_benchmarking"
dataset = "slideseqv2_mouse_hippocampus"
cell_type_key = "cell_type"
species = "mouse"
node_label_method = "one-hop-norm"  # options: one-hop-norm, one-hop-attention
# One edge batch size per run (ten runs); 16384 is full dataset.
edge_batch_size_str = " ".join(["16384"] * 10)
# All contrastive settings are passed to the script as zero.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/"
    + task
)
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment carries its own leading space, so joining on the empty string
# yields a string that can be appended directly to the script name.
script_args = "".join([
    # Run configuration (one list entry per run).
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    # Gene program settings.
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    # Dataset and AnnData keys.
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    # Model settings.
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    # Training settings.
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 4.3.2 Spatial Transcriptomics Data Subsamples

In [None]:
# Submit one NicheCompass single-sample benchmarking job per subsampled
# Slide-seqV2 mouse hippocampus dataset. Every setting that is identical for
# all subsamples is defined once before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
species = "mouse"
# NOTE(review): the full-dataset cell uses "one-hop-norm" — confirm that the
# different node label method for the subsamples is intended.
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
# Defined in this cell so that it neither raises a NameError on a fresh kernel
# nor silently inherits values from whichever other cell was executed last.
# 0. is the value used by the full-dataset cells of this notebook.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # Only the dataset name (and everything derived from it) changes per job.
    dataset = f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Adjacent string literals are concatenated; each piece starts with a
    # space so that the result can be appended directly to the script name.
    script_args = (
        " --adata_new_name None"
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None None None"
        " --seeds 0 1 2 3 4 5 6 7 8 9"
        " --run_index 1 2 3 4 5 6 7 8 9 10"
        f" --cell_type_key {cell_type_key}"
        " --nichenet_keep_target_genes_ratio 1."
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --condition_key batch"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        f" --model_label {node_label_method}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --active_gp_thresh_ratio 0.05"
        " --gene_expr_recon_dist nb"
        " --cond_embed_injection encoder gene_expr_decoder"
        " --n_cond_embed None"
        " --log_variational"
        f" --node_label_method {node_label_method}"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        " --conv_layer_encoder gcnconv"
        " --n_epochs 100"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cond_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 500000."
        " --lambda_gene_expr_recon 300."
        f" --lambda_cond_contrastive {lambda_cond_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

### 4.4 Vizgen MERFISH Mouse Liver

#### 4.4.1 Spatial Transcriptomics Data

In [51]:
# Vizgen MERFISH mouse liver (full dataset): assemble the command line for the
# NicheCompass benchmarking training script and submit it as a Slurm job.
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
cell_type_key = "Cell_Type"
species = "mouse"
node_label_method = "one-hop-norm"  # options: one-hop-norm, one-hop-attention
# One edge batch size per run (ten runs); 16384 is full dataset.
edge_batch_size_str = " ".join(["16384"] * 10)
# All contrastive settings are passed to the script as zero.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/"
    + task
)
script_name = "train_nichecompass_benchmarking_models.py"

# Every fragment carries its own leading space, so joining on the empty string
# yields a string that can be appended directly to the script name.
script_args = "".join([
    # Run configuration (one list entry per run).
    " --adata_new_name None",
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None None None",
    " --seeds 0 1 2 3 4 5 6 7 8 9",
    " --run_index 1 2 3 4 5 6 7 8 9 10",
    f" --cell_type_key {cell_type_key}",
    # Gene program settings.
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    # Dataset and AnnData keys.
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --condition_key batch",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    # Model settings.
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    " --cond_embed_injection encoder gene_expr_decoder",
    " --n_cond_embed None",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    # Training settings.
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cond_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cond_contrastive {lambda_cond_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 12227832


#### 4.4.2 Spatial Transcriptomics Data Subsamples

In [53]:
# Submit one NicheCompass single-sample benchmarking job per subsampled
# Vizgen MERFISH mouse liver dataset. Every setting that is identical for all
# subsamples is defined once before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
# NOTE(review): the full-dataset cell uses "one-hop-norm" — confirm that the
# different node label method for the subsamples is intended.
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
# Defined in this cell so that it neither raises a NameError on a fresh kernel
# nor silently inherits values from whichever other cell was executed last.
# 0. is the value used by the full-dataset cells of this notebook.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # Only the dataset name (and everything derived from it) changes per job.
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Adjacent string literals are concatenated; each piece starts with a
    # space so that the result can be appended directly to the script name.
    script_args = (
        " --adata_new_name None"
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None None None"
        " --seeds 0 1 2 3 4 5 6 7 8 9"
        " --run_index 1 2 3 4 5 6 7 8 9 10"
        f" --cell_type_key {cell_type_key}"
        " --nichenet_keep_target_genes_ratio 1."
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --condition_key batch"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        f" --model_label {node_label_method}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --active_gp_thresh_ratio 0.05"
        " --gene_expr_recon_dist nb"
        " --cond_embed_injection encoder gene_expr_decoder"
        " --n_cond_embed None"
        " --log_variational"
        f" --node_label_method {node_label_method}"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        " --conv_layer_encoder gcnconv"
        " --n_epochs 100"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cond_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 500000."
        " --lambda_gene_expr_recon 300."
        f" --lambda_cond_contrastive {lambda_cond_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

Submitted batch job 12228081
Submitted batch job 12228082
Submitted batch job 12228083
Submitted batch job 12228084
Submitted batch job 12228085


### 4.5 NanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

#### 4.5.1 Spatial Transcriptomics Data

In [51]:
# NanoString CosMx human NSCLC (full dataset): assemble the command line for
# the NicheCompass benchmarking training script and submit it as a Slurm job.
task = "single_sample_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
cell_type_key = "cell_type"
# This is a human dataset; the value was previously "mouse" (carried over from
# the mouse dataset cells) and is forwarded to the script via --species.
species = "human"
node_label_method = "one-hop-norm" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Adjacent string literals are concatenated; each piece starts with a space so
# that the result can be appended directly to the script name.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None None None None None None None"
    " --seeds 0 1 2 3 4 5 6 7 8 9"
    " --run_index 1 2 3 4 5 6 7 8 9 10"
    f" --cell_type_key {cell_type_key}"
    " --nichenet_keep_target_genes_ratio 1."
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    " --reference_batches None"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --no-filter_genes"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    " --cond_embed_injection encoder gene_expr_decoder"
    " --n_cond_embed None"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cond_contrastive {lambda_cond_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

Submitted batch job 12227832


#### 4.5.2 Spatial Transcriptomics Data Subsamples

In [53]:
# Submit one NicheCompass single-sample benchmarking job per subsampled
# NanoString CosMx human NSCLC dataset. Every setting that is identical for
# all subsamples is defined once before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# These are human datasets; the value was previously "mouse" (carried over
# from the mouse dataset cells) and is forwarded to the script via --species.
species = "human"
# NOTE(review): the full-dataset cell uses "one-hop-norm" — confirm that the
# different node label method for the subsamples is intended.
node_label_method = "one-hop-attention" # one-hop-norm, one-hop-attention
edge_batch_size_str = "16384 16384 16384 16384 16384 16384 16384 16384 16384 16384" # 16384 is full dataset
# Defined in this cell so that it neither raises a NameError on a fresh kernel
# nor silently inherits values from whichever other cell was executed last.
# 0. is the value used by the full-dataset cells of this notebook.
lambda_cond_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    # Only the dataset name (and everything derived from it) changes per job.
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

    # Adjacent string literals are concatenated; each piece starts with a
    # space so that the result can be appended directly to the script name.
    script_args = (
        " --adata_new_name None"
        " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None None None"
        " --seeds 0 1 2 3 4 5 6 7 8 9"
        " --run_index 1 2 3 4 5 6 7 8 9 10"
        f" --cell_type_key {cell_type_key}"
        " --nichenet_keep_target_genes_ratio 1."
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --condition_key batch"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        f" --model_label {node_label_method}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --active_gp_thresh_ratio 0.05"
        " --gene_expr_recon_dist nb"
        " --cond_embed_injection encoder gene_expr_decoder"
        " --n_cond_embed None"
        " --log_variational"
        f" --node_label_method {node_label_method}"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        " --conv_layer_encoder gcnconv"
        " --n_epochs 100"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cond_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 500000."
        " --lambda_gene_expr_recon 300."
        f" --lambda_cond_contrastive {lambda_cond_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

Submitted batch job 12228081
Submitted batch job 12228082
Submitted batch job 12228083
Submitted batch job 12228084
Submitted batch job 12228085


## 5. NicheCompass Sample Integration Method Benchmarking Models Training

### 5.1 seqFISH Mouse Organogenesis

In [None]:
# seqFISH mouse organogenesis: assemble the command line for the NicheCompass
# sample integration benchmarking training script and submit it as a Slurm job.
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis"
# Previously "Main_molecular_cell_type", which is the key used for the STARmap
# PLUS cells; the other seqFISH submission in this notebook passes
# "celltype_mapped_refined" for this dataset.
cell_type_key = "celltype_mapped_refined"
species = "mouse"
node_label_method = "one-hop-norm"
# One edge batch size per run (ten runs).
edge_batch_size_str = "4096 4096 2048 2048 1024 1024 512 512 256 256"

job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Adjacent string literals are concatenated; each piece starts with a space so
# that the result can be appended directly to the script name.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None None None None None None None"
    " --seeds 0 1 2 3 4 5 6 7 8 9"
    " --run_index 1 2 3 4 5 6 7 8 9 10"
    f" --cell_type_key {cell_type_key}"
    " --nichenet_keep_target_genes_ratio 0.01"
    " --nichenet_max_n_target_genes_per_gp 25344"
    " --include_mebocost_gps"
    f" --mebocost_species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    " --reference_batches None"
    " --counts_key counts"
    " --condition_key batch"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --no-filter_genes"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    " --cond_embed_injection gene_expr_decoder"
    " --n_cond_embed None"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cond_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    " --lambda_cond_contrastive 0."
    " --contrastive_logits_ratio 0."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# Job configuration: NicheCompass "sample integration method benchmarking"
# run on the seqFISH mouse organogenesis dataset.
job_name_prefix = "nichecompass_seqfish_mouse_organogenesis_sample_integration_method_benchmarking"
job_id = 1
# The job file and the `logs/` folder are derived from this path by
# `submit_python_script` (defined at the top of this notebook).
job_folder_path = "/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/slurm_jobs"
conda_env_name = "nichecompass"
# NOTE(review): per the helper at the top of this notebook, the job does
# `cd script_folder_path` and then runs `python ../<script_name>`, i.e. the
# script is resolved relative to the parent of this folder -- confirm that this
# is the intended location.
script_folder_path = "/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script. The helper appends this
# string directly after the script name without a separator, so it has to start
# with a space. The list-valued options below each carry ten entries
# (presumably one per run index -- confirm against the training script).
#
# The fragments are joined by implicit string concatenation inside
# parentheses rather than by backslash continuations: a trailing backslash
# after the last fragment would make the statement depend on the following
# line being blank.
script_args = (
    " --adata_new_name None "
    " --n_neighbors_list 4 4 8 8 12 12 16 16 20 20 "
    " --edge_batch_size_list 512 512 256 256 256 256 128 128 128 128 "
    " --node_batch_size_list 64 64 32 32 32 32 16 16 16 16 "
    " --seeds 0 1 2 3 4 5 6 7 8 9 "
    " --run_index 1 2 3 4 5 6 7 8 9 10 "
    " --cell_type_key celltype_mapped_refined "
    " --nichenet_keep_target_genes_ratio 0.01 "
    " --nichenet_max_n_target_genes_per_gp 25344 "
    " --include_mebocost_gps "
    " --mebocost_species mouse "
    " --gp_filter_mode subset "
    " --combine_overlap_gps "
    " --overlap_thresh_source_genes 0.9 "
    " --overlap_thresh_target_genes 0.9 "
    " --overlap_thresh_genes 0.9 "
    " --dataset seqfish_mouse_organogenesis "
    " --reference_batches batch1 batch2 batch3 batch4 batch5 batch6 "
    " --counts_key counts "
    " --condition_key batch "
    " --spatial_key spatial "
    " --adj_key spatial_connectivities "
    " --mapping_entity_key mapping_entity "
    " --no-filter_genes "
    " --gp_targets_mask_key nichecompass_gp_targets "
    " --gp_sources_mask_key nichecompass_gp_sources "
    " --gp_names_key nichecompass_gp_names "
    " --model_label sample_integration_method_benchmarking "
    " --active_gp_names_key nichecompass_active_gp_names "
    " --latent_key nichecompass_latent "
    " --active_gp_thresh_ratio 0.03 "
    " --gene_expr_recon_dist nb "
    " --cond_embed_injection gene_expr_decoder "
    " --log_variational "
    " --n_layers_encoder 1 "
    " --conv_layer_encoder gcnconv "
    " --n_epochs 40 "
    " --n_epochs_all_gps 20 "
    " --lr 0.001 "
    " --lambda_edge_recon 10. "
    " --lambda_gene_expr_recon 0.01 "
    " --lambda_cond_contrastive 10. "
    " --contrastive_logits_ratio 0.1 "
    " --lambda_group_lasso 0. "
    " --lambda_l1_masked 0. "
)

# Pass the configuration to the job helper; `nice` matches its default value.
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)