# Slurm Job Submission

- **Creator:** Sebastian Birk (<sebastian.birk@helmholtz-munich.de>)
- **Affiliation:** Helmholtz Munich, Institute of Computational Biology (ICB), Talavera-López Lab
- **Date of Creation:** 20.03.2023
- **Date of Last Modification:** 31.08.2023

## 1. Setup

### 1.1 Import Libraries

In [55]:
import numpy as np
import os

### 1.2 Define Parameters

In [56]:
# Conda environment names. `conda_env_name` is the value the submission cells
# below pass to `submit_python_script`, which activates it in the job file.
conda_env_name = "nichecompass"
reproducibility_conda_env_name = "nichecompass-reproducibility"
# NOTE(review): the next two assignments overwrite the values set just above,
# so both names end up as "nichecompass-test" when this cell runs. Presumably
# a temporary override for testing -- confirm before submitting real runs.
reproducibility_conda_env_name = "nichecompass-test"
conda_env_name = "nichecompass-test"

### 1.3 Define Functions

In [57]:
def submit_python_script(
        job_name_prefix,
        job_id,
        job_folder_path,
        conda_env_name,
        script_folder_path,
        script_name,
        script_args,
        t="48:00:00",
        p="gpu_p",
        gres="gpu:1",
        qos="gpu",
        nice=10000,
        c=6,
        mem="128G"):
    """
    Write a Slurm batch file for a Python script and submit it with ``sbatch``.

    The batch file is written to ``job_<job_name>.cmd`` and the job's stdout
    and stderr are directed to ``<job_folder_path>/logs``. The generated job
    sources ``$HOME/.bashrc``, activates ``conda_env_name``, changes into
    ``script_folder_path`` and runs ``python ../<script_name><script_args>``,
    i.e. the script is looked up one level above ``script_folder_path``.

    Parameters
    ----------
    job_name_prefix:
        Prefix of the job name; the job name is ``<job_name_prefix>_<job_id>``.
    job_id:
        Identifier appended to the job name.
    job_folder_path:
        Folder for the job file and the ``logs`` subfolder (created if
        missing). Any ``/aih`` substring is removed for the job file path only.
    conda_env_name:
        Conda environment activated inside the job.
    script_folder_path:
        Directory the job changes into before running the script.
    script_name:
        File name of the Python script, resolved relative to the parent of
        ``script_folder_path``.
    script_args:
        Argument string appended verbatim after the script name (callers in
        this notebook start it with a space).
    t:
        Value for ``#SBATCH -t``.
    p:
        Value for ``#SBATCH -p``. ``--gres`` and ``--qos`` are only written
        if this contains the substring ``"gpu"``.
    gres:
        Value for ``#SBATCH --gres``.
    qos:
        Value for ``#SBATCH --qos``.
    nice:
        Value for ``#SBATCH --nice``.
    c:
        Value for ``#SBATCH -c``.
    mem:
        Value for ``#SBATCH --mem``.

    Returns
    -------
    None. A failed submission is reported on stderr and does not raise, so a
    loop submitting many jobs keeps going.
    """
    import subprocess
    import sys

    job_name = f"{job_name_prefix}_{job_id}"
    # Account for fact that submit node has different home path than compute node
    job_file_path = f"{job_folder_path.replace('/aih', '')}/job_{job_name}.cmd"
    out_file_path = f"{job_folder_path}/logs/out_{job_name}.txt"
    err_file_path = f"{job_folder_path}/logs/err_{job_name}.txt"

    os.makedirs(job_folder_path + "/logs", exist_ok=True)

    job_lines = [
        "#!/bin/bash",
        f"#SBATCH -J {job_name}",
        f"#SBATCH -o {out_file_path}",
        f"#SBATCH -e {err_file_path}",
        f"#SBATCH -t {t}",
        f"#SBATCH -p {p}",
        f"#SBATCH -c {c}",
    ]
    # GPU resources are only requested on partitions whose name contains "gpu"
    if "gpu" in p:
        job_lines += [
            f"#SBATCH --gres={gres}",
            f"#SBATCH --qos={qos}",
        ]
    job_lines += [
        f"#SBATCH --mem={mem}",
        f"#SBATCH --nice={nice}",
        "source $HOME/.bashrc",
        f"conda activate {conda_env_name}",
        "cd /",
        f"cd {script_folder_path}",
        f"python ../{script_name}{script_args}",
    ]

    with open(job_file_path, "w") as handle:
        handle.write("\n".join(job_lines) + "\n")

    # Pass the arguments as a list so the job file path is never interpreted by
    # a shell (paths with spaces or shell metacharacters stay intact).
    try:
        completed = subprocess.run(["sbatch", job_file_path], check=False)
    except OSError as err:
        print(f"Could not run sbatch for {job_file_path}: {err}",
              file=sys.stderr)
        return
    if completed.returncode != 0:
        print(f"sbatch exited with status {completed.returncode} "
              f"for {job_file_path}",
              file=sys.stderr)

## 2. NicheCompass Ablation

### 2.1 Loss Weights & GP Mask

Ablating:
- Loss weights: Gene Expression Reconstruction, Edge Reconstruction, KL Divergence
- Gene Program (GP) Mask: Fully Connected (FC) GP Mask, NicheCompass Default GP Mask (Not Filtered, Weakly Filtered, Strongly Filtered)

Additional covariate:
- Spatial Neighborhood Graph: Number of Neighbors

#### 2.1.1 Xenium Human Breast Cancer

##### 2.1.1.1 Fully Connected GP Mask

In [None]:
# Xenium human breast cancer, runs with --add_fc_gps_instead_of_gp_dict_gps.
# Grid: 3 edge recon weights x 3 gene expr recon weights x 4 neighborhood
# sizes = 36 jobs, numbered 1-36.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --add_fc_gps_instead_of_gp_dict_gps"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

In [None]:
# Xenium human breast cancer, runs with --add_fc_gps_instead_of_gp_dict_gps.
# Grid over the larger loss weights: 2 x 2 x 4 = 16 jobs, numbered 37-52.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 37
for lambda_edge_recon in [5000000, 50000000]:
    for lambda_gene_expr_recon in [3000, 30000]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --add_fc_gps_instead_of_gp_dict_gps"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.1.2 NicheCompass GP Mask

In [None]:
# Xenium human breast cancer, runs without the fully connected GP flag.
# Grid: 3 edge recon weights x 3 gene expr recon weights x 4 neighborhood
# sizes = 36 jobs, numbered 53-88.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 53
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

In [None]:
# Xenium human breast cancer, runs without the fully connected GP flag.
# Grid over the larger loss weights: 2 x 2 x 4 = 16 jobs, numbered 89-104.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 89
for lambda_edge_recon in [5000000, 50000000]:
    for lambda_gene_expr_recon in [3000, 30000]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

#### 2.1.2 STARmap PLUS Mouse Central Nervous System

##### 2.1.2.1 Fully Connected GP Mask

In [None]:
# STARmap PLUS mouse CNS, runs with --add_fc_gps_instead_of_gp_dict_gps.
# Grid: 3 edge recon weights x 3 gene expr recon weights x 4 neighborhood
# sizes = 36 jobs, numbered 1-36.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 1
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --add_fc_gps_instead_of_gp_dict_gps"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

In [None]:
# STARmap PLUS mouse CNS, runs with --add_fc_gps_instead_of_gp_dict_gps.
# Grid over the larger loss weights: 2 x 2 x 4 = 16 jobs, numbered 37-52.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 37
for lambda_edge_recon in [5000000, 50000000]:
    for lambda_gene_expr_recon in [3000, 30000]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --add_fc_gps_instead_of_gp_dict_gps"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

##### 2.1.2.2 NicheCompass GP Mask

In [None]:
# STARmap PLUS mouse CNS, runs without the fully connected GP flag.
# Grid: 3 edge recon weights x 3 gene expr recon weights x 4 neighborhood
# sizes = 36 jobs, numbered 53-88.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 53
for lambda_edge_recon in [0, 50000, 500000]:
    for lambda_gene_expr_recon in [0, 30, 300]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

In [None]:
# STARmap PLUS mouse CNS, runs without the fully connected GP flag.
# Grid over the larger loss weights: 2 x 2 x 4 = 16 jobs, numbered 89-104.

# Settings that stay fixed across the whole grid
task = "ablation"
ablation_task = "loss_weights"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024

# Job naming and script location do not depend on the swept values
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

job_id = 89
for lambda_edge_recon in [5000000, 50000000]:
    for lambda_gene_expr_recon in [3000, 30000]:
        for n_neighbors in [4, 8, 12, 16]:
            # One argument per line; adjacent literals are concatenated
            script_args = (
                f" --dataset {dataset}"
                f" --reference_batches {reference_batches}"
                f" --n_neighbors {n_neighbors}"
                " --no-filter_genes"
                " --nichenet_keep_target_genes_ratio 1."
                " --nichenet_max_n_target_genes_per_gp 250"
                " --include_mebocost_gps"
                f" --species {species}"
                " --gp_filter_mode subset"
                " --combine_overlap_gps"
                " --overlap_thresh_source_genes 0.9"
                " --overlap_thresh_target_genes 0.9"
                " --overlap_thresh_genes 0.9"
                " --counts_key counts"
                " --spatial_key spatial"
                " --adj_key spatial_connectivities"
                " --mapping_entity_key mapping_entity"
                " --gp_targets_mask_key nichecompass_gp_targets"
                " --gp_sources_mask_key nichecompass_gp_sources"
                " --gp_names_key nichecompass_gp_names"
                f" --model_label {ablation_task}_{task}"
                " --active_gp_names_key nichecompass_active_gp_names"
                " --latent_key nichecompass_latent"
                " --active_gp_thresh_ratio 0."
                " --gene_expr_recon_dist nb"
                " --log_variational"
                f" --node_label_method {node_label_method}"
                " --n_layers_encoder 1"
                " --n_hidden_encoder None"
                " --conv_layer_encoder gcnconv"
                " --n_epochs 100"
                " --n_epochs_all_gps 25"
                " --lr 0.001"
                f" --lambda_edge_recon {lambda_edge_recon}"
                f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                " --lambda_group_lasso 0."
                " --lambda_l1_masked 0."
                f" --edge_batch_size {edge_batch_size}"
                " --node_batch_size None"
                " --n_sampled_neighbors -1"
                f" --timestamp_suffix _{job_id}"
            )

            submit_python_script(
                job_name_prefix=job_name_prefix,
                job_id=job_id,
                job_folder_path=job_folder_path,
                conda_env_name=conda_env_name,
                script_folder_path=script_folder_path,
                script_name=script_name,
                script_args=script_args,
                nice=10000,
            )

            job_id += 1

#### 2.1.3 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# tmp
# Temporary single-configuration submission for the Vizgen MERFISH human
# ovarian cancer loss-weights ablation. Everything that does not depend on the
# swept values is defined once, ahead of the loop.
job_id = 100006

task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512
n_sampled_neighbors = -1

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Flattened sweep; the iteration order equals nesting the loops as
# edge weight -> gene expression weight -> neighbor count.
param_grid = [
    (edge_w, gene_w, k)
    for edge_w in [50000000]
    for gene_w in [3000]
    for k in [16]
]

for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in param_grid:
    # Argument string handed to the training script; every fragment carries
    # its own leading space, so plain concatenation is sufficient.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 0.1",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 79",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --n_sampled_neighbors {n_sampled_neighbors}",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

    # Each submission gets its own id; it is also used as timestamp suffix.
    job_id += 1

##### 2.1.3.1 Fully Connected GP Mask

In [None]:
# Loss-weights ablation on Vizgen MERFISH human ovarian cancer with the
# `--add_fc_gps_instead_of_gp_dict_gps` flag set. Sweeps 3 x 3 x 4 = 36
# combinations, numbered consecutively starting at 1.
job_id = 1

task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Flattened sweep; the iteration order equals nesting the loops as
# edge weight -> gene expression weight -> neighbor count.
param_grid = [
    (edge_w, gene_w, k)
    for edge_w in [0, 50000, 500000]
    for gene_w in [0, 30, 300]
    for k in [4, 8, 12, 16]
]

for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in param_grid:
    # Argument string handed to the training script; every fragment carries
    # its own leading space, so plain concatenation is sufficient.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --add_fc_gps_instead_of_gp_dict_gps",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        " --n_sampled_neighbors -1",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

    # Each submission gets its own id; it is also used as timestamp suffix.
    job_id += 1

In [None]:
# Second part of the loss-weights sweep with the
# `--add_fc_gps_instead_of_gp_dict_gps` flag set: larger loss weights,
# 2 x 2 x 4 = 16 combinations, numbered consecutively starting at 37.
job_id = 37

task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Flattened sweep; the iteration order equals nesting the loops as
# edge weight -> gene expression weight -> neighbor count.
param_grid = [
    (edge_w, gene_w, k)
    for edge_w in [5000000, 50000000]
    for gene_w in [3000, 30000]
    for k in [4, 8, 12, 16]
]

for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in param_grid:
    # Argument string handed to the training script; every fragment carries
    # its own leading space, so plain concatenation is sufficient.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --add_fc_gps_instead_of_gp_dict_gps",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        " --n_sampled_neighbors -1",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

    # Each submission gets its own id; it is also used as timestamp suffix.
    job_id += 1

##### 2.1.3.2 NicheCompass GP Mask

In [None]:
# Loss-weights ablation on Vizgen MERFISH human ovarian cancer without the
# `--add_fc_gps_instead_of_gp_dict_gps` flag. Sweeps 3 x 3 x 4 = 36
# combinations, numbered consecutively starting at 53.
job_id = 53

task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Flattened sweep; the iteration order equals nesting the loops as
# edge weight -> gene expression weight -> neighbor count.
param_grid = [
    (edge_w, gene_w, k)
    for edge_w in [0, 50000, 500000]
    for gene_w in [0, 30, 300]
    for k in [4, 8, 12, 16]
]

for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in param_grid:
    # Argument string handed to the training script; every fragment carries
    # its own leading space, so plain concatenation is sufficient.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        " --n_sampled_neighbors -1",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

    # Each submission gets its own id; it is also used as timestamp suffix.
    job_id += 1

In [None]:
# missing
# (Author's marker kept as is; presumably these runs were still outstanding
# -- confirm before re-submitting.)
# Second part of the loss-weights sweep without the
# `--add_fc_gps_instead_of_gp_dict_gps` flag: larger loss weights,
# 2 x 2 x 4 = 16 combinations, numbered consecutively starting at 89.
job_id = 89

task = "ablation"
ablation_task = "loss_weights"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Flattened sweep; the iteration order equals nesting the loops as
# edge weight -> gene expression weight -> neighbor count.
param_grid = [
    (edge_w, gene_w, k)
    for edge_w in [5000000, 50000000]
    for gene_w in [3000, 30000]
    for k in [4, 8, 12, 16]
]

for lambda_edge_recon, lambda_gene_expr_recon, n_neighbors in param_grid:
    # Argument string handed to the training script; every fragment carries
    # its own leading space, so plain concatenation is sufficient.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        " --n_sampled_neighbors -1",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

    # Each submission gets its own id; it is also used as timestamp suffix.
    job_id += 1

#### 2.1.4 Metrics Computation

In [None]:
# tmp
# Temporary metrics-computation submission for the STARmap PLUS mouse CNS
# loss-weights ablation; `np.arange(60, 65, 5)` yields the single value 60.
job_id = 1

task = "ablation"
dataset = "starmap_plus_mouse_cns"
ablation_task = "loss_weights"
cell_type_keys = "Main_molecular_cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(60, 65, 5):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {ablation_task}_{task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# tmp
# Temporary metrics-computation submission for the seqFISH mouse
# organogenesis (imputed) data; one job per summary file index 5, 10, ..., 65.
# Unlike the sibling cells, `--task` receives the bare `task` value and all
# paths/names are spelled out with a literal "ablation" / "loss_weights".
job_id = 1

task = "one-hop-norm_reference"
dataset = "seqfish_mouse_organogenesis_imputed"
ablation_task = "loss_weights"  # not referenced further in this cell
cell_type_keys = "celltype_mapped_refined"
batch_keys = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_loss_weights_ablation_{dataset}_metrics_computation"
job_folder_path = "../scripts/ablation/slurm_jobs"
script_folder_path = "/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/ablation"
script_name = "compute_metrics.py"

for i in np.arange(5, 70, 5):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_loss_weights_ablation_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# Metrics computation for the Xenium human breast cancer loss-weights
# ablation; one job per summary file index 5, 10, ..., 40.
job_id = 1

task = "ablation"
dataset = "xenium_human_breast_cancer"
ablation_task = "loss_weights"
cell_type_keys = "cell_states"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(5, 45, 5):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {ablation_task}_{task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# tmp
# Temporary metrics-computation submission for the Vizgen MERFISH human
# ovarian cancer loss-weights ablation; summary file indices 45, 50, 55, 60.
job_id = 1

task = "ablation"
dataset = "vizgen_merfish_human_ovarian_cancer"
ablation_task = "loss_weights"
cell_type_keys = "cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(45, 65, 5):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {ablation_task}_{task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# Metrics computation for the STARmap PLUS mouse CNS loss-weights ablation;
# one job per summary file index 5, 10, ..., 40.
job_id = 1

task = "ablation"
dataset = "starmap_plus_mouse_cns"
ablation_task = "loss_weights"
cell_type_keys = "Main_molecular_cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(5, 45, 5):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {ablation_task}_{task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# Metrics computation for the Vizgen MERFISH human ovarian cancer
# loss-weights ablation; one job per summary file index 5, 10, ..., 40.
job_id = 1

task = "ablation"
dataset = "vizgen_merfish_human_ovarian_cancer"
ablation_task = "loss_weights"
cell_type_keys = "cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(5, 45, 5):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {ablation_task}_{task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# Metrics computation for the Vizgen MERFISH human ovarian cancer
# loss-weights ablation; one job per summary file index 20, 40, ..., 220.
job_id = 1

task = "ablation"
dataset = "vizgen_merfish_human_ovarian_cancer"
ablation_task = "loss_weights"
cell_type_keys = "cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(20, 240, 20):
    # `i` only selects which summary CSV file name is passed to the script.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = "".join([
        f" --task {ablation_task}_{task}",
        f" --file_name {file_name}",
        f" --datasets {dataset}",
        f" --cell_type_keys {cell_type_keys}",
        f" --batch_keys {batch_keys}",
        f" --metrics {metrics}",
    ])

    # Metrics jobs use the reproducibility environment and the default `nice`.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

### 2.2 Neighbor Sampling

Best parameters from previous ablations:
- lambda_edge_recon = 50,000,000
- lambda_gene_expr_recon = 30,000
- nichenet_keep_target_genes_ratio = 1.0

Ablating:
- Neighbor sampling: no sampling, 2 neighbors sampling, 4 neighbors sampling, 8 neighbors sampling

#### 2.2.1 Xenium Human Breast Cancer

In [None]:
# Neighbor-sampling ablation on Xenium human breast cancer: 4 neighborhood
# sizes x 3 sampled-neighbor counts = 12 jobs, numbered from 1.
job_id = 1

task = "ablation"
ablation_task = "neighbor_sampling"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
# NOTE(review): the section text above this cell lists 50,000,000 and 30,000
# as the best loss weights from previous ablations, while this cell uses
# 500000 and 300 -- confirm which values are intended.
lambda_edge_recon = 500000
lambda_gene_expr_recon = 300

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Flattened sweep; the iteration order equals nesting the loops as
# neighbor count -> sampled neighbor count.
param_grid = [
    (k, k_sampled)
    for k in [4, 8, 12, 16]
    for k_sampled in [2, 4, 8]
]

for n_neighbors, n_sampled_neighbors in param_grid:
    # Argument string handed to the training script; every fragment carries
    # its own leading space, so plain concatenation is sufficient.
    script_args = "".join([
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        f" --n_neighbors {n_neighbors}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {ablation_task}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        f" --node_label_method {node_label_method}",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        " --conv_layer_encoder gcnconv",
        " --n_epochs 100",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        f" --lambda_edge_recon {lambda_edge_recon}",
        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        f" --edge_batch_size {edge_batch_size}",
        " --node_batch_size None",
        f" --n_sampled_neighbors {n_sampled_neighbors}",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

    # Each submission gets its own id; it is also used as timestamp suffix.
    job_id += 1

In [None]:
# tmp
# Submits one neighbor sampling ablation training run on the Xenium human
# breast cancer dataset (n_neighbors=20, n_sampled_neighbors=8) as job 15.
job_id = 15

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "neighbor_sampling"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 256
lambda_edge_recon = 500000
lambda_gene_expr_recon = 300

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_neighbors in [20]:
    for n_sampled_neighbors in [8]:
        # Only the neighbor settings and the timestamp suffix vary per run
        script_args = (
            f" --dataset {dataset}"
            f" --reference_batches {reference_batches}"
            f" --n_neighbors {n_neighbors}"
            " --no-filter_genes"
            " --nichenet_keep_target_genes_ratio 1.0"
            " --nichenet_max_n_target_genes_per_gp 250"
            " --include_mebocost_gps"
            f" --species {species}"
            " --gp_filter_mode subset"
            " --combine_overlap_gps"
            " --overlap_thresh_source_genes 0.9"
            " --overlap_thresh_target_genes 0.9"
            " --overlap_thresh_genes 0.9"
            " --counts_key counts"
            " --spatial_key spatial"
            " --adj_key spatial_connectivities"
            " --mapping_entity_key mapping_entity"
            " --gp_targets_mask_key nichecompass_gp_targets"
            " --gp_sources_mask_key nichecompass_gp_sources"
            " --gp_names_key nichecompass_gp_names"
            f" --model_label {ablation_task}_{task}"
            " --active_gp_names_key nichecompass_active_gp_names"
            " --latent_key nichecompass_latent"
            " --active_gp_thresh_ratio 0."
            " --gene_expr_recon_dist nb"
            " --log_variational"
            f" --node_label_method {node_label_method}"
            " --n_layers_encoder 1"
            " --n_hidden_encoder None"
            " --conv_layer_encoder gcnconv"
            " --n_epochs 100"
            " --n_epochs_all_gps 25"
            " --lr 0.001"
            f" --lambda_edge_recon {lambda_edge_recon}"
            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
            " --lambda_group_lasso 0."
            " --lambda_l1_masked 0."
            f" --edge_batch_size {edge_batch_size}"
            " --node_batch_size None"
            f" --n_sampled_neighbors {n_sampled_neighbors}"
            f" --timestamp_suffix _{job_id}"
        )

        submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000,
        )

        job_id += 1

#### 2.2.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Neighbor sampling ablation on the STARmap PLUS mouse CNS dataset:
# 4 neighborhood sizes x 3 sampled-neighbor settings = 12 runs (jobs 1-12).
job_id = 1

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "neighbor_sampling"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 1024
lambda_edge_recon = 500000
lambda_gene_expr_recon = 300

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_neighbors in [4, 8, 12, 16]:
    for n_sampled_neighbors in [2, 4, 8]:
        # Only the neighbor settings and the timestamp suffix vary per run
        script_args = (
            f" --dataset {dataset}"
            f" --reference_batches {reference_batches}"
            f" --n_neighbors {n_neighbors}"
            " --no-filter_genes"
            " --nichenet_keep_target_genes_ratio 1.0"
            " --nichenet_max_n_target_genes_per_gp 250"
            " --include_mebocost_gps"
            f" --species {species}"
            " --gp_filter_mode subset"
            " --combine_overlap_gps"
            " --overlap_thresh_source_genes 0.9"
            " --overlap_thresh_target_genes 0.9"
            " --overlap_thresh_genes 0.9"
            " --counts_key counts"
            " --spatial_key spatial"
            " --adj_key spatial_connectivities"
            " --mapping_entity_key mapping_entity"
            " --gp_targets_mask_key nichecompass_gp_targets"
            " --gp_sources_mask_key nichecompass_gp_sources"
            " --gp_names_key nichecompass_gp_names"
            f" --model_label {ablation_task}_{task}"
            " --active_gp_names_key nichecompass_active_gp_names"
            " --latent_key nichecompass_latent"
            " --active_gp_thresh_ratio 0."
            " --gene_expr_recon_dist nb"
            " --log_variational"
            f" --node_label_method {node_label_method}"
            " --n_layers_encoder 1"
            " --n_hidden_encoder None"
            " --conv_layer_encoder gcnconv"
            " --n_epochs 100"
            " --n_epochs_all_gps 25"
            " --lr 0.001"
            f" --lambda_edge_recon {lambda_edge_recon}"
            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
            " --lambda_group_lasso 0."
            " --lambda_l1_masked 0."
            f" --edge_batch_size {edge_batch_size}"
            " --node_batch_size None"
            f" --n_sampled_neighbors {n_sampled_neighbors}"
            f" --timestamp_suffix _{job_id}"
        )

        submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000,
        )

        job_id += 1

In [None]:
# Neighbor sampling ablation on the STARmap PLUS mouse CNS dataset, larger
# neighborhoods: 4 x 3 = 12 runs numbered 13-24. This cell uses an edge batch
# size of 512.
job_id = 13

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "neighbor_sampling"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 512
lambda_edge_recon = 500000
lambda_gene_expr_recon = 300

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_neighbors in [20, 24, 28, 32]:
    for n_sampled_neighbors in [2, 4, 8]:
        # Only the neighbor settings and the timestamp suffix vary per run
        script_args = (
            f" --dataset {dataset}"
            f" --reference_batches {reference_batches}"
            f" --n_neighbors {n_neighbors}"
            " --no-filter_genes"
            " --nichenet_keep_target_genes_ratio 1.0"
            " --nichenet_max_n_target_genes_per_gp 250"
            " --include_mebocost_gps"
            f" --species {species}"
            " --gp_filter_mode subset"
            " --combine_overlap_gps"
            " --overlap_thresh_source_genes 0.9"
            " --overlap_thresh_target_genes 0.9"
            " --overlap_thresh_genes 0.9"
            " --counts_key counts"
            " --spatial_key spatial"
            " --adj_key spatial_connectivities"
            " --mapping_entity_key mapping_entity"
            " --gp_targets_mask_key nichecompass_gp_targets"
            " --gp_sources_mask_key nichecompass_gp_sources"
            " --gp_names_key nichecompass_gp_names"
            f" --model_label {ablation_task}_{task}"
            " --active_gp_names_key nichecompass_active_gp_names"
            " --latent_key nichecompass_latent"
            " --active_gp_thresh_ratio 0."
            " --gene_expr_recon_dist nb"
            " --log_variational"
            f" --node_label_method {node_label_method}"
            " --n_layers_encoder 1"
            " --n_hidden_encoder None"
            " --conv_layer_encoder gcnconv"
            " --n_epochs 100"
            " --n_epochs_all_gps 25"
            " --lr 0.001"
            f" --lambda_edge_recon {lambda_edge_recon}"
            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
            " --lambda_group_lasso 0."
            " --lambda_l1_masked 0."
            f" --edge_batch_size {edge_batch_size}"
            " --node_batch_size None"
            f" --n_sampled_neighbors {n_sampled_neighbors}"
            f" --timestamp_suffix _{job_id}"
        )

        submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000,
        )

        job_id += 1

#### 2.2.3 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Neighbor sampling ablation on the Vizgen MERFISH human ovarian cancer
# dataset: 4 neighborhood sizes x 3 sampled-neighbor settings = 12 runs
# (jobs 1-12).
job_id = 1

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "neighbor_sampling"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 512
lambda_edge_recon = 500000
lambda_gene_expr_recon = 300

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_neighbors in [4, 8, 12, 16]:
    for n_sampled_neighbors in [2, 4, 8]:
        # Only the neighbor settings and the timestamp suffix vary per run
        script_args = (
            f" --dataset {dataset}"
            f" --reference_batches {reference_batches}"
            f" --n_neighbors {n_neighbors}"
            " --no-filter_genes"
            " --nichenet_keep_target_genes_ratio 1.0"
            " --nichenet_max_n_target_genes_per_gp 250"
            " --include_mebocost_gps"
            f" --species {species}"
            " --gp_filter_mode subset"
            " --combine_overlap_gps"
            " --overlap_thresh_source_genes 0.9"
            " --overlap_thresh_target_genes 0.9"
            " --overlap_thresh_genes 0.9"
            " --counts_key counts"
            " --spatial_key spatial"
            " --adj_key spatial_connectivities"
            " --mapping_entity_key mapping_entity"
            " --gp_targets_mask_key nichecompass_gp_targets"
            " --gp_sources_mask_key nichecompass_gp_sources"
            " --gp_names_key nichecompass_gp_names"
            f" --model_label {ablation_task}_{task}"
            " --active_gp_names_key nichecompass_active_gp_names"
            " --latent_key nichecompass_latent"
            " --active_gp_thresh_ratio 0."
            " --gene_expr_recon_dist nb"
            " --log_variational"
            f" --node_label_method {node_label_method}"
            " --n_layers_encoder 1"
            " --n_hidden_encoder None"
            " --conv_layer_encoder gcnconv"
            " --n_epochs 100"
            " --n_epochs_all_gps 25"
            " --lr 0.001"
            f" --lambda_edge_recon {lambda_edge_recon}"
            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
            " --lambda_group_lasso 0."
            " --lambda_l1_masked 0."
            f" --edge_batch_size {edge_batch_size}"
            " --node_batch_size None"
            f" --n_sampled_neighbors {n_sampled_neighbors}"
            f" --timestamp_suffix _{job_id}"
        )

        submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000,
        )

        job_id += 1

In [None]:
# Neighbor sampling ablation on the Vizgen MERFISH human ovarian cancer
# dataset, larger neighborhoods: 4 x 3 = 12 runs.
#
# Numbering continues at 13 because the preceding Vizgen cell already uses
# job ids 1-12 with the same job name prefix and job folder; restarting at 1
# would overwrite its job files and logs and reuse its timestamp suffixes.
job_id = 13

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "neighbor_sampling"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 256
# The loss weights were left blank in this cell (a syntax error); they are
# set to the values used by all other neighbor sampling ablation cells.
lambda_edge_recon = 500000
lambda_gene_expr_recon = 300
nichenet_keep_target_genes_ratio = 1.0

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_neighbors in [20, 24, 28, 32]:
    for n_sampled_neighbors in [2, 4, 8]:
        # Only the neighbor settings and the timestamp suffix vary per run
        script_args = (
            f" --dataset {dataset}"
            f" --reference_batches {reference_batches}"
            f" --n_neighbors {n_neighbors}"
            " --no-filter_genes"
            f" --nichenet_keep_target_genes_ratio {nichenet_keep_target_genes_ratio}"
            " --nichenet_max_n_target_genes_per_gp 250"
            " --include_mebocost_gps"
            f" --species {species}"
            " --gp_filter_mode subset"
            " --combine_overlap_gps"
            " --overlap_thresh_source_genes 0.9"
            " --overlap_thresh_target_genes 0.9"
            " --overlap_thresh_genes 0.9"
            " --counts_key counts"
            " --spatial_key spatial"
            " --adj_key spatial_connectivities"
            " --mapping_entity_key mapping_entity"
            " --gp_targets_mask_key nichecompass_gp_targets"
            " --gp_sources_mask_key nichecompass_gp_sources"
            " --gp_names_key nichecompass_gp_names"
            f" --model_label {ablation_task}_{task}"
            " --active_gp_names_key nichecompass_active_gp_names"
            " --latent_key nichecompass_latent"
            " --active_gp_thresh_ratio 0."
            " --gene_expr_recon_dist nb"
            " --log_variational"
            f" --node_label_method {node_label_method}"
            " --n_layers_encoder 1"
            " --n_hidden_encoder None"
            " --conv_layer_encoder gcnconv"
            " --n_epochs 100"
            " --n_epochs_all_gps 25"
            " --lr 0.001"
            f" --lambda_edge_recon {lambda_edge_recon}"
            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
            " --lambda_group_lasso 0."
            " --lambda_l1_masked 0."
            f" --edge_batch_size {edge_batch_size}"
            " --node_batch_size None"
            f" --n_sampled_neighbors {n_sampled_neighbors}"
            f" --timestamp_suffix _{job_id}"
        )

        submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000,
        )

        job_id += 1

#### 2.2.4 Metrics Computation

In [None]:
# Metrics computation for the neighbor sampling ablation on the Xenium human
# breast cancer dataset: one job per summary file index 2, 4, ..., 20.
job_id = 1

# Settings that are identical for every job submitted from this cell
task = "ablation"
dataset = "xenium_human_breast_cancer"
ablation_task = "neighbor_sampling"
cell_type_keys = "cell_states"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

# Slurm job naming and location of the metrics script
job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(2, 22, 2):
    # The summary file name is the only per-job difference
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = (
        f" --task {ablation_task}_{task}"
        f" --file_name {file_name}"
        f" --datasets {dataset}"
        f" --cell_type_keys {cell_type_keys}"
        f" --batch_keys {batch_keys}"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# Metrics computation for the neighbor sampling ablation on the STARmap PLUS
# mouse CNS dataset: one job per summary file index 2, 4, ..., 12.
job_id = 1

# Settings that are identical for every job submitted from this cell
task = "ablation"
dataset = "starmap_plus_mouse_cns"
ablation_task = "neighbor_sampling"
cell_type_keys = "Main_molecular_cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

# Slurm job naming and location of the metrics script
job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(2, 14, 2):
    # The summary file name is the only per-job difference
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = (
        f" --task {ablation_task}_{task}"
        f" --file_name {file_name}"
        f" --datasets {dataset}"
        f" --cell_type_keys {cell_type_keys}"
        f" --batch_keys {batch_keys}"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

In [None]:
# Metrics computation for the neighbor sampling ablation on the Vizgen MERFISH
# human ovarian cancer dataset: one job per summary file index 2, 4, ..., 12.
job_id = 1

# Settings that are identical for every job submitted from this cell
task = "ablation"
dataset = "vizgen_merfish_human_ovarian_cancer"
ablation_task = "neighbor_sampling"
cell_type_keys = "cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

# Slurm job naming and location of the metrics script
job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

for i in np.arange(2, 14, 2):
    # The summary file name is the only per-job difference
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"
    script_args = (
        f" --task {ablation_task}_{task}"
        f" --file_name {file_name}"
        f" --datasets {dataset}"
        f" --cell_type_keys {cell_type_keys}"
        f" --batch_keys {batch_keys}"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

    job_id += 1

### 2.3 Encoder Architecture

Ablating:
- Encoder Number of Message Passing Layers: 1, 2
- Encoder Type of Message Passing Layers: GCNConv vs GATv2Conv (4 attention heads)

#### 2.3.1 Xenium Human Breast Cancer

In [None]:
# Encoder architecture ablation on the Xenium human breast cancer dataset:
# 2 layer counts x 2 conv layer types x 1 hidden size x 2 FC layer counts
# x 4 neighborhood sizes = 32 submissions (jobs 1-32).
job_id = 1

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gcnconv", "gatv2conv"]:
        for n_hidden_encoder in [None]:
            # NOTE(review): n_fc_layers_encoder is iterated but never
            # forwarded in script_args, so both of its values submit the same
            # arguments (apart from the timestamp suffix) - confirm whether
            # the training script should receive it.
            for n_fc_layers_encoder in [1, 2]:
                for n_neighbors in [4, 8, 12, 16]:
                    script_args = (
                        f" --dataset {dataset}"
                        f" --reference_batches {reference_batches}"
                        f" --n_neighbors {n_neighbors}"
                        " --no-filter_genes"
                        " --nichenet_keep_target_genes_ratio 1.0"
                        " --nichenet_max_n_target_genes_per_gp 250"
                        " --include_mebocost_gps"
                        f" --species {species}"
                        " --gp_filter_mode subset"
                        " --combine_overlap_gps"
                        " --overlap_thresh_source_genes 0.9"
                        " --overlap_thresh_target_genes 0.9"
                        " --overlap_thresh_genes 0.9"
                        " --counts_key counts"
                        " --spatial_key spatial"
                        " --adj_key spatial_connectivities"
                        " --mapping_entity_key mapping_entity"
                        " --gp_targets_mask_key nichecompass_gp_targets"
                        " --gp_sources_mask_key nichecompass_gp_sources"
                        " --gp_names_key nichecompass_gp_names"
                        f" --model_label {ablation_task}_{task}"
                        " --active_gp_names_key nichecompass_active_gp_names"
                        " --latent_key nichecompass_latent"
                        " --active_gp_thresh_ratio 0."
                        " --gene_expr_recon_dist nb"
                        " --log_variational"
                        f" --node_label_method {node_label_method}"
                        f" --n_layers_encoder {n_layers_encoder}"
                        f" --n_hidden_encoder {n_hidden_encoder}"
                        f" --conv_layer_encoder {conv_layer_encoder}"
                        " --n_epochs 100"
                        " --n_epochs_all_gps 25"
                        " --lr 0.001"
                        f" --lambda_edge_recon {lambda_edge_recon}"
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                        " --lambda_group_lasso 0."
                        " --lambda_l1_masked 0."
                        f" --edge_batch_size {edge_batch_size}"
                        " --node_batch_size None"
                        f" --n_sampled_neighbors {n_sampled_neighbors}"
                        f" --timestamp_suffix _{job_id}"
                    )

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# Encoder architecture ablation on the Xenium human breast cancer dataset,
# GATv2Conv only: 2 layer counts x 1 conv layer type x 1 hidden size
# x 2 FC layer counts x 4 neighborhood sizes = 16 submissions (jobs 33-48).
job_id = 33

# Settings that are identical for every run submitted from this cell
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

# Slurm job naming and location of the training script
job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gatv2conv"]:
        for n_hidden_encoder in [None]:
            # NOTE(review): n_fc_layers_encoder is iterated but never
            # forwarded in script_args, so both of its values submit the same
            # arguments (apart from the timestamp suffix) - confirm whether
            # the training script should receive it.
            for n_fc_layers_encoder in [1, 2]:
                for n_neighbors in [4, 8, 12, 16]:
                    script_args = (
                        f" --dataset {dataset}"
                        f" --reference_batches {reference_batches}"
                        f" --n_neighbors {n_neighbors}"
                        " --no-filter_genes"
                        " --nichenet_keep_target_genes_ratio 1.0"
                        " --nichenet_max_n_target_genes_per_gp 250"
                        " --include_mebocost_gps"
                        f" --species {species}"
                        " --gp_filter_mode subset"
                        " --combine_overlap_gps"
                        " --overlap_thresh_source_genes 0.9"
                        " --overlap_thresh_target_genes 0.9"
                        " --overlap_thresh_genes 0.9"
                        " --counts_key counts"
                        " --spatial_key spatial"
                        " --adj_key spatial_connectivities"
                        " --mapping_entity_key mapping_entity"
                        " --gp_targets_mask_key nichecompass_gp_targets"
                        " --gp_sources_mask_key nichecompass_gp_sources"
                        " --gp_names_key nichecompass_gp_names"
                        f" --model_label {ablation_task}_{task}"
                        " --active_gp_names_key nichecompass_active_gp_names"
                        " --latent_key nichecompass_latent"
                        " --active_gp_thresh_ratio 0."
                        " --gene_expr_recon_dist nb"
                        " --log_variational"
                        f" --node_label_method {node_label_method}"
                        f" --n_layers_encoder {n_layers_encoder}"
                        f" --n_hidden_encoder {n_hidden_encoder}"
                        f" --conv_layer_encoder {conv_layer_encoder}"
                        " --n_epochs 100"
                        " --n_epochs_all_gps 25"
                        " --lr 0.001"
                        f" --lambda_edge_recon {lambda_edge_recon}"
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}"
                        " --lambda_group_lasso 0."
                        " --lambda_l1_masked 0."
                        f" --edge_batch_size {edge_batch_size}"
                        " --node_batch_size None"
                        f" --n_sampled_neighbors {n_sampled_neighbors}"
                        f" --timestamp_suffix _{job_id}"
                    )

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# tmp
# Submit a single encoder architecture ablation run for the Xenium human
# breast cancer dataset. Every loop below iterates over exactly one value, so
# `submit_python_script` is called once (with job_id 8).
job_id = 8

# Settings that are identical for every job submitted from this cell
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1]:
    for conv_layer_encoder in ["gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1]:
                for n_neighbors in [16]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        # Forward the looped value to the training script, as
                        # the STARmap PLUS and Vizgen MERFISH cells of this
                        # section do. It was previously iterated over but
                        # never passed on.
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 100",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        " --node_batch_size None",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    # `nice` ends up in the `#SBATCH --nice` line of the
                    # generated job file.
                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=0,
                    )

                    job_id += 1

#### 2.3.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Encoder architecture ablation on the STARmap PLUS mouse CNS dataset:
# `submit_python_script` is called once per combination of the looped
# hyperparameters. `job_id` starts at 1 and is incremented after every
# submission.
job_id = 1

# Settings that are identical for every job of this sweep
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 256
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gcnconv", "gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1, 2]:
                for n_neighbors in [4, 8, 12, 16]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 100",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        " --node_batch_size None",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# Encoder architecture ablation on the STARmap PLUS mouse CNS dataset,
# restricted to the "gatv2conv" encoder layer. `submit_python_script` is
# called once per combination of the looped hyperparameters; `job_id` starts
# at 33 and is incremented after every submission.
job_id = 33

# Settings that are identical for every job of this sweep
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 256
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1, 2]:
                for n_neighbors in [4, 8, 12, 16]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 100",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        " --node_batch_size None",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# tmp
# Submit a single encoder architecture ablation run for the STARmap PLUS
# mouse CNS dataset. Every loop below iterates over exactly one value, so
# `submit_python_script` is called once (with job_id 28).
# NOTE(review): edge_batch_size is 128 here, while the STARmap PLUS sweep
# cells in this section use 256 - presumably intentional, confirm.
job_id = 28

# Settings that are identical for every job submitted from this cell
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "starmap_plus_mouse_cns"
reference_batches = "batch1"
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [2]:
    for conv_layer_encoder in ["gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1]:
                for n_neighbors in [16]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 100",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        " --node_batch_size None",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

#### 2.3.3 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Encoder architecture ablation on the Vizgen MERFISH human ovarian cancer
# dataset: `submit_python_script` is called once per combination of the
# looped hyperparameters. `job_id` starts at 1 and is incremented after every
# submission.
job_id = 1

# Settings that are identical for every job of this sweep
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gcnconv", "gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1, 2]:
                for n_neighbors in [4, 8, 12, 16]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 100",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        " --node_batch_size None",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# Encoder architecture ablation on the Vizgen MERFISH human ovarian cancer
# dataset, restricted to the "gatv2conv" encoder layer.
# `submit_python_script` is called once per combination of the looped
# hyperparameters; `job_id` starts at 33 and is incremented after every
# submission.
job_id = 33

# Settings that are identical for every job of this sweep
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1, 2]:
    for conv_layer_encoder in ["gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1, 2]:
                for n_neighbors in [4, 8, 12, 16]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 100",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        " --node_batch_size None",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# tmp
# Submit a single encoder architecture ablation run for the Vizgen MERFISH
# human ovarian cancer dataset. Every loop below iterates over exactly one
# value, so `submit_python_script` is called once (with job_id 11).
# NOTE(review): edge_batch_size (256) and --n_epochs (50) differ from the
# Vizgen MERFISH sweep cells in this section (128 and 100) - presumably
# intentional, confirm.
job_id = 11

# Settings that are identical for every job submitted from this cell
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 256
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

for n_layers_encoder in [1]:
    for conv_layer_encoder in ["gatv2conv"]:
        for n_hidden_encoder in [None]:
            for n_fc_layers_encoder in [1]:
                for n_neighbors in [12]:
                    # One list entry per command line flag; the entries are
                    # concatenated into the argument string of the script.
                    script_args = "".join([
                        f" --dataset {dataset}",
                        f" --reference_batches {reference_batches}",
                        f" --n_neighbors {n_neighbors}",
                        " --no-filter_genes",
                        " --nichenet_keep_target_genes_ratio 1.0",
                        " --nichenet_max_n_target_genes_per_gp 250",
                        " --include_mebocost_gps",
                        f" --species {species}",
                        " --gp_filter_mode subset",
                        " --combine_overlap_gps",
                        " --overlap_thresh_source_genes 0.9",
                        " --overlap_thresh_target_genes 0.9",
                        " --overlap_thresh_genes 0.9",
                        " --counts_key counts",
                        " --spatial_key spatial",
                        " --adj_key spatial_connectivities",
                        " --mapping_entity_key mapping_entity",
                        " --gp_targets_mask_key nichecompass_gp_targets",
                        " --gp_sources_mask_key nichecompass_gp_sources",
                        " --gp_names_key nichecompass_gp_names",
                        f" --model_label {ablation_task}_{task}",
                        " --active_gp_names_key nichecompass_active_gp_names",
                        " --latent_key nichecompass_latent",
                        " --active_gp_thresh_ratio 0.",
                        " --gene_expr_recon_dist nb",
                        " --log_variational",
                        f" --node_label_method {node_label_method}",
                        f" --n_fc_layers_encoder {n_fc_layers_encoder}",
                        f" --n_layers_encoder {n_layers_encoder}",
                        f" --n_hidden_encoder {n_hidden_encoder}",
                        f" --conv_layer_encoder {conv_layer_encoder}",
                        " --n_epochs 50",
                        " --n_epochs_all_gps 25",
                        " --lr 0.001",
                        f" --lambda_edge_recon {lambda_edge_recon}",
                        f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                        " --lambda_group_lasso 0.",
                        " --lambda_l1_masked 0.",
                        f" --edge_batch_size {edge_batch_size}",
                        f" --n_sampled_neighbors {n_sampled_neighbors}",
                        " --node_batch_size None",
                        f" --timestamp_suffix _{job_id}",
                    ])

                    submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000,
                    )

                    job_id += 1

In [None]:
# tmp
# Temporary cell: hands a single encoder architecture ablation run on the
# Vizgen MERFISH human ovarian cancer dataset to `submit_python_script`, with
# the job numbering starting at 27.
job_id = 27

# Settings that are the same for every run created by this cell.
task = "ablation"
ablation_task = "encoder_architecture"
dataset = "vizgen_merfish_human_ovarian_cancer"
reference_batches = "batch2"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 256
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Enumerate the sweep in nested-loop order (first parameter varies slowest).
# Each list currently holds exactly one value, so one job is created.
param_grid = [
    (n_layers, conv_layer, n_hidden, n_fc_layers, n_nbrs)
    for n_layers in [2]
    for conv_layer in ["gatv2conv"]
    for n_hidden in [None]
    for n_fc_layers in [1]
    for n_nbrs in [12]]

for (n_layers_encoder,
     conv_layer_encoder,
     n_hidden_encoder,
     n_fc_layers_encoder,
     n_neighbors) in param_grid:
    # One entry per command line option; each is prefixed with a single
    # space when the final argument string is assembled below.
    cli_args = [
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        f"--n_neighbors {n_neighbors}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {ablation_task}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        f"--node_label_method {node_label_method}",
        f"--n_fc_layers_encoder {n_fc_layers_encoder}",
        f"--n_layers_encoder {n_layers_encoder}",
        f"--n_hidden_encoder {n_hidden_encoder}",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 50",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        f"--lambda_edge_recon {lambda_edge_recon}",
        f"--lambda_gene_expr_recon {lambda_gene_expr_recon}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        f"--edge_batch_size {edge_batch_size}",
        f"--n_sampled_neighbors {n_sampled_neighbors}",
        "--node_batch_size None",
        f"--timestamp_suffix _{job_id}",
    ]
    script_args = "".join(f" {cli_arg}" for cli_arg in cli_args)

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

    job_id += 1

#### 2.3.4 Metrics Computation

In [None]:
# Metrics computation for the encoder architecture ablation on the Xenium
# human breast cancer dataset: one `compute_metrics.py` job per summary file
# suffix i = 4, 8, ..., 32.

# Settings shared by all jobs of this cell.
task = "ablation"
dataset = "xenium_human_breast_cancer"
ablation_task = "encoder_architecture"
cell_type_keys = "cell_states"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

job_id = 1
for i in np.arange(4, 36, 4):
    # Only the summary file name changes from job to job.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"

    cli_args = [
        f"--task {ablation_task}_{task}",
        f"--file_name {file_name}",
        f"--datasets {dataset}",
        f"--cell_type_keys {cell_type_keys}",
        f"--batch_keys {batch_keys}",
        f"--metrics {metrics}",
    ]
    # Every option is preceded by exactly one space.
    script_args = "".join(f" {cli_arg}" for cli_arg in cli_args)

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=reproducibility_conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args)

    job_id += 1

In [None]:
# Metrics computation for the encoder architecture ablation on the STARmap
# PLUS mouse CNS dataset: one `compute_metrics.py` job per summary file
# suffix i = 4, 8, ..., 32.

# Settings shared by all jobs of this cell.
task = "ablation"
dataset = "starmap_plus_mouse_cns"
ablation_task = "encoder_architecture"
cell_type_keys = "Main_molecular_cell_type"
batch_keys = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_name_prefix = f"nichecompass_{ablation_task}_{task}_{dataset}_metrics_computation"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_metrics.py"

job_id = 1
for i in np.arange(4, 36, 4):
    # Only the summary file name changes from job to job.
    file_name = f"mlflow_summary_{ablation_task}_{task}_{dataset}_{i}.csv"

    cli_args = [
        f"--task {ablation_task}_{task}",
        f"--file_name {file_name}",
        f"--datasets {dataset}",
        f"--cell_type_keys {cell_type_keys}",
        f"--batch_keys {batch_keys}",
        f"--metrics {metrics}",
    ]
    # Every option is preceded by exactly one space.
    script_args = "".join(f" {cli_arg}" for cli_arg in cli_args)

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=reproducibility_conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args)

    job_id += 1

### 2.4 Gene Program & Gene Selection

Ablating:
- Gene Program Selection: Active GP Threshold Ratio, Group Lasso Regularization
- Gene Selection: L1 Regularization
- Spatial Neighborhood Graph: Number of Neighbors

#### 2.4.1 Xenium Human Breast Cancer

In [None]:
# Feature selection ablation on the Xenium human breast cancer dataset: one
# training job per combination of active GP threshold ratio, group lasso
# weight, masked L1 weight and number of spatial neighbors.
job_id = 1

# Settings held fixed across the whole sweep.
task = "ablation"
ablation_task = "feature_selection"
dataset = "xenium_human_breast_cancer"
reference_batches = "batch1"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size = 128
lambda_edge_recon = 500000.
lambda_gene_expr_recon = 300.
n_sampled_neighbors = 4
n_fc_layers_encoder = 1
n_layers_encoder = 1
n_hidden_encoder = "None"
conv_layer_encoder = "gcnconv"

job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Enumerate the sweep in nested-loop order: the threshold ratio varies
# slowest and the number of neighbors varies fastest.
param_grid = [
    (thresh_ratio, group_lasso, l1_masked, n_nbrs)
    for thresh_ratio in [0., 0.03, 0.05, 0.1]
    for group_lasso in [0., 10.]
    for l1_masked in [0., 10.]
    for n_nbrs in [4, 8, 12, 16]]

for (active_gp_thresh_ratio,
     lambda_group_lasso,
     lambda_l1_masked,
     n_neighbors) in param_grid:
    # One entry per command line option; each is prefixed with a single
    # space when the final argument string is assembled below.
    cli_args = [
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        f"--n_neighbors {n_neighbors}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {ablation_task}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        f"--active_gp_thresh_ratio {active_gp_thresh_ratio}",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        f"--node_label_method {node_label_method}",
        f"--n_fc_layers_encoder {n_fc_layers_encoder}",
        f"--n_layers_encoder {n_layers_encoder}",
        f"--n_hidden_encoder {n_hidden_encoder}",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 100",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        f"--lambda_edge_recon {lambda_edge_recon}",
        f"--lambda_gene_expr_recon {lambda_gene_expr_recon}",
        f"--lambda_group_lasso {lambda_group_lasso}",
        f"--lambda_l1_masked {lambda_l1_masked}",
        f"--edge_batch_size {edge_batch_size}",
        "--node_batch_size None",
        f"--n_sampled_neighbors {n_sampled_neighbors}",
        f"--timestamp_suffix _{job_id}",
    ]
    script_args = "".join(f" {cli_arg}" for cli_arg in cli_args)

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

    job_id += 1

#### 2.4.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Feature selection ablation on the STARmap PLUS mouse CNS dataset: one
# training job per combination of active GP threshold ratio, group lasso
# weight, masked L1 weight and number of spatial neighbors.
job_id = 1
for active_gp_thresh_ratio in [0., 0.03, 0.05, 0.1]:
    for lambda_group_lasso in [0., 10.]:
        for lambda_l1_masked in [0., 10.]:
            for n_neighbors in [4, 8, 12, 16]:
                task = "ablation"
                # This sweep varies the gene program / gene selection
                # settings, so it is labelled "feature_selection". The label
                # feeds the job name and `--model_label`; the previous value
                # "encoder_architecture" made these jobs share names (job ids
                # restart at 1) and model label with the encoder architecture
                # ablation of the same dataset.
                ablation_task = "feature_selection"
                dataset = "starmap_plus_mouse_cns"
                reference_batches = "batch1"
                species = "mouse"
                node_label_method = "one-hop-norm"
                edge_batch_size = 256
                lambda_edge_recon = 500000.
                lambda_gene_expr_recon = 300.
                n_sampled_neighbors = 4
                # The encoder architecture is held fixed in this ablation.
                n_fc_layers_encoder = 1
                n_layers_encoder = 1
                n_hidden_encoder = "None"
                conv_layer_encoder = "gcnconv"

                job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                job_folder_path = f"../scripts/{task}/slurm_jobs"
                script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                script_name = "train_nichecompass_reference_model.py"
                script_args = f" --dataset {dataset}" \
                              f" --reference_batches {reference_batches}" \
                              f" --n_neighbors {n_neighbors}" \
                              " --no-filter_genes" \
                              " --nichenet_keep_target_genes_ratio 1.0" \
                              " --nichenet_max_n_target_genes_per_gp 250" \
                              " --include_mebocost_gps" \
                              f" --species {species}" \
                              " --gp_filter_mode subset" \
                              " --combine_overlap_gps" \
                              " --overlap_thresh_source_genes 0.9" \
                              " --overlap_thresh_target_genes 0.9" \
                              " --overlap_thresh_genes 0.9" \
                              " --counts_key counts" \
                              " --spatial_key spatial" \
                              " --adj_key spatial_connectivities" \
                              " --mapping_entity_key mapping_entity" \
                              " --gp_targets_mask_key nichecompass_gp_targets" \
                              " --gp_sources_mask_key nichecompass_gp_sources" \
                              " --gp_names_key nichecompass_gp_names" \
                              f" --model_label {ablation_task}_{task}" \
                              " --active_gp_names_key nichecompass_active_gp_names" \
                              " --latent_key nichecompass_latent" \
                              f" --active_gp_thresh_ratio {active_gp_thresh_ratio}" \
                              " --gene_expr_recon_dist nb" \
                              " --log_variational" \
                              f" --node_label_method {node_label_method}" \
                              f" --n_fc_layers_encoder {n_fc_layers_encoder}" \
                              f" --n_layers_encoder {n_layers_encoder}" \
                              f" --n_hidden_encoder {n_hidden_encoder}" \
                              f" --conv_layer_encoder {conv_layer_encoder}" \
                              " --n_epochs 100" \
                              " --n_epochs_all_gps 25" \
                              " --lr 0.001" \
                              f" --lambda_edge_recon {lambda_edge_recon}" \
                              f" --lambda_gene_expr_recon {lambda_gene_expr_recon}" \
                              f" --lambda_group_lasso {lambda_group_lasso}" \
                              f" --lambda_l1_masked {lambda_l1_masked}" \
                              f" --edge_batch_size {edge_batch_size}" \
                              " --node_batch_size None" \
                              f" --n_sampled_neighbors {n_sampled_neighbors}" \
                              f" --timestamp_suffix _{job_id}"

                submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000)

                job_id += 1

#### 2.4.3 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Feature selection ablation on the Vizgen MERFISH human ovarian cancer
# dataset: one training job per combination of active GP threshold ratio,
# group lasso weight, masked L1 weight and number of spatial neighbors.
job_id = 1
for active_gp_thresh_ratio in [0., 0.03, 0.05, 0.1]:
    for lambda_group_lasso in [0., 10.]:
        for lambda_l1_masked in [0., 10.]:
            for n_neighbors in [4, 8, 12, 16]:
                task = "ablation"
                # This sweep varies the gene program / gene selection
                # settings, so it is labelled "feature_selection". The label
                # feeds the job name and `--model_label`; the previous value
                # "encoder_architecture" made these jobs share names (job ids
                # restart at 1) and model label with the encoder architecture
                # ablation of the same dataset.
                ablation_task = "feature_selection"
                dataset = "vizgen_merfish_human_ovarian_cancer"
                reference_batches = "batch2"
                species = "human"
                node_label_method = "one-hop-norm"
                edge_batch_size = 128
                lambda_edge_recon = 500000.
                lambda_gene_expr_recon = 300.
                n_sampled_neighbors = 4
                # The encoder architecture is held fixed in this ablation.
                n_fc_layers_encoder = 1
                n_layers_encoder = 1
                n_hidden_encoder = "None"
                conv_layer_encoder = "gcnconv"

                job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                job_folder_path = f"../scripts/{task}/slurm_jobs"
                script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                script_name = "train_nichecompass_reference_model.py"
                script_args = f" --dataset {dataset}" \
                              f" --reference_batches {reference_batches}" \
                              f" --n_neighbors {n_neighbors}" \
                              " --no-filter_genes" \
                              " --nichenet_keep_target_genes_ratio 1.0" \
                              " --nichenet_max_n_target_genes_per_gp 250" \
                              " --include_mebocost_gps" \
                              f" --species {species}" \
                              " --gp_filter_mode subset" \
                              " --combine_overlap_gps" \
                              " --overlap_thresh_source_genes 0.9" \
                              " --overlap_thresh_target_genes 0.9" \
                              " --overlap_thresh_genes 0.9" \
                              " --counts_key counts" \
                              " --spatial_key spatial" \
                              " --adj_key spatial_connectivities" \
                              " --mapping_entity_key mapping_entity" \
                              " --gp_targets_mask_key nichecompass_gp_targets" \
                              " --gp_sources_mask_key nichecompass_gp_sources" \
                              " --gp_names_key nichecompass_gp_names" \
                              f" --model_label {ablation_task}_{task}" \
                              " --active_gp_names_key nichecompass_active_gp_names" \
                              " --latent_key nichecompass_latent" \
                              f" --active_gp_thresh_ratio {active_gp_thresh_ratio}" \
                              " --gene_expr_recon_dist nb" \
                              " --log_variational" \
                              f" --node_label_method {node_label_method}" \
                              f" --n_fc_layers_encoder {n_fc_layers_encoder}" \
                              f" --n_layers_encoder {n_layers_encoder}" \
                              f" --n_hidden_encoder {n_hidden_encoder}" \
                              f" --conv_layer_encoder {conv_layer_encoder}" \
                              " --n_epochs 100" \
                              " --n_epochs_all_gps 25" \
                              " --lr 0.001" \
                              f" --lambda_edge_recon {lambda_edge_recon}" \
                              f" --lambda_gene_expr_recon {lambda_gene_expr_recon}" \
                              f" --lambda_group_lasso {lambda_group_lasso}" \
                              f" --lambda_l1_masked {lambda_l1_masked}" \
                              f" --edge_batch_size {edge_batch_size}" \
                              " --node_batch_size None" \
                              f" --n_sampled_neighbors {n_sampled_neighbors}" \
                              f" --timestamp_suffix _{job_id}"

                submit_python_script(
                        job_name_prefix=job_name_prefix,
                        job_id=job_id,
                        job_folder_path=job_folder_path,
                        conda_env_name=conda_env_name,
                        script_folder_path=script_folder_path,
                        script_name=script_name,
                        script_args=script_args,
                        nice=10000)

                job_id += 1

### 2.5 Categorical Covariate Embedding & Contrastive Integration

#### 2.5.1 Xenium Human Breast Cancer

In [None]:
# Categorical covariate embedding & contrastive integration ablation on the
# Xenium human breast cancer dataset: one training job per combination of
# the loop variables below.
#
# NOTE(review): this cell looks unfinished; confirm the points below before
# submitting.
# - The following names are used in `script_args` but never assigned in this
#   cell, so they must already exist in the notebook session (otherwise the
#   cell raises NameError): `cat_covariates_keys`, `cat_covariates_no_edges`,
#   `cat_covariates_embeds_nums`, `n_epochs_no_cat_covariates_contrastive`,
#   `contrastive_logits_neg_ratio`, `lambda_l1_masked`.
# - `contrastive_logits_pos_ratio` iterates over encoder layer names
#   ("gcnconv", "gatv2conv"), which looks like a copy-paste leftover;
#   presumably numeric ratios are intended.
# - `ablation_task` is still "encoder_architecture", so job names and the
#   model label coincide with the encoder architecture ablation.
job_id = 1
for cat_covariates_embeds_injection in ["encoder", "gene_expr_decoder"]:
    # The loops below were not nested under the first one, which made the
    # whole cell fail with an IndentationError.
    for lambda_cat_covariates_contrastive in [0., 1, 2]:
        for contrastive_logits_pos_ratio in ["gcnconv", "gatv2conv"]:
            for n_hidden_encoder in [None]:
                for n_neighbors in [4, 8, 12, 16]:
                    task = "ablation"
                    ablation_task = "encoder_architecture"
                    dataset = "xenium_human_breast_cancer"
                    reference_batches = "batch1"
                    species = "human"
                    node_label_method = "one-hop-norm"
                    edge_batch_size = 512
                    lambda_edge_recon = 500000.
                    lambda_gene_expr_recon = 300.
                    active_gp_thresh_ratio = 0.05
                    lambda_group_lasso = 0.
                    n_layers_encoder = 1
                    n_hidden_encoder = None
                    conv_layer_encoder = "gcnconv"

                    job_name_prefix = f"{dataset}_nichecompass_{ablation_task}_{task}"
                    job_folder_path = f"../scripts/{task}/slurm_jobs"
                    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                    script_name = "train_nichecompass_reference_model.py"
                    script_args = f" --dataset {dataset}" \
                                  f" --reference_batches {reference_batches}" \
                                  f" --n_neighbors {n_neighbors}" \
                                  " --no-filter_genes" \
                                  " --nichenet_keep_target_genes_ratio 1." \
                                  " --nichenet_max_n_target_genes_per_gp 250" \
                                  " --include_mebocost_gps" \
                                  f" --species {species}" \
                                  " --gp_filter_mode subset" \
                                  " --combine_overlap_gps" \
                                  " --overlap_thresh_source_genes 0.9" \
                                  " --overlap_thresh_target_genes 0.9" \
                                  " --overlap_thresh_genes 0.9" \
                                  " --counts_key counts" \
                                  f" --cat_covariates_keys {cat_covariates_keys}" \
                                  f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
                                  " --spatial_key spatial" \
                                  " --adj_key spatial_connectivities" \
                                  " --mapping_entity_key mapping_entity" \
                                  " --gp_targets_mask_key nichecompass_gp_targets" \
                                  " --gp_sources_mask_key nichecompass_gp_sources" \
                                  " --gp_names_key nichecompass_gp_names" \
                                  f" --model_label {ablation_task}_{task}" \
                                  " --active_gp_names_key nichecompass_active_gp_names" \
                                  " --latent_key nichecompass_latent" \
                                  f" --active_gp_thresh_ratio {active_gp_thresh_ratio}" \
                                  " --gene_expr_recon_dist nb" \
                                  f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
                                  f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
                                  " --log_variational" \
                                  f" --node_label_method {node_label_method}" \
                                  f" --n_layers_encoder {n_layers_encoder}" \
                                  f" --n_hidden_encoder {n_hidden_encoder}" \
                                  f" --conv_layer_encoder {conv_layer_encoder}" \
                                  " --n_epochs 100" \
                                  " --n_epochs_all_gps 25" \
                                  f" --n_epochs_no_cat_covariates_contrastive {n_epochs_no_cat_covariates_contrastive}" \
                                  " --lr 0.001" \
                                  f" --lambda_edge_recon {lambda_edge_recon}" \
                                  f" --lambda_gene_expr_recon {lambda_gene_expr_recon}" \
                                  f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
                                  f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
                                  f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
                                  f" --lambda_group_lasso {lambda_group_lasso}" \
                                  f" --lambda_l1_masked {lambda_l1_masked}" \
                                  f" --edge_batch_size {edge_batch_size}" \
                                  " --node_batch_size None" \
                                  f" --timestamp_suffix _{job_id}"

                    submit_python_script(
                            job_name_prefix=job_name_prefix,
                            job_id=job_id,
                            job_folder_path=job_folder_path,
                            conda_env_name=conda_env_name,
                            script_folder_path=script_folder_path,
                            script_name=script_name,
                            script_args=script_args,
                            nice=10000)

                    job_id += 1

## 3. NicheCompass Single Sample Method Benchmarking

### 3.1 seqFISH Mouse Organogenesis

#### 3.1.1 Spatial Transcriptomics Data

In [None]:
# GCN encoder
# Hands one job to `submit_python_script` that runs
# `train_nichecompass_benchmarking_models.py` on the seqFISH mouse
# organogenesis (embryo 2) dataset. The per-run lists (neighbors, batch
# sizes, seeds, run indices) each hold eight entries.
task = "single_sample_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_embryo2"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
conv_layer_encoder = "gcnconv"
edge_batch_size_str = "131072 131072 131072 131072 131072 131072 131072 131072" # full dataset

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with a single space
# when the final argument string is assembled below.
cli_args = [
    "--adata_new_name None",
    "--n_neighbors_list 16 16 12 12 8 8 4 4",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None None None None None None None None",
    "--seeds 7 6 5 4 3 2 1 0",
    "--run_index 8 7 6 5 4 3 2 1",
    f"--cell_type_key {cell_type_key}",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    "--reference_batches None",
    "--counts_key counts",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--mapping_entity_key mapping_entity",
    "--no-filter_genes",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    "--log_variational",
    "--node_label_method one-hop-norm",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
]
script_args = "".join(f" {cli_arg}" for cli_arg in cli_args)

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder
# Hands one job to `submit_python_script` that runs
# `train_nichecompass_benchmarking_models.py` on the seqFISH mouse
# organogenesis (embryo 2) dataset. The per-run lists (neighbors, batch
# sizes, seeds, run indices) each hold eight entries.
task = "single_sample_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_embryo2"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# Original note: "out of memory" (presumably the reason for the small edge
# batch size; TODO confirm).
edge_batch_size_str = "2048 2048 2048 2048 2048 2048 2048 2048" # out of memory

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with a single space
# when the final argument string is assembled below.
cli_args = [
    "--adata_new_name None",
    "--n_neighbors_list 16 16 12 12 8 8 4 4",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None None None None None None None None",
    "--seeds 7 6 5 4 3 2 1 0",
    "--run_index 8 7 6 5 4 3 2 1",
    f"--cell_type_key {cell_type_key}",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    "--reference_batches None",
    "--counts_key counts",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--mapping_entity_key mapping_entity",
    "--no-filter_genes",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    "--log_variational",
    "--node_label_method one-hop-norm",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
]
script_args = "".join(f" {cli_arg}" for cli_arg in cli_args)

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

#### 3.1.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder
# Calls submit_python_script once per subsample percentage of the seqFISH mouse
# organogenesis dataset (embryo2). Every job runs the benchmarking training
# script with 8 runs (run_index 8..1, seeds 7..0, n_neighbors 16/16/12/12/8/8/4/4).

# Settings shared by all subsample jobs (they do not depend on subsample_pct).
task = "single_sample_method_benchmarking"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
conv_layer_encoder = "gcnconv"
# One edge batch size per run; keep this at 8 entries so it lines up with the
# 8-entry n_neighbors / node batch size / seeds / run_index lists below.
# (original note on this value: "full dataset")
edge_batch_size_str = " ".join(["131072"] * 8)

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct_embryo2"
    # The dataset name is part of the job name, so each percentage gets its own
    # job file and log files.
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command-line arguments for the training script; every fragment starts
    # with a space so the pieces can simply be concatenated.
    script_args = (
        " --adata_new_name None"
        # Per-run lists (8 entries each)
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # NicheNet / MEBOCOST / overlap options
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # Dataset and data keys
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # Model options
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # Training options and loss weights
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder
# Calls submit_python_script once per subsample percentage of the seqFISH mouse
# organogenesis dataset (embryo2), using the "gatv2conv" encoder layer. Every
# job runs the benchmarking training script with 8 runs (run_index 8..1).

# Settings shared by all subsample jobs (they do not depend on subsample_pct).
task = "single_sample_method_benchmarking"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# One edge batch size per run; keep this at 8 entries so it lines up with the
# 8-entry n_neighbors / node batch size / seeds / run_index lists below.
# (original note on this value: "out of memory" - presumably larger batches
# ran out of memory with this encoder; TODO confirm)
edge_batch_size_str = " ".join(["2048"] * 8)

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct_embryo2"
    # The dataset name is part of the job name, so each percentage gets its own
    # job file and log files.
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command-line arguments for the training script; every fragment starts
    # with a space so the pieces can simply be concatenated.
    script_args = (
        " --adata_new_name None"
        # Per-run lists (8 entries each)
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # NicheNet / MEBOCOST / overlap options
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # Dataset and data keys
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # Model options
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # Training options and loss weights
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

### 3.2 seqFISH Mouse Organogenesis Imputed

Note: The jobs in this section (imputed seqFISH data) are not used for the manuscript.

#### 3.2.1 Spatial Transcriptomics Data

In [None]:
# GCN encoder
# Single call to submit_python_script for the imputed seqFISH mouse
# organogenesis dataset (embryo2). Unlike the non-imputed cells, gene filtering
# is switched on here (--filter_genes with n_hvg 0 and n_svg taken from n_svg).
task = "single_sample_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed_embryo2"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
n_svg = 5000
# One edge batch size per run (8 runs).
# (original note on this value: "out of memory")
edge_batch_size_str = "8192 8192 8192 8192 8192 8192 8192 8192"

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_gcnconv_{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

# Command-line arguments for the training script; every fragment starts with a
# space so the pieces can simply be concatenated.
script_args = (
    " --adata_new_name None"
    # Per-run lists (8 entries each)
    " --n_neighbors_list 16 16 12 12 8 8 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None None None None None"
    " --seeds 7 6 5 4 3 2 1 0"
    " --run_index 8 7 6 5 4 3 2 1"
    f" --cell_type_key {cell_type_key}"
    # NicheNet / MEBOCOST / overlap options
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # Dataset, data keys and gene filtering
    f" --dataset {dataset}"
    " --reference_batches None"
    " --counts_key counts"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --filter_genes"
    " --n_hvg 0"
    f" --n_svg {n_svg}"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # Model options
    f" --model_label gcnconv_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    # Training options and loss weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

#### 3.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder
# Calls submit_python_script once per subsample percentage of the imputed
# seqFISH mouse organogenesis dataset (embryo2).
# NOTE(review): this cell differs from the sibling cells in several places -
# job name and model label use "one-hop-norm" instead of the encoder name,
# n_addon_gp is 0 and n_epochs is 100. Left unchanged; confirm this is intended.

# Settings shared by all subsample jobs (they do not depend on subsample_pct).
task = "single_sample_method_benchmarking"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
n_svg = 3000
# One edge batch size per run (8 runs).
# (original note on this value: "out of memory")
edge_batch_size_str = "8192 8192 8192 8192 8192 8192 8192 8192"

job_id = 1
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct_embryo2"
    job_name_prefix = f"{dataset}_nichecompass_one-hop-norm_{task}"

    # Command-line arguments for the training script; every fragment starts
    # with a space so the pieces can simply be concatenated.
    script_args = (
        " --adata_new_name None"
        # Per-run lists (8 entries each)
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # NicheNet / MEBOCOST / overlap options
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # Dataset, data keys and gene filtering
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --filter_genes"
        " --n_hvg 0"
        f" --n_svg {n_svg}"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # Model options
        f" --model_label one-hop-norm_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 0"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        " --conv_layer_encoder gcnconv"
        # Training options and loss weights
        " --n_epochs 100"
        " --n_epochs_all_gps 25"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

### 3.3 nanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

#### 3.3.1 Spatial Transcriptomics Data

In [None]:
# GCN encoder
# Single call to submit_python_script for the nanoString CosMx human NSCLC
# dataset (batch5); the job runs the benchmarking training script with 8 runs
# (run_index 8..1, seeds 7..0).
task = "single_sample_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc_batch5"
species = "human"
cell_type_key = "cell_type"
conv_layer_encoder = "gcnconv"
# One edge batch size per run (8 runs).
# (original note on this value: "out of memory")
edge_batch_size_str = "32768 32768 32768 32768 32768 32768 32768 32768"

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

# Command-line arguments for the training script; every fragment starts with a
# space so the pieces can simply be concatenated.
script_args = (
    " --adata_new_name None"
    # Per-run lists (8 entries each)
    " --n_neighbors_list 16 16 12 12 8 8 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None None None None None"
    " --seeds 7 6 5 4 3 2 1 0"
    " --run_index 8 7 6 5 4 3 2 1"
    f" --cell_type_key {cell_type_key}"
    # NicheNet / MEBOCOST / overlap options
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # Dataset and data keys
    f" --dataset {dataset}"
    " --reference_batches None"
    " --counts_key counts"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --no-filter_genes"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # Model options
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # Training options and loss weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 1 due to 2 day limit)
# The 8 runs for this dataset are split over two jobs. This first job (job_id 1)
# covers run_index 8..5 (seeds 7..4, n_neighbors 16/16/12/12); the remaining
# runs are submitted by the job_id 2 cell.
task = "single_sample_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc_batch5"
species = "human"
cell_type_key = "cell_type"
conv_layer_encoder = "gatv2conv"
# One edge batch size per run (4 runs in this job).
# (original note on this value: "out of memory")
edge_batch_size_str = "512 512 512 512"

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

# Command-line arguments for the training script; every fragment starts with a
# space so the pieces can simply be concatenated.
script_args = (
    " --adata_new_name None"
    # Per-run lists (4 entries each)
    " --n_neighbors_list 16 16 12 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None"
    " --seeds 7 6 5 4"
    " --run_index 8 7 6 5"
    f" --cell_type_key {cell_type_key}"
    # NicheNet / MEBOCOST / overlap options
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # Dataset and data keys
    f" --dataset {dataset}"
    " --reference_batches None"
    " --counts_key counts"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --no-filter_genes"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # Model options
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # Training options and loss weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
# Second half of the split GATv2 submission for this dataset: job_id 2 covers
# run_index 4..1 (seeds 3..0, n_neighbors 8/8/4/4). The distinct job_id keeps
# the job name and the timestamp suffix different from the job_id 1 cell.
task = "single_sample_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc_batch5"
species = "human"
cell_type_key = "cell_type"
conv_layer_encoder = "gatv2conv"
# One edge batch size per run (4 runs in this job).
# (original note on this value: "out of memory")
edge_batch_size_str = "512 512 512 512"

job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

# Command-line arguments for the training script; every fragment starts with a
# space so the pieces can simply be concatenated.
script_args = (
    " --adata_new_name None"
    # Per-run lists (4 entries each)
    " --n_neighbors_list 8 8 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None"
    " --seeds 3 2 1 0"
    " --run_index 4 3 2 1"
    f" --cell_type_key {cell_type_key}"
    # NicheNet / MEBOCOST / overlap options
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # Dataset and data keys
    f" --dataset {dataset}"
    " --reference_batches None"
    " --counts_key counts"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --no-filter_genes"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # Model options
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # Training options and loss weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

#### 3.3.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder
# Calls submit_python_script once per subsample percentage of the nanoString
# CosMx human NSCLC dataset (batch5). Every job runs the benchmarking training
# script with 8 runs (run_index 8..1, seeds 7..0).

# Settings shared by all subsample jobs (they do not depend on subsample_pct).
task = "single_sample_method_benchmarking"
species = "human"
cell_type_key = "cell_type"
conv_layer_encoder = "gcnconv"
# One edge batch size per run (8 runs).
# (original note on this value: "out of memory")
edge_batch_size_str = "32768 32768 32768 32768 32768 32768 32768 32768"

job_id = 1
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_batch5"
    # The dataset name is part of the job name, so each percentage gets its own
    # job file and log files.
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command-line arguments for the training script; every fragment starts
    # with a space so the pieces can simply be concatenated.
    script_args = (
        " --adata_new_name None"
        # Per-run lists (8 entries each)
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # NicheNet / MEBOCOST / overlap options
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # Dataset and data keys
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # Model options
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # Training options and loss weights
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# tmp
# Same GCN encoder configuration as the nanoString CosMx subsample loop, but
# restricted to the 5% and 10% subsamples - presumably a temporary re-run of
# those two jobs (TODO confirm, and remove this cell once no longer needed).
# The job names are built the same way with job_id 1, so the job files written
# by submit_python_script have the same names as for the full subsample loop.

# Settings shared by both jobs (they do not depend on subsample_pct).
task = "single_sample_method_benchmarking"
species = "human"
cell_type_key = "cell_type"
conv_layer_encoder = "gcnconv"
# One edge batch size per run (8 runs).
# (original note on this value: "out of memory")
edge_batch_size_str = "32768 32768 32768 32768 32768 32768 32768 32768"

job_id = 1
script_name = "train_nichecompass_benchmarking_models.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"

for subsample_pct in [5, 10]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_batch5"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command-line arguments for the training script; every fragment starts
    # with a space so the pieces can simply be concatenated.
    script_args = (
        " --adata_new_name None"
        # Per-run lists (8 entries each)
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # NicheNet / MEBOCOST / overlap options
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # Dataset and data keys
        f" --dataset {dataset}"
        " --reference_batches None"
        " --counts_key counts"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --mapping_entity_key mapping_entity"
        " --no-filter_genes"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # Model options
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # Training options and loss weights
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder
# Submits one benchmarking job per subsample percentage of the
# nanostring_cosmx_human_nsclc (batch5) dataset.

# Settings that are identical for every subsample
task = "single_sample_method_benchmarking"
conv_layer_encoder = "gatv2conv"
species = "human"
cell_type_key = "cell_type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512 512 512 512 512 512 512 512"

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50, 25, 10, 5, 1):
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_batch5"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command-line arguments for the training script; every fragment carries
    # its own leading space, so the pieces are joined without a separator.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 16 16 12 12 8 8 4 4",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None",
        " --seeds 7 6 5 4 3 2 1 0",
        " --run_index 8 7 6 5 4 3 2 1",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# tmp
# GATv2 encoder
# Same submission as the full subsample sweep, restricted to the 25 pct
# subsample.

# Settings that are identical for every subsample
task = "single_sample_method_benchmarking"
conv_layer_encoder = "gatv2conv"
species = "human"
cell_type_key = "cell_type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512 512 512 512 512 512 512 512"

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (25,):
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_batch5"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command-line arguments for the training script; every fragment carries
    # its own leading space, so the pieces are joined without a separator.
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 16 16 12 12 8 8 4 4",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None",
        " --seeds 7 6 5 4 3 2 1 0",
        " --run_index 8 7 6 5 4 3 2 1",
        f" --cell_type_key {cell_type_key}",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        " --reference_batches None",
        " --counts_key counts",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --mapping_entity_key mapping_entity",
        " --no-filter_genes",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        # --model_label precedes --gp_names_key in this cell; order kept as is
        f" --model_label {conv_layer_encoder}_{task}",
        " --gp_names_key nichecompass_gp_names",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 3.4 Vizgen MERFISH Mouse Liver

#### 3.4.1 Spatial Transcriptomics Data

In [None]:
# GCN encoder (run 1 due to 2 day limit)
# This job covers run indices 8, 7, 6, 5 and 4 (seeds 7 to 3).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gcnconv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "4096 4096 4096 4096 4096"

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 16 16 12 12 8",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None",
    " --seeds 7 6 5 4 3",
    " --run_index 8 7 6 5 4",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 2 due to 2 day limit)
# This job covers run indices 3, 2 and 1 (seeds 2 to 0).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gcnconv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "4096 4096 4096"

job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 8 4 4",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None",
    " --seeds 2 1 0",
    " --run_index 3 2 1",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 1 due to 2 day limit)
# This job covers run index 8 (seed 7).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512"

job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 16",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 7",
    " --run_index 8",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    # 50 epochs due to 2 day time limit
    " --n_epochs 50",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
# This job covers run index 7 (seed 6).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512"

# The job id must be unique among jobs that share this job_name_prefix:
# submit_python_script derives the Slurm job name, the job .cmd file and the
# out/err log file paths from f"{job_name_prefix}_{job_id}". The GATv2 run 1
# cell for this dataset already uses job_id 1, so reusing it here would make
# both jobs share one job file and one pair of log files. 5 is not used by any
# of the neighbouring GATv2 cells for this dataset.
# NOTE(review): job_id also feeds --timestamp_suffix below; confirm nothing
# downstream expects the previous "_1" suffix for this run.
job_id = 5
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 16",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 6",
    " --run_index 7",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    # 50 epochs due to 2 day time limit
    " --n_epochs 50",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 3 due to 2 day limit)
# This job covers run index 6 (seed 5).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512"

job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 12",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 5",
    " --run_index 6",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    # 50 epochs due to 2 day time limit
    " --n_epochs 50",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 4 due to 2 day limit)
# This job covers run index 5 (seed 4).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512"

# The job id must be unique among jobs that share this job_name_prefix:
# submit_python_script derives the Slurm job name, the job .cmd file and the
# out/err log file paths from f"{job_name_prefix}_{job_id}". The GATv2 run 3
# cell for this dataset already uses job_id 2, so reusing it here would make
# both jobs share one job file and one pair of log files. 6 is not used by any
# of the neighbouring GATv2 cells for this dataset.
# NOTE(review): job_id also feeds --timestamp_suffix below; confirm nothing
# downstream expects the previous "_2" suffix for this run.
job_id = 6
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 12",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 4",
    " --run_index 5",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    # 50 epochs due to 2 day time limit
    " --n_epochs 50",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 5 due to 2 day limit)
# This job covers run indices 4 and 3 (seeds 3 and 2).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512 512"

job_id = 3
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 8 8",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None",
    " --seeds 3 2",
    " --run_index 4 3",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 6 due to 2 day limit)
# This job covers run indices 2 and 1 (seeds 1 and 0).
task = "single_sample_method_benchmarking"
dataset = "vizgen_merfish_mouse_liver"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "Cell_Type"
# out of memory (author's note; presumably the reason for this batch size)
edge_batch_size_str = "512 512"

job_id = 4
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line arguments for the training script; every fragment carries its
# own leading space, so the pieces are joined without a separator.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None",
    " --seeds 1 0",
    " --run_index 2 1",
    f" --cell_type_key {cell_type_key}",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    " --reference_batches None",
    " --counts_key counts",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --no-filter_genes",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 3.4.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder
# One benchmarking job per subsampled Vizgen MERFISH mouse liver dataset.
# Only the dataset name (and therefore the job name and the script
# arguments) depends on the subsample, so all other settings are defined
# once before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
conv_layer_encoder = "gcnconv"
# Eight entries, matching the other per-run lists below.
# Original author note: "out of memory" (presumably larger batches did not
# fit into memory -- TODO confirm).
edge_batch_size_str = "4096 4096 4096 4096 4096 4096 4096 4096"

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50, 25, 10, 5, 1]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command line arguments of the training script. They are joined into
    # one string in which every argument is preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 1 due to 2 day limit)
# Single training run (n_neighbors 16, seed 7, run index 8) on the 50pct
# subsample, submitted as its own job.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# Original author note: "out of memory" (presumably larger batches did not
# fit into memory -- TODO confirm).
edge_batch_size_str = "512"

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command line arguments of the training script. They are joined into
    # one string in which every argument is preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 7",
        "--run_index 8",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
# Single training run (n_neighbors 16, seed 6, run index 7) on the 50pct
# subsample, submitted as its own job.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# Original author note: "out of memory" (presumably larger batches did not
# fit into memory -- TODO confirm).
edge_batch_size_str = "512"

# `submit_python_script` derives the job name and the job / stdout / stderr
# file paths from `{job_name_prefix}_{job_id}`. All 50pct GATv2 runs share
# the same prefix, so this run needs a job_id of its own (previously 1, the
# same as run 1); otherwise the jobs write to the same files and get the
# same timestamp suffix.
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command line arguments of the training script. They are joined into
    # one string in which every argument is preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 6",
        "--run_index 7",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 3 due to 2 day limit)
for subsample_pct in [50]:
    task = "single_sample_method_benchmarking"
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    cell_type_key = "Cell_Type"
    species = "mouse"
    edge_batch_size_str = "512" # out of memory
    conv_layer_encoder = "gatv2conv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"
    script_args =  " --adata_new_name None" \
                   " --n_neighbors_list 12" \
                   f" --edge_batch_size_list {edge_batch_size_str}" \
                   " --node_batch_size_list None" \
                   " --seeds 5" \
                   " --run_index 6" \
                   f" --cell_type_key {cell_type_key}" \
                   " --nichenet_keep_target_genes_ratio 1.0" \
                   " --nichenet_max_n_target_genes_per_gp 250" \
                   " --include_mebocost_gps" \
                   f" --species {species}" \
                   " --gp_filter_mode subset" \
                   " --combine_overlap_gps" \
                   " --overlap_thresh_source_genes 0.9" \
                   " --overlap_thresh_target_genes 0.9" \
                   " --overlap_thresh_genes 0.9" \
                   f" --dataset {dataset}" \
                   " --reference_batches None" \
                   " --counts_key counts" \
                   " --spatial_key spatial" \
                   " --adj_key spatial_connectivities" \
                   " --mapping_entity_key mapping_entity" \
                   " --no-filter_genes" \
                   " --gp_targets_mask_key nichecompass_gp_targets" \
                   " --gp_sources_mask_key nichecompass_gp_sources" \
                   " --gp_names_key nichecompass_gp_names" \
                   f" --model_label {conv_layer_encoder}_{task}" \
                   " --active_gp_names_key nichecompass_active_gp_names" \
                   " --latent_key nichecompass_latent" \
                   " --n_addon_gp 10" \
                   " --active_gp_thresh_ratio 0." \
                   " --gene_expr_recon_dist nb" \
                   " --log_variational" \
                   f" --node_label_method one-hop-norm" \
                   " --n_layers_encoder 1" \
                   " --n_hidden_encoder None" \
                   f" --conv_layer_encoder {conv_layer_encoder}" \
                   " --n_epochs 400" \
                   " --n_epochs_all_gps 25" \
                   " --lr 0.001" \
                   " --lambda_edge_recon 5000000." \
                   " --lambda_gene_expr_recon 3000." \
                   " --lambda_group_lasso 0." \
                   " --lambda_l1_masked 0." \
                   " --lambda_l1_addon 0." \
                   f" --n_sampled_neighbors 4" \
                   f" --timestamp_suffix _{job_id}"

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

In [None]:
# GATv2 encoder (run 4 due to 2 day limit)
for subsample_pct in [50]:
    task = "single_sample_method_benchmarking"
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    cell_type_key = "Cell_Type"
    species = "mouse"
    edge_batch_size_str = "512" # out of memory
    conv_layer_encoder = "gatv2conv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"
    script_args =  " --adata_new_name None" \
                   " --n_neighbors_list 12" \
                   f" --edge_batch_size_list {edge_batch_size_str}" \
                   " --node_batch_size_list None" \
                   " --seeds 4" \
                   " --run_index 5" \
                   f" --cell_type_key {cell_type_key}" \
                   " --nichenet_keep_target_genes_ratio 1.0" \
                   " --nichenet_max_n_target_genes_per_gp 250" \
                   " --include_mebocost_gps" \
                   f" --species {species}" \
                   " --gp_filter_mode subset" \
                   " --combine_overlap_gps" \
                   " --overlap_thresh_source_genes 0.9" \
                   " --overlap_thresh_target_genes 0.9" \
                   " --overlap_thresh_genes 0.9" \
                   f" --dataset {dataset}" \
                   " --reference_batches None" \
                   " --counts_key counts" \
                   " --spatial_key spatial" \
                   " --adj_key spatial_connectivities" \
                   " --mapping_entity_key mapping_entity" \
                   " --no-filter_genes" \
                   " --gp_targets_mask_key nichecompass_gp_targets" \
                   " --gp_sources_mask_key nichecompass_gp_sources" \
                   " --gp_names_key nichecompass_gp_names" \
                   f" --model_label {conv_layer_encoder}_{task}" \
                   " --active_gp_names_key nichecompass_active_gp_names" \
                   " --latent_key nichecompass_latent" \
                   " --n_addon_gp 10" \
                   " --active_gp_thresh_ratio 0." \
                   " --gene_expr_recon_dist nb" \
                   " --log_variational" \
                   f" --node_label_method one-hop-norm" \
                   " --n_layers_encoder 1" \
                   " --n_hidden_encoder None" \
                   f" --conv_layer_encoder {conv_layer_encoder}" \
                   " --n_epochs 400" \
                   " --n_epochs_all_gps 25" \
                   " --lr 0.001" \
                   " --lambda_edge_recon 5000000." \
                   " --lambda_gene_expr_recon 3000." \
                   " --lambda_group_lasso 0." \
                   " --lambda_l1_masked 0." \
                   " --lambda_l1_addon 0." \
                   f" --n_sampled_neighbors 4" \
                   f" --timestamp_suffix _{job_id}"

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

In [None]:
# GATv2 encoder (run 5 due to 2 day limit)
for subsample_pct in [50]:
    task = "single_sample_method_benchmarking"
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    cell_type_key = "Cell_Type"
    species = "mouse"
    edge_batch_size_str = "512 512 512 512" # out of memory
    conv_layer_encoder = "gatv2conv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 3
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"
    script_args =  " --adata_new_name None" \
                   " --n_neighbors_list 8 8 4 4" \
                   f" --edge_batch_size_list {edge_batch_size_str}" \
                   " --node_batch_size_list None None None None" \
                   " --seeds 3 2 1 0" \
                   " --run_index 4 3 2 1" \
                   f" --cell_type_key {cell_type_key}" \
                   " --nichenet_keep_target_genes_ratio 1.0" \
                   " --nichenet_max_n_target_genes_per_gp 250" \
                   " --include_mebocost_gps" \
                   f" --species {species}" \
                   " --gp_filter_mode subset" \
                   " --combine_overlap_gps" \
                   " --overlap_thresh_source_genes 0.9" \
                   " --overlap_thresh_target_genes 0.9" \
                   " --overlap_thresh_genes 0.9" \
                   f" --dataset {dataset}" \
                   " --reference_batches None" \
                   " --counts_key counts" \
                   " --spatial_key spatial" \
                   " --adj_key spatial_connectivities" \
                   " --mapping_entity_key mapping_entity" \
                   " --no-filter_genes" \
                   " --gp_targets_mask_key nichecompass_gp_targets" \
                   " --gp_sources_mask_key nichecompass_gp_sources" \
                   " --gp_names_key nichecompass_gp_names" \
                   f" --model_label {conv_layer_encoder}_{task}" \
                   " --active_gp_names_key nichecompass_active_gp_names" \
                   " --latent_key nichecompass_latent" \
                   " --n_addon_gp 10" \
                   " --active_gp_thresh_ratio 0." \
                   " --gene_expr_recon_dist nb" \
                   " --log_variational" \
                   f" --node_label_method one-hop-norm" \
                   " --n_layers_encoder 1" \
                   " --n_hidden_encoder None" \
                   f" --conv_layer_encoder {conv_layer_encoder}" \
                   " --n_epochs 400" \
                   " --n_epochs_all_gps 25" \
                   " --lr 0.001" \
                   " --lambda_edge_recon 5000000." \
                   " --lambda_gene_expr_recon 3000." \
                   " --lambda_group_lasso 0." \
                   " --lambda_l1_masked 0." \
                   " --lambda_l1_addon 0." \
                   f" --n_sampled_neighbors 4" \
                   f" --timestamp_suffix _{job_id}"

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

In [None]:
# GATv2 encoder (run 1 due to 2 day limit)
# First six training runs (n_neighbors 16 16 12 12 8 8, seeds 7..2, run
# indices 8..3) on the 25pct subsample, submitted together as one job.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# Six entries, matching the other per-run lists below.
# Original author note: "out of memory" (presumably larger batches did not
# fit into memory -- TODO confirm).
edge_batch_size_str = "512 512 512 512 512 512"

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [25]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command line arguments of the training script. They are joined into
    # one string in which every argument is preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None",
        "--seeds 7 6 5 4 3 2",
        "--run_index 8 7 6 5 4 3",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
# Remaining two training runs (n_neighbors 4 4, seeds 1 0, run indices 2 1)
# on the 25pct subsample, submitted together as one job.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# Two entries, matching the other per-run lists below.
# Original author note: "out of memory" (presumably larger batches did not
# fit into memory -- TODO confirm).
edge_batch_size_str = "512 512"

# `submit_python_script` derives the job name and the job / stdout / stderr
# file paths from `{job_name_prefix}_{job_id}`. Both 25pct GATv2 runs share
# the same prefix, so this run needs a job_id of its own (previously 1, the
# same as run 1); otherwise the jobs write to the same files and get the
# same timestamp suffix.
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [25]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command line arguments of the training script. They are joined into
    # one string in which every argument is preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None",
        "--seeds 1 0",
        "--run_index 2 1",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder
# One benchmarking job per small subsample (10pct, 5pct, 1pct) of the Vizgen
# MERFISH mouse liver dataset. Only the dataset name (and therefore the job
# name and the script arguments) depends on the subsample, so all other
# settings are defined once before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "Cell_Type"
species = "mouse"
conv_layer_encoder = "gatv2conv"
# Eight entries, matching the other per-run lists below.
# Original author note: "out of memory" (presumably larger batches did not
# fit into memory -- TODO confirm).
edge_batch_size_str = "512 512 512 512 512 512 512 512"

job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [10, 5, 1]:
    dataset = f"vizgen_merfish_mouse_liver_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Command line arguments of the training script. They are joined into
    # one string in which every argument is preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 3.5 Slide-seqV2 Mouse Hippocampus

#### 3.5.1 Spatial Transcriptomics Data

In [None]:
# GCN encoder: NicheCompass benchmarking run on the full Slide-seqV2 mouse
# hippocampus dataset.
task = "single_sample_method_benchmarking"
dataset = "slideseqv2_mouse_hippocampus"
cell_type_key = "cell_type"
species = "mouse"
# Eight identical entries, the same length as the other list-valued flags
# below.
# NOTE(review): the original note on this value only said "out of memory";
# confirm whether this batch size causes or avoids the OOM.
edge_batch_size_str = " ".join(["16384"] * 8)
conv_layer_encoder = "gcnconv"

job_id = 1
job_name_prefix = "_".join([dataset, "nichecompass", conv_layer_encoder, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line flags handed to the training script. Each flag ends up
# preceded by a single space, so the string starts with " --adata_new_name".
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 16 16 12 12 8 8 4 4",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None None None None None None None None",
    "--seeds 7 6 5 4 3 2 1 0",
    "--run_index 8 7 6 5 4 3 2 1",
    f"--cell_type_key {cell_type_key}",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    "--reference_batches None",
    "--counts_key counts",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--mapping_entity_key mapping_entity",
    "--no-filter_genes",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    "--log_variational",
    "--node_label_method one-hop-norm",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

# Hand the assembled job description to the shared submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder: NicheCompass benchmarking run on the full Slide-seqV2 mouse
# hippocampus dataset.
task = "single_sample_method_benchmarking"
dataset = "slideseqv2_mouse_hippocampus"
cell_type_key = "cell_type"
species = "mouse"
# Eight identical entries, the same length as the other list-valued flags
# below.
# NOTE(review): the original note on this value only said "out of memory";
# confirm whether this batch size causes or avoids the OOM.
edge_batch_size_str = " ".join(["256"] * 8)
conv_layer_encoder = "gatv2conv"

job_id = 1
job_name_prefix = "_".join([dataset, "nichecompass", conv_layer_encoder, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line flags handed to the training script. Each flag ends up
# preceded by a single space, so the string starts with " --adata_new_name".
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 16 16 12 12 8 8 4 4",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None None None None None None None None",
    "--seeds 7 6 5 4 3 2 1 0",
    "--run_index 8 7 6 5 4 3 2 1",
    f"--cell_type_key {cell_type_key}",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    "--reference_batches None",
    "--counts_key counts",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--mapping_entity_key mapping_entity",
    "--no-filter_genes",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    "--log_variational",
    "--node_label_method one-hop-norm",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

# Hand the assembled job description to the shared submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 3.5.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder: one benchmarking job per subsampled Slide-seqV2 mouse
# hippocampus dataset.
# Settings that do not depend on the subsample are assigned once up front.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
species = "mouse"
# Eight identical entries, the same length as the other list-valued flags
# below.
# NOTE(review): the original note on this value only said "out of memory";
# confirm whether this batch size causes or avoids the OOM.
edge_batch_size_str = " ".join(["16384"] * 8)
conv_layer_encoder = "gcnconv"

job_id = 1
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50, 25, 10, 5, 1):
    dataset = f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct"
    job_name_prefix = "_".join(
        [dataset, "nichecompass", conv_layer_encoder, task])

    # Command-line flags handed to the training script. Each flag ends up
    # preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # One submission per subsampled dataset.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder: one benchmarking job per subsampled Slide-seqV2 mouse
# hippocampus dataset.
# Settings that do not depend on the subsample are assigned once up front.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
species = "mouse"
# Eight identical entries, the same length as the other list-valued flags
# below.
# NOTE(review): the original note on this value only said "out of memory";
# confirm whether this batch size causes or avoids the OOM.
edge_batch_size_str = " ".join(["256"] * 8)
conv_layer_encoder = "gatv2conv"

job_id = 1
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50, 25, 10, 5, 1):
    dataset = f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct"
    job_name_prefix = "_".join(
        [dataset, "nichecompass", conv_layer_encoder, task])

    # Command-line flags handed to the training script. Each flag ends up
    # preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # One submission per subsampled dataset.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# tmp: GATv2 encoder job for the 50 % Slide-seqV2 subsample only.
# Settings that do not depend on the subsample are assigned once up front.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
species = "mouse"
# Eight identical entries, the same length as the other list-valued flags
# below.
# NOTE(review): the original note on this value only said "out of memory";
# confirm whether this batch size causes or avoids the OOM.
edge_batch_size_str = " ".join(["256"] * 8)
conv_layer_encoder = "gatv2conv"

job_id = 1
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50,):
    dataset = f"slideseqv2_mouse_hippocampus_subsample_{subsample_pct}pct"
    job_name_prefix = "_".join(
        [dataset, "nichecompass", conv_layer_encoder, task])

    # Command-line flags handed to the training script. Each flag ends up
    # preceded by a single space.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        "--reference_batches None",
        "--counts_key counts",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--mapping_entity_key mapping_entity",
        "--no-filter_genes",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 3.6 DeepLinc Models

### 3.7 Metrics Computation

#### 3.7.1 NicheCompass

In [None]:
# GCN encoder: metrics computation jobs for the seqFISH mouse organogenesis
# dataset (embryo 2) and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gcnconv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["seqfish_mouse_organogenesis_embryo2",
                "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_1pct_embryo2"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The encoder is part of the job name because `submit_python_script`
    # names the job script and the stdout/stderr logs after the job name.
    # Without it, the GCN and GATv2 metrics jobs for the same dataset would
    # write to the same files.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GATv2 encoder: metrics computation jobs for the seqFISH mouse organogenesis
# dataset (embryo 2) and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gatv2conv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["seqfish_mouse_organogenesis_embryo2",
                "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_1pct_embryo2"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The encoder is part of the job name because `submit_python_script`
    # names the job script and the stdout/stderr logs after the job name.
    # Without it, the GCN and GATv2 metrics jobs for the same dataset would
    # write to the same files.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GCN encoder: metrics computation jobs for the NanoString CosMx human NSCLC
# dataset (batch 5) and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gcnconv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["nanostring_cosmx_human_nsclc_batch5",
                "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_1pct_batch5"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The encoder is part of the job name because `submit_python_script`
    # names the job script and the stdout/stderr logs after the job name.
    # Without it, the GCN and GATv2 metrics jobs for the same dataset would
    # write to the same files.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GATv2 encoder: metrics computation jobs for the NanoString CosMx human NSCLC
# dataset (batch 5) and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gatv2conv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["nanostring_cosmx_human_nsclc_batch5",
                "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
                "nanostring_cosmx_human_nsclc_subsample_1pct_batch5"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The encoder is part of the job name because `submit_python_script`
    # names the job script and the stdout/stderr logs after the job name.
    # Without it, the GCN and GATv2 metrics jobs for the same dataset would
    # write to the same files.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GCN encoder: metrics computation jobs for the Vizgen MERFISH mouse liver
# dataset and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gcnconv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["vizgen_merfish_mouse_liver",
                "vizgen_merfish_mouse_liver_subsample_50pct",
                "vizgen_merfish_mouse_liver_subsample_25pct",
                "vizgen_merfish_mouse_liver_subsample_10pct",
                "vizgen_merfish_mouse_liver_subsample_5pct",
                "vizgen_merfish_mouse_liver_subsample_1pct"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The encoder is part of the job name because `submit_python_script`
    # names the job script and the stdout/stderr logs after the job name.
    # Without it, the GCN and GATv2 metrics jobs for the same dataset would
    # write to the same files.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GATv2 encoder: metrics computation jobs for the Vizgen MERFISH mouse liver
# dataset and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gatv2conv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["vizgen_merfish_mouse_liver",
                "vizgen_merfish_mouse_liver_subsample_50pct",
                "vizgen_merfish_mouse_liver_subsample_25pct",
                "vizgen_merfish_mouse_liver_subsample_10pct",
                "vizgen_merfish_mouse_liver_subsample_5pct",
                "vizgen_merfish_mouse_liver_subsample_1pct"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The encoder is part of the job name because `submit_python_script`
    # names the job script and the stdout/stderr logs after the job name.
    # Without it, the GCN and GATv2 metrics jobs for the same dataset would
    # write to the same files.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GCN encoder: metrics computation jobs for the Slide-seqV2 mouse hippocampus
# dataset and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ("slideseqv2_mouse_hippocampus",
                "slideseqv2_mouse_hippocampus_subsample_50pct",
                "slideseqv2_mouse_hippocampus_subsample_25pct",
                "slideseqv2_mouse_hippocampus_subsample_10pct",
                "slideseqv2_mouse_hippocampus_subsample_5pct",
                "slideseqv2_mouse_hippocampus_subsample_1pct"):
    file_name = dataset + "_nichecompass_gcnconv.h5ad"
    job_name_prefix = (
        "nichecompass_gcnconv_single_sample_method_benchmarking_"
        + dataset
        + "_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GATv2 encoder: metrics computation jobs for the Slide-seqV2 mouse
# hippocampus dataset and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
conv_layer_encoder = "gatv2conv"

job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ["slideseqv2_mouse_hippocampus",
                "slideseqv2_mouse_hippocampus_subsample_50pct",
                "slideseqv2_mouse_hippocampus_subsample_25pct",
                "slideseqv2_mouse_hippocampus_subsample_10pct",
                "slideseqv2_mouse_hippocampus_subsample_5pct",
                "slideseqv2_mouse_hippocampus_subsample_1pct"]:
    file_name = f"{dataset}_nichecompass_{conv_layer_encoder}.h5ad"

    # The job name must carry the encoder actually evaluated here (GATv2).
    # `submit_python_script` names the job script and the stdout/stderr logs
    # after the job name, so reusing the "gcnconv" prefix would make these
    # jobs overwrite the files of the GCN metrics jobs for the same dataset.
    job_name_prefix = (
        f"nichecompass_{conv_layer_encoder}_single_sample_method_benchmarking"
        f"_{dataset}_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

#### 3.7.2 GraphST

In [None]:
# GraphST: metrics computation jobs for the seqFISH mouse organogenesis
# dataset (embryo 2) and its subsamples.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ("seqfish_mouse_organogenesis_embryo2",
                "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
                "seqfish_mouse_organogenesis_subsample_1pct_embryo2"):
    file_name = dataset + "_graphst.h5ad"
    job_name_prefix = (
        "graphst_single_sample_method_benchmarking_"
        + dataset
        + "_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        "--latent_key graphst_latent",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# GraphST: metrics computation jobs for the 10 %, 5 % and 1 % subsamples of
# the Vizgen MERFISH mouse liver dataset.
job_id = 1
# Settings shared by every dataset are assigned once, before the loop.
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
# Passed on the command line as the literal string "None".
batch_key = "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"

job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    f"nichecompass-reproducibility/scripts/{task}")
script_name = "compute_benchmarking_metrics.py"

for dataset in ("vizgen_merfish_mouse_liver_subsample_10pct",
                "vizgen_merfish_mouse_liver_subsample_5pct",
                "vizgen_merfish_mouse_liver_subsample_1pct"):
    file_name = dataset + "_graphst.h5ad"
    job_name_prefix = (
        "graphst_single_sample_method_benchmarking_"
        + dataset
        + "_metrics_computation")

    # Flags for the metrics script; each one is preceded by a single space.
    script_args = " " + " ".join([
        f"--dataset {dataset}",
        f"--task {task}",
        f"--file_name {file_name}",
        f"--cell_type_key {cell_type_key}",
        f"--batch_key {batch_key}",
        "--latent_key graphst_latent",
        f"--metrics {metrics}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the GraphST
# result files (`<dataset>_graphst.h5ad`, latent key `graphst_latent`).
# Everything that does not depend on the dataset is assigned once, up front.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_1pct_batch5",
):
    file_name = f"{dataset}_graphst.h5ad"
    # The dataset is part of the prefix, so job names stay distinct even
    # though job_id is identical for every submission of this cell.
    job_name_prefix = f"graphst_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the GraphST
# result files (`<dataset>_graphst.h5ad`, latent key `graphst_latent`).
# Everything that does not depend on the dataset is assigned once, up front.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "slideseqv2_mouse_hippocampus",
        "slideseqv2_mouse_hippocampus_subsample_50pct",
        "slideseqv2_mouse_hippocampus_subsample_25pct",
        "slideseqv2_mouse_hippocampus_subsample_10pct",
        "slideseqv2_mouse_hippocampus_subsample_5pct",
):
    file_name = f"{dataset}_graphst.h5ad"
    # The dataset is part of the prefix, so job names stay distinct even
    # though job_id is identical for every submission of this cell.
    job_name_prefix = f"graphst_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

#### 3.6.3 DeepLinc

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the DeepLinc
# result files (`<dataset>_deeplinc.h5ad`, latent key `deeplinc_latent`).
# Dataset-independent settings are defined before the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "seqfish_mouse_organogenesis_embryo2",
        "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_1pct_embryo2",
):
    file_name = f"{dataset}_deeplinc.h5ad"
    # job_id never changes here; the dataset inside the prefix is what keeps
    # the resulting job names apart.
    job_name_prefix = f"deeplinc_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key deeplinc_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the DeepLinc
# result files (`<dataset>_deeplinc.h5ad`, latent key `deeplinc_latent`).
# Dataset-independent settings are defined before the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_1pct_batch5",
):
    file_name = f"{dataset}_deeplinc.h5ad"
    # job_id never changes here; the dataset inside the prefix is what keeps
    # the resulting job names apart.
    job_name_prefix = f"deeplinc_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key deeplinc_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the DeepLinc
# result files (`<dataset>_deeplinc.h5ad`, latent key `deeplinc_latent`).
# Dataset-independent settings are defined before the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "slideseqv2_mouse_hippocampus",
        "slideseqv2_mouse_hippocampus_subsample_50pct",
        "slideseqv2_mouse_hippocampus_subsample_25pct",
        "slideseqv2_mouse_hippocampus_subsample_10pct",
        "slideseqv2_mouse_hippocampus_subsample_5pct",
        "slideseqv2_mouse_hippocampus_subsample_1pct",
):
    file_name = f"{dataset}_deeplinc.h5ad"
    # job_id never changes here; the dataset inside the prefix is what keeps
    # the resulting job names apart.
    job_name_prefix = f"deeplinc_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key deeplinc_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

#### 3.6.4 SageNet

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the SageNet
# result files (`<dataset>_sagenet.h5ad`, latent key `sagenet_latent`).
# Constants shared by all submissions come first; only the per-dataset
# values are computed inside the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "seqfish_mouse_organogenesis_embryo2",
        "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_1pct_embryo2",
):
    file_name = f"{dataset}_sagenet.h5ad"
    # Job names differ through the dataset in the prefix, not through job_id.
    job_name_prefix = f"sagenet_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key sagenet_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the SageNet
# result files (`<dataset>_sagenet.h5ad`, latent key `sagenet_latent`).
# Constants shared by all submissions come first; only the per-dataset
# values are computed inside the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_1pct_batch5",
):
    file_name = f"{dataset}_sagenet.h5ad"
    # Job names differ through the dataset in the prefix, not through job_id.
    job_name_prefix = f"sagenet_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key sagenet_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the SageNet
# result files (`<dataset>_sagenet.h5ad`, latent key `sagenet_latent`).
# Constants shared by all submissions come first; only the per-dataset
# values are computed inside the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "vizgen_merfish_mouse_liver_subsample_10pct",
        "vizgen_merfish_mouse_liver_subsample_5pct",
        "vizgen_merfish_mouse_liver_subsample_1pct",
):
    file_name = f"{dataset}_sagenet.h5ad"
    # Job names differ through the dataset in the prefix, not through job_id.
    job_name_prefix = f"sagenet_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key sagenet_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit the compute_benchmarking_metrics.py job for the SageNet result file
# (`<dataset>_sagenet.h5ad`, latent key `sagenet_latent`) of the single
# dataset listed below. The loop form is kept so further datasets can be
# appended to the tuple.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in ("slideseqv2_mouse_hippocampus",):
    file_name = f"{dataset}_sagenet.h5ad"
    job_name_prefix = f"sagenet_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key sagenet_latent"
        f" --metrics {metrics}"
    )

    # nice=1000 overrides submit_python_script's default of 10000; the value
    # ends up in the job file's `#SBATCH --nice` line.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=1000,
    )

#### 3.6.5 scVI

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the scVI
# result files (`<dataset>_scvi.h5ad`, latent key `scvi_latent`).
# Values shared by every submission are set once ahead of the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "seqfish_mouse_organogenesis_embryo2",
        "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_1pct_embryo2",
):
    file_name = f"{dataset}_scvi.h5ad"
    # With a fixed job_id, the dataset in the prefix makes each job name unique.
    job_name_prefix = f"scvi_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the scVI
# result files (`<dataset>_scvi.h5ad`, latent key `scvi_latent`).
# Values shared by every submission are set once ahead of the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "nanostring_cosmx_human_nsclc_batch5",
        "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_1pct_batch5",
):
    file_name = f"{dataset}_scvi.h5ad"
    # With a fixed job_id, the dataset in the prefix makes each job name unique.
    job_name_prefix = f"scvi_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the scVI
# result files (`<dataset>_scvi.h5ad`, latent key `scvi_latent`).
# Values shared by every submission are set once ahead of the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "vizgen_merfish_mouse_liver",
        "vizgen_merfish_mouse_liver_subsample_50pct",
        "vizgen_merfish_mouse_liver_subsample_25pct",
        "vizgen_merfish_mouse_liver_subsample_10pct",
        "vizgen_merfish_mouse_liver_subsample_5pct",
        "vizgen_merfish_mouse_liver_subsample_1pct",
):
    file_name = f"{dataset}_scvi.h5ad"
    # With a fixed job_id, the dataset in the prefix makes each job name unique.
    job_name_prefix = f"scvi_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the scVI
# result files (`<dataset>_scvi.h5ad`, latent key `scvi_latent`).
# Values shared by every submission are set once ahead of the loop.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "slideseqv2_mouse_hippocampus",
        "slideseqv2_mouse_hippocampus_subsample_50pct",
        "slideseqv2_mouse_hippocampus_subsample_25pct",
        "slideseqv2_mouse_hippocampus_subsample_10pct",
        "slideseqv2_mouse_hippocampus_subsample_5pct",
        "slideseqv2_mouse_hippocampus_subsample_1pct",
):
    file_name = f"{dataset}_scvi.h5ad"
    # With a fixed job_id, the dataset in the prefix makes each job name unique.
    job_name_prefix = f"scvi_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

#### 3.6.6 expiMap

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the expiMap
# result files (`<dataset>_expimap.h5ad`, latent key `expimap_latent`).
# Per-cell constants are defined first; the loop only derives what depends
# on the dataset.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "seqfish_mouse_organogenesis_embryo2",
        "seqfish_mouse_organogenesis_subsample_50pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_25pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_10pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_5pct_embryo2",
        "seqfish_mouse_organogenesis_subsample_1pct_embryo2",
):
    file_name = f"{dataset}_expimap.h5ad"
    # job_id is shared by all submissions; the prefix carries the dataset
    # and thereby distinguishes the job names.
    job_name_prefix = f"expimap_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key expimap_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the expiMap
# result files (`<dataset>_expimap.h5ad`, latent key `expimap_latent`).
# Per-cell constants are defined first; the loop only derives what depends
# on the dataset.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "nanostring_cosmx_human_nsclc_batch5",
        "nanostring_cosmx_human_nsclc_subsample_50pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_25pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_10pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_5pct_batch5",
        "nanostring_cosmx_human_nsclc_subsample_1pct_batch5",
):
    file_name = f"{dataset}_expimap.h5ad"
    # job_id is shared by all submissions; the prefix carries the dataset
    # and thereby distinguishes the job names.
    job_name_prefix = f"expimap_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key expimap_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the expiMap
# result files (`<dataset>_expimap.h5ad`, latent key `expimap_latent`).
# Per-cell constants are defined first; the loop only derives what depends
# on the dataset.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "vizgen_merfish_mouse_liver",
        "vizgen_merfish_mouse_liver_subsample_50pct",
        "vizgen_merfish_mouse_liver_subsample_25pct",
        "vizgen_merfish_mouse_liver_subsample_10pct",
        "vizgen_merfish_mouse_liver_subsample_5pct",
        "vizgen_merfish_mouse_liver_subsample_1pct",
):
    file_name = f"{dataset}_expimap.h5ad"
    # job_id is shared by all submissions; the prefix carries the dataset
    # and thereby distinguishes the job names.
    job_name_prefix = f"expimap_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key expimap_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

In [None]:
# Submit one compute_benchmarking_metrics.py job per dataset for the expiMap
# result files (`<dataset>_expimap.h5ad`, latent key `expimap_latent`).
# Per-cell constants are defined first; the loop only derives what depends
# on the dataset.
job_id = 1
task = "single_sample_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "None"  # forwarded verbatim as the string "None"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in (
        "slideseqv2_mouse_hippocampus",
        "slideseqv2_mouse_hippocampus_subsample_50pct",
        "slideseqv2_mouse_hippocampus_subsample_25pct",
        "slideseqv2_mouse_hippocampus_subsample_10pct",
        "slideseqv2_mouse_hippocampus_subsample_5pct",
        "slideseqv2_mouse_hippocampus_subsample_1pct",
):
    file_name = f"{dataset}_expimap.h5ad"
    # job_id is shared by all submissions; the prefix carries the dataset
    # and thereby distinguishes the job names.
    job_name_prefix = f"expimap_single_sample_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key expimap_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
    )

## 4. NicheCompass Sample Integration Method Benchmarking

### 4.1 seqFISH Mouse Organogenesis

#### 4.1.1 Spatial Transcriptomics Data

In [None]:
# GCN encoder
# Builds the command line for train_nichecompass_benchmarking_models.py on
# the seqFISH mouse organogenesis dataset and submits it as a single job.

# --- Job bookkeeping -------------------------------------------------------
job_id = 1
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis"
conv_layer_encoder = "gcnconv"
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Values interpolated into the command line -----------------------------
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
cell_type_key = "celltype_mapped_refined"
species = "mouse"
edge_batch_size_str = "65536 65536 65536 65536 65536 65536 65536 65536"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = True  # rendered as the text "True" in script_args
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Adjacent string literals are concatenated by the parser, so the result is
# one long argument string in which every piece starts with a space.
# NOTE(review): the list-valued flags each carry eight entries, presumably
# one per run -- confirm against the training script.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 16 16 12 12 8 8 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None None None None None"
    " --seeds 7 6 5 4 3 2 1 0"
    " --run_index 8 7 6 5 4 3 2 1"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

# nice=10000 equals submit_python_script's default; it is kept explicit.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder
# Builds the command-line arguments for the benchmarking training script on
# the `seqfish_mouse_organogenesis` dataset and submits one Slurm job.

# --- experiment identity ---
task = "sample_integration_method_benchmarking"
conv_layer_encoder = "gatv2conv"
dataset = "seqfish_mouse_organogenesis"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
edge_batch_size_str = "256 256 256 256 256 256 256 256"

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
# Boolean is rendered as the text "True" when formatted into `script_args`.
cat_covariates_no_edges = True
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- contrastive settings (all zero in this cell) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- job bookkeeping ---
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Adjacent literals are concatenated into one string; every fragment starts
# with a space so the flags stay separated.
script_args = (
    # per-run lists (eight entries each)
    " --adata_new_name None"
    " --n_neighbors_list 16 16 12 12 8 8 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None None None None None None None"
    " --seeds 7 6 5 4 3 2 1 0"
    " --run_index 8 7 6 5 4 3 2 1"
    f" --cell_type_key {cell_type_key}"
    # gene filtering and GP flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key flags
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model flags
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate and lambda_* values
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    # neighbor sampling and output suffix
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 4.1.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder
# Submits one benchmarking job per subsample percentage of the seqFISH
# dataset. Settings that do not depend on the percentage are assigned once
# before the loop.

# --- experiment identity ---
task = "sample_integration_method_benchmarking"
conv_layer_encoder = "gcnconv"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
edge_batch_size_str = "65536 65536 65536 65536 65536 65536 65536 65536"

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
# Boolean is rendered as the text "True" when formatted into `script_args`.
cat_covariates_no_edges = True
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- contrastive settings (all zero in this cell) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- job bookkeeping shared by all subsamples ---
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50, 25, 10, 5, 1):
    # The dataset name carries the percentage, which also makes the job name
    # distinct per iteration although `job_id` is constant.
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Adjacent literals are concatenated into one string; every fragment
    # starts with a space so the flags stay separated.
    script_args = (
        # per-run lists (eight entries each)
        " --adata_new_name None"
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # gene filtering and GP flags
        " --no-filter_genes"
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # dataset and key flags
        f" --dataset {dataset}"
        f" --reference_batches {reference_batches}"
        " --counts_key counts"
        f" --cat_covariates_keys {cat_covariates_keys}"
        f" --cat_covariates_no_edges {cat_covariates_no_edges}"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # model flags
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # epochs, learning rate and lambda_* values
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cat_covariates_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        # neighbor sampling and output suffix
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder
# Submits one benchmarking job per subsample percentage of the seqFISH
# dataset. Settings that do not depend on the percentage are assigned once
# before the loop.

# --- experiment identity ---
task = "sample_integration_method_benchmarking"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
edge_batch_size_str = "512 512 512 512 512 512 512 512"

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
# Boolean is rendered as the text "True" when formatted into `script_args`.
cat_covariates_no_edges = True
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- contrastive settings (all zero in this cell) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- job bookkeeping shared by all subsamples ---
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50, 25, 10, 5, 1):
    # The dataset name carries the percentage, which also makes the job name
    # distinct per iteration although `job_id` is constant.
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Adjacent literals are concatenated into one string; every fragment
    # starts with a space so the flags stay separated.
    script_args = (
        # per-run lists (eight entries each)
        " --adata_new_name None"
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # gene filtering and GP flags
        " --no-filter_genes"
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # dataset and key flags
        f" --dataset {dataset}"
        f" --reference_batches {reference_batches}"
        " --counts_key counts"
        f" --cat_covariates_keys {cat_covariates_keys}"
        f" --cat_covariates_no_edges {cat_covariates_no_edges}"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # model flags
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # epochs, learning rate and lambda_* values
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cat_covariates_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        # neighbor sampling and output suffix
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# tmp
# Submits the GATv2 benchmarking job for the 50pct seqFISH subsample only.
# NOTE(review): with dataset "..._subsample_50pct", encoder "gatv2conv" and
# job_id 1 the resulting job name equals that of the 50pct job submitted by
# the GATv2 subsample cell, so job file and log names coincide. Confirm this
# re-submission is intended, or remove the cell.

# --- experiment identity ---
task = "sample_integration_method_benchmarking"
conv_layer_encoder = "gatv2conv"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
edge_batch_size_str = "512 512 512 512 512 512 512 512"

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
# Boolean is rendered as the text "True" when formatted into `script_args`.
cat_covariates_no_edges = True
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- contrastive settings (all zero in this cell) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- job bookkeeping ---
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in (50,):
    dataset = f"seqfish_mouse_organogenesis_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Adjacent literals are concatenated into one string; every fragment
    # starts with a space so the flags stay separated.
    script_args = (
        # per-run lists (eight entries each)
        " --adata_new_name None"
        " --n_neighbors_list 16 16 12 12 8 8 4 4"
        f" --edge_batch_size_list {edge_batch_size_str}"
        " --node_batch_size_list None None None None None None None None"
        " --seeds 7 6 5 4 3 2 1 0"
        " --run_index 8 7 6 5 4 3 2 1"
        f" --cell_type_key {cell_type_key}"
        # gene filtering and GP flags
        " --no-filter_genes"
        " --nichenet_keep_target_genes_ratio 1.0"
        " --nichenet_max_n_target_genes_per_gp 250"
        " --include_mebocost_gps"
        f" --species {species}"
        " --gp_filter_mode subset"
        " --combine_overlap_gps"
        " --overlap_thresh_source_genes 0.9"
        " --overlap_thresh_target_genes 0.9"
        " --overlap_thresh_genes 0.9"
        # dataset and key flags
        f" --dataset {dataset}"
        f" --reference_batches {reference_batches}"
        " --counts_key counts"
        f" --cat_covariates_keys {cat_covariates_keys}"
        f" --cat_covariates_no_edges {cat_covariates_no_edges}"
        " --spatial_key spatial"
        " --adj_key spatial_connectivities"
        " --gp_targets_mask_key nichecompass_gp_targets"
        " --gp_sources_mask_key nichecompass_gp_sources"
        " --gp_names_key nichecompass_gp_names"
        # model flags
        f" --model_label {conv_layer_encoder}_{task}"
        " --active_gp_names_key nichecompass_active_gp_names"
        " --latent_key nichecompass_latent"
        " --n_addon_gp 10"
        " --active_gp_thresh_ratio 0."
        " --gene_expr_recon_dist nb"
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
        " --log_variational"
        " --node_label_method one-hop-norm"
        " --n_layers_encoder 1"
        " --n_hidden_encoder None"
        f" --conv_layer_encoder {conv_layer_encoder}"
        # epochs, learning rate and lambda_* values
        " --n_epochs 400"
        " --n_epochs_all_gps 25"
        " --n_epochs_no_cat_covariates_contrastive 0"
        " --lr 0.001"
        " --lambda_edge_recon 5000000."
        " --lambda_gene_expr_recon 3000."
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
        " --lambda_group_lasso 0."
        " --lambda_l1_masked 0."
        " --lambda_l1_addon 0."
        # neighbor sampling and output suffix
        " --n_sampled_neighbors 4"
        f" --timestamp_suffix _{job_id}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 4.2 STARmap PLUS Mouse CNS

In [None]:
# Scratch arithmetic: 0.0625 halved twice (evaluates to 0.015625).
# NOTE(review): possibly a candidate value for `contrastive_logits_pos_ratio`
# (other cells list 0.125, 0.0625, 0.03125) — confirm, or remove this cell.
0.0625 / 2 / 2

In [None]:
# GCN encoder
# Builds the command-line arguments for the benchmarking training script on
# the `starmap_plus_mouse_cns` dataset (single run) and submits one Slurm job.

# --- experiment identity ---
task = "sample_integration_method_benchmarking"
conv_layer_encoder = "gcnconv"
dataset = "starmap_plus_mouse_cns"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
reference_batches = "batch1 batch2 batch3"
# Author's note on this value: "out of memory" (presumably other batch sizes
# ran out of memory — confirm).
edge_batch_size_str = "4096"

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
# Alternative noted by the author: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- contrastive settings (all zero in this cell) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- job bookkeeping ---
job_id = 3
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Adjacent literals are concatenated into one string; every fragment starts
# with a space so the flags stay separated.
script_args = (
    # per-run lists (one entry each)
    " --adata_new_name None"
    " --n_neighbors_list 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 1"
    " --run_index 2"
    f" --cell_type_key {cell_type_key}"
    # gene filtering and GP flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key flags
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model flags
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate and lambda_* values
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    # neighbor sampling and output suffix
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder
# Builds the command-line arguments for the benchmarking training script on
# the `starmap_plus_mouse_cns` dataset (single run) and submits one Slurm job.

# --- experiment identity ---
task = "sample_integration_method_benchmarking"
conv_layer_encoder = "gatv2conv"
dataset = "starmap_plus_mouse_cns"
species = "mouse"
cell_type_key = "Main_molecular_cell_type"
reference_batches = "batch1 batch2 batch3"
# Author's note on this value: "out of memory" (presumably other batch sizes
# ran out of memory — confirm).
edge_batch_size_str = "512"

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
# Alternative noted by the author: "encoder gene_expr_decoder"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- contrastive settings (all zero in this cell) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- job bookkeeping ---
job_id = 4
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Adjacent literals are concatenated into one string; every fragment starts
# with a space so the flags stay separated.
script_args = (
    # per-run lists (one entry each)
    " --adata_new_name None"
    " --n_neighbors_list 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 1"
    " --run_index 2"
    f" --cell_type_key {cell_type_key}"
    # gene filtering and GP flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key flags
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model flags
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate and lambda_* values
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    # neighbor sampling and output suffix
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Builds the command-line arguments for the reference-model training script
# on the `starmap_plus_mouse_cns` dataset and submits one Slurm job.

# --- experiment identity ---
task = "reference"
dataset = "starmap_plus_mouse_cns"
species = "mouse"
reference_batches = "batch1 batch2 batch3"
job_id = 2

# --- graph / batching settings ---
n_neighbors = 8  # other values noted by the author: 8, 12
n_sampled_neighbors = 4
node_label_method = "one-hop-norm"
edge_batch_size = 512  # other value noted by the author: 2048

# --- categorical covariate settings ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# --- lambda / contrastive settings ---
# Other values noted by the author: 0., 10000, 100000, 500000
lambda_cat_covariates_contrastive = 0.
# Other values noted by the author: 0., 0.125, 0.0625, 0.03125
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- job bookkeeping ---
job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Adjacent literals are concatenated into one string; every fragment starts
# with a space so the flags stay separated.
script_args = (
    # dataset and neighborhood
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    # gene filtering and GP flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # key flags
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model flags
    f" --model_label {task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gatv2conv"
    # epochs, learning rate and lambda_* values
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    # batching, neighbor sampling and output suffix
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --n_sampled_neighbors {n_sampled_neighbors}"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 4.3 seqFISH Mouse Organogenesis Imputed

This is not used for the manuscript.

#### 4.3.1 Spatial Transcriptomics Data

In [None]:
# GATv2 encoder
task = "sample_integration_method_benchmarking"
dataset = "visium_mouse_brain"
reference_batches = "batch1 batch2"
cell_type_key = "cell_type"
species = "mouse"
edge_batch_size_str = "256 256 256 256 256 256 256 256" # out of memory 
cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "2"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_args =  " --adata_new_name None" \
               " --n_neighbors_list 16 16 12 12 8 8 4 4" \
               f" --edge_batch_size_list {edge_batch_size_str}" \
               " --node_batch_size_list None None None None None None None None" \
               " --seeds 7 6 5 4 3 2 1 0" \
               " --run_index 8 7 6 5 4 3 2 1" \
               f" --cell_type_key {cell_type_key}" \
               " --filter_genes" \
               f" --n_svg 5000" \
               " --nichenet_keep_target_genes_ratio 1.0" \
               " --nichenet_max_n_target_genes_per_gp 250" \
               " --include_mebocost_gps" \
               f" --species {species}" \
               " --gp_filter_mode subset" \
               " --combine_overlap_gps" \
               " --overlap_thresh_source_genes 0.9" \
               " --overlap_thresh_target_genes 0.9" \
               " --overlap_thresh_genes 0.9" \
               f" --dataset {dataset}" \
               f" --reference_batches {reference_batches}" \
               " --counts_key counts" \
               f" --cat_covariates_keys {cat_covariates_keys}" \
               f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
               " --spatial_key spatial" \
               " --adj_key spatial_connectivities" \
               " --gp_targets_mask_key nichecompass_gp_targets" \
               " --gp_sources_mask_key nichecompass_gp_sources" \
               " --gp_names_key nichecompass_gp_names" \
               f" --model_label {conv_layer_encoder}_{task}" \
               " --active_gp_names_key nichecompass_active_gp_names" \
               " --latent_key nichecompass_latent" \
               " --n_addon_gp 10" \
               " --active_gp_thresh_ratio 0." \
               " --gene_expr_recon_dist nb" \
               f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
               f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
               " --log_variational" \
               " --node_label_method one-hop-norm" \
               " --n_layers_encoder 1" \
               " --n_hidden_encoder None" \
               f" --conv_layer_encoder {conv_layer_encoder}" \
               " --n_epochs 400" \
               " --n_epochs_all_gps 25" \
               " --n_epochs_no_cat_covariates_contrastive 0" \
               " --lr 0.001" \
               " --lambda_edge_recon 5000000." \
               " --lambda_gene_expr_recon 3000." \
               f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
               f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
               f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
               " --lambda_group_lasso 0." \
               f" --lambda_l1_masked 0." \
               " --lambda_l1_addon 0." \
               f" --n_sampled_neighbors 4" \
               f" --timestamp_suffix _{job_id}"

# Hand the job configuration assembled above to `submit_python_script`, which
# writes a Slurm batch file named after `job_name_prefix` and `job_id` into
# `job_folder_path` (and presumably submits it; the submission step is not
# shown in this part of the notebook). `nice=10000` equals the function's
# default and is passed explicitly.
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GCN encoder: a single job covering run indices 8..1 (seeds 7..0)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gcnconv"
edge_batch_size_str = "8192 8192 8192 8192 8192 8192 8192 8192"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options each carry one entry per run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 16 16 12 12 8 8 4 4",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None None None None None None None",
    " --seeds 7 6 5 4 3 2 1 0",
    " --run_index 8 7 6 5 4 3 2 1",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 1 due to 2 day limit): run index 8 only (seed 7)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gatv2conv"
edge_batch_size_str = "256"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options hold a single entry for this run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 16",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 7",
    " --run_index 8",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 2 due to 2 day limit): run index 7 only (seed 6)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gatv2conv"
edge_batch_size_str = "256"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
# The job id must be unique among the GATv2 split runs of this dataset: it is
# part of the job name, from which `submit_python_script` derives the .cmd
# file and the out/err log file names, and it is also the timestamp suffix.
# Reusing an id would make the split runs overwrite each other's job and log
# files, so this cell uses its run number.
job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options hold a single entry for this run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 16",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 6",
    " --run_index 7",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 3 due to 2 day limit): run index 6 only (seed 5)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gatv2conv"
edge_batch_size_str = "256"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
# The job id must be unique among the GATv2 split runs of this dataset: it is
# part of the job name, from which `submit_python_script` derives the .cmd
# file and the out/err log file names, and it is also the timestamp suffix.
# Reusing an id would make the split runs overwrite each other's job and log
# files, so this cell uses its run number.
job_id = 3
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options hold a single entry for this run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 12",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 5",
    " --run_index 6",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 4 due to 2 day limit): run index 5 only (seed 4)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gatv2conv"
edge_batch_size_str = "256"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
# The job id must be unique among the GATv2 split runs of this dataset: it is
# part of the job name, from which `submit_python_script` derives the .cmd
# file and the out/err log file names, and it is also the timestamp suffix.
# Reusing an id would make the split runs overwrite each other's job and log
# files, so this cell uses its run number.
job_id = 4
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options hold a single entry for this run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 12",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None",
    " --seeds 4",
    " --run_index 5",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 5 due to 2 day limit): run indices 4 and 3 (seeds 3, 2)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gatv2conv"
edge_batch_size_str = "256 256"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
# The job id must be unique among the GATv2 split runs of this dataset: it is
# part of the job name, from which `submit_python_script` derives the .cmd
# file and the out/err log file names, and it is also the timestamp suffix.
# Reusing an id would make the split runs overwrite each other's job and log
# files, so this cell uses its run number.
job_id = 5
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options each carry two entries, one per run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 8 8",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None",
    " --seeds 3 2",
    " --run_index 4 3",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 6 due to 2 day limit): run indices 2 and 1 (seeds 1, 0)

# --- Task, dataset and encoder ---
task = "sample_integration_method_benchmarking"
dataset = "seqfish_mouse_organogenesis_imputed"
species = "mouse"
cell_type_key = "celltype_mapped_refined"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
conv_layer_encoder = "gatv2conv"
edge_batch_size_str = "256 256"  # out of memory

# --- Categorical covariates ---
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative: "encoder gene_expr_decoder"

# --- Contrastive loss (all zero here) ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# --- Slurm job identity and script location ---
# The job id must be unique among the GATv2 split runs of this dataset: it is
# part of the job name, from which `submit_python_script` derives the .cmd
# file and the out/err log file names, and it is also the timestamp suffix.
# Reusing an id would make the split runs overwrite each other's job and log
# files, so this cell uses its run number.
job_id = 6
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command-line arguments for the training script ---
# Every fragment starts with a single space, so a plain join yields the final
# argument string. The list-valued options each carry two entries, one per run.
script_args = "".join([
    " --adata_new_name None",
    " --n_neighbors_list 4 4",
    f" --edge_batch_size_list {edge_batch_size_str}",
    " --node_batch_size_list None None",
    " --seeds 1 0",
    " --run_index 2 1",
    f" --cell_type_key {cell_type_key}",
    " --filter_genes",
    " --n_svg 3000",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {conv_layer_encoder}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    f" --conv_layer_encoder {conv_layer_encoder}",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    " --lambda_l1_masked 0.",
    " --lambda_l1_addon 0.",
    " --n_sampled_neighbors 4",
    f" --timestamp_suffix _{job_id}",
])

# Write the Slurm batch file for this configuration via the notebook helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 4.2.2 Spatial Transcriptomics Data Subsamples

In [None]:
# GCN encoder
for subsample_pct in [50, 25, 10, 5, 1]:
    task = "sample_integration_method_benchmarking"
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
    cell_type_key = "celltype_mapped_refined"
    species = "mouse"
    edge_batch_size_str = "8192 8192 8192 8192 8192 8192 8192 8192" # out of memory 
    cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gcnconv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"
    script_args =  " --adata_new_name None" \
                   " --n_neighbors_list 16 16 12 12 8 8 4 4" \
                   f" --edge_batch_size_list {edge_batch_size_str}" \
                   " --node_batch_size_list None None None None None None None None" \
                   " --seeds 7 6 5 4 3 2 1 0" \
                   " --run_index 8 7 6 5 4 3 2 1" \
                   f" --cell_type_key {cell_type_key}" \
                   " --filter_genes" \
                   f" --n_svg 3000" \
                   " --nichenet_keep_target_genes_ratio 1.0" \
                   " --nichenet_max_n_target_genes_per_gp 250" \
                   " --include_mebocost_gps" \
                   f" --species {species}" \
                   " --gp_filter_mode subset" \
                   " --combine_overlap_gps" \
                   " --overlap_thresh_source_genes 0.9" \
                   " --overlap_thresh_target_genes 0.9" \
                   " --overlap_thresh_genes 0.9" \
                   f" --dataset {dataset}" \
                   f" --reference_batches {reference_batches}" \
                   " --counts_key counts" \
                   f" --cat_covariates_keys {cat_covariates_keys}" \
                   f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
                   " --spatial_key spatial" \
                   " --adj_key spatial_connectivities" \
                   " --gp_targets_mask_key nichecompass_gp_targets" \
                   " --gp_sources_mask_key nichecompass_gp_sources" \
                   " --gp_names_key nichecompass_gp_names" \
                   f" --model_label {conv_layer_encoder}_{task}" \
                   " --active_gp_names_key nichecompass_active_gp_names" \
                   " --latent_key nichecompass_latent" \
                   " --n_addon_gp 10" \
                   " --active_gp_thresh_ratio 0." \
                   " --gene_expr_recon_dist nb" \
                   f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
                   f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
                   " --log_variational" \
                   " --node_label_method one-hop-norm" \
                   " --n_layers_encoder 1" \
                   " --n_hidden_encoder None" \
                   f" --conv_layer_encoder {conv_layer_encoder}" \
                   " --n_epochs 400" \
                   " --n_epochs_all_gps 25" \
                   " --n_epochs_no_cat_covariates_contrastive 0" \
                   " --lr 0.001" \
                   " --lambda_edge_recon 5000000." \
                   " --lambda_gene_expr_recon 3000." \
                   f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
                   f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
                   f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
                   " --lambda_group_lasso 0." \
                   f" --lambda_l1_masked 0." \
                   " --lambda_l1_addon 0." \
                   f" --n_sampled_neighbors 4" \
                   f" --timestamp_suffix _{job_id}"

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

In [None]:
# tmp: GCN-encoder benchmarking submission restricted to the 5 % subsample.
for subsample_pct in (5,):
    # --- dataset / task identification ---
    task = "sample_integration_method_benchmarking"
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct"
    species = "mouse"
    cell_type_key = "celltype_mapped_refined"
    reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"

    # --- model configuration ---
    conv_layer_encoder = "gcnconv"
    # One entry per training run (8 runs, matching the seeds / run indices below).
    # NOTE(review): the original inline note on this value read "out of memory"
    # -- presumably the reason this batch size was chosen; confirm.
    edge_batch_size_str = " ".join(["8192"] * 8)
    # Alternative value noted by the author: "encoder gene_expr_decoder"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0

    # --- job bookkeeping ---
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command-line flags for the training script; the assembled string starts
    # with a single space and separates flags by single spaces.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--filter_genes",
        "--n_svg 3000",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled job description to the Slurm submission helper.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder, 50 % subsample -- part 1 of 3 (the eight training runs are
# split over several jobs because of the 2 day wall-time limit).
# This part covers run indices 8 and 7 (seeds 7 and 6, 16 neighbors).
for subsample_pct in (50,):
    # --- dataset / task identification ---
    task = "sample_integration_method_benchmarking"
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct"
    species = "mouse"
    cell_type_key = "celltype_mapped_refined"
    reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"

    # --- model configuration ---
    conv_layer_encoder = "gatv2conv"
    # One entry per training run in this job.
    # NOTE(review): the original inline note on this value read "out of memory"
    # -- presumably larger edge batches did not fit in memory; confirm.
    edge_batch_size_str = "256 256"
    # Alternative value noted by the author: "encoder gene_expr_decoder"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0

    # --- job bookkeeping ---
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command-line flags for the training script; the assembled string starts
    # with a single space and separates flags by single spaces.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None",
        "--seeds 7 6",
        "--run_index 8 7",
        f"--cell_type_key {cell_type_key}",
        "--filter_genes",
        "--n_svg 3000",
        "--nichenet_keep_target_genes_ratio 1.",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled job description to the Slurm submission helper.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder, 50 % subsample -- part 2 of 3 (the eight training runs are
# split over several jobs because of the 2 day wall-time limit).
# This part covers run indices 6 and 5 (seeds 5 and 4, 12 neighbors).
for subsample_pct in [50]:
    # --- dataset / task identification ---
    task = "sample_integration_method_benchmarking"
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
    cell_type_key = "celltype_mapped_refined"
    species = "mouse"

    # --- model configuration ---
    # One entry per training run in this job.
    # NOTE(review): the original inline note on this value read "out of memory"
    # -- presumably larger edge batches did not fit in memory; confirm.
    edge_batch_size_str = "256 256"
    # Alternative value noted by the author: "encoder gene_expr_decoder"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # --- job bookkeeping ---
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # The "run 1" job for this dataset/encoder uses job_id 1 with an identical
    # job_name_prefix. submit_python_script derives the job file and the
    # stdout/stderr log paths from f"{job_name_prefix}_{job_id}", so a distinct
    # id is required to keep this job from overwriting run 1's job file and
    # sharing its log files and timestamp suffix.
    job_id = 2
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command-line flags for the training script; the assembled string starts
    # with a single space and separates flags by single spaces.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 12 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None",
        "--seeds 5 4",
        "--run_index 6 5",
        f"--cell_type_key {cell_type_key}",
        "--filter_genes",
        "--n_svg 3000",
        "--nichenet_keep_target_genes_ratio 1.",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled job description to the Slurm submission helper.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder, 50 % subsample -- part 3 of 3 (the eight training runs are
# split over several jobs because of the 2 day wall-time limit).
# This part covers run indices 4 to 1 (seeds 3 to 0, 8 and 4 neighbors).
for subsample_pct in [50]:
    # --- dataset / task identification ---
    task = "sample_integration_method_benchmarking"
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
    cell_type_key = "celltype_mapped_refined"
    species = "mouse"

    # --- model configuration ---
    # One entry per training run in this job.
    # NOTE(review): the original inline note on this value read "out of memory"
    # -- presumably larger edge batches did not fit in memory; confirm.
    edge_batch_size_str = "256 256 256 256"
    # Alternative value noted by the author: "encoder gene_expr_decoder"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # --- job bookkeeping ---
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # The "run 1" job for this dataset/encoder uses job_id 1 with an identical
    # job_name_prefix. submit_python_script derives the job file and the
    # stdout/stderr log paths from f"{job_name_prefix}_{job_id}", so a distinct
    # id is required to keep this job from overwriting another run's job file
    # and sharing its log files and timestamp suffix.
    job_id = 3
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command-line flags for the training script; the assembled string starts
    # with a single space and separates flags by single spaces.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None",
        "--seeds 3 2 1 0",
        "--run_index 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--filter_genes",
        "--n_svg 3000",
        "--nichenet_keep_target_genes_ratio 1.",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled job description to the Slurm submission helper.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder: one job per remaining subsample size (25 %, 10 %, 5 %, 1 %),
# each job training all eight runs.
for subsample_pct in (25, 10, 5, 1):
    # --- dataset / task identification ---
    task = "sample_integration_method_benchmarking"
    dataset = f"seqfish_mouse_organogenesis_imputed_subsample_{subsample_pct}pct"
    species = "mouse"
    cell_type_key = "celltype_mapped_refined"
    reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"

    # --- model configuration ---
    conv_layer_encoder = "gatv2conv"
    # One entry per training run (8 runs, matching the seeds / run indices below).
    # NOTE(review): the original inline note on this value read "out of memory"
    # -- presumably larger edge batches did not fit in memory; confirm.
    edge_batch_size_str = " ".join(["256"] * 8)
    # Alternative value noted by the author: "encoder gene_expr_decoder"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0

    # --- job bookkeeping ---
    # The dataset name is part of the job name prefix, so the constant job id
    # still yields a distinct job name per subsample size.
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command-line flags for the training script; the assembled string starts
    # with a single space and separates flags by single spaces.
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--filter_genes",
        "--n_svg 3000",
        "--nichenet_keep_target_genes_ratio 1.",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled job description to the Slurm submission helper.
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

### 4.3 nanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

#### 4.3.1 Spatial Transcriptomics Data

##### 4.3.1.1 GCN W/O FOV Embedding

In [None]:
# GCN encoder, nanoString CosMx NSCLC -- run 1 (the training runs are submitted
# as separate jobs because of the 2 day wall-time limit).
# This job covers run index 8 (seed 7, 16 neighbors).

# --- dataset / task identification ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# --- model configuration ---
conv_layer_encoder = "gcnconv"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0

# --- job bookkeeping ---
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command-line flags for the training script; the assembled string starts with
# a single space and separates flags by single spaces.
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 16",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None",
    "--seeds 7",
    "--run_index 8",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    "--node_label_method one-hop-norm",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

# Hand the assembled job description to the Slurm submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 2 due to 2 day limit)
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_args =  " --adata_new_name None" \
               " --n_neighbors_list 16" \
               f" --edge_batch_size_list {edge_batch_size_str}" \
               " --node_batch_size_list None" \
               " --seeds 6" \
               " --run_index 7 " \
               f" --cell_type_key {cell_type_key}" \
               " --no-filter_genes" \
               " --nichenet_keep_target_genes_ratio 1.0" \
               " --nichenet_max_n_target_genes_per_gp 250" \
               " --include_mebocost_gps" \
               f" --species {species}" \
               " --gp_filter_mode subset" \
               " --combine_overlap_gps" \
               " --overlap_thresh_source_genes 0.9" \
               " --overlap_thresh_target_genes 0.9" \
               " --overlap_thresh_genes 0.9" \
               f" --dataset {dataset}" \
               f" --reference_batches {reference_batches}" \
               " --counts_key counts" \
               f" --cat_covariates_keys {cat_covariates_keys}" \
               f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
               " --spatial_key spatial" \
               " --adj_key spatial_connectivities" \
               " --gp_targets_mask_key nichecompass_gp_targets" \
               " --gp_sources_mask_key nichecompass_gp_sources" \
               " --gp_names_key nichecompass_gp_names" \
               f" --model_label {conv_layer_encoder}_{task}" \
               " --active_gp_names_key nichecompass_active_gp_names" \
               " --latent_key nichecompass_latent" \
               " --n_addon_gp 10" \
               " --active_gp_thresh_ratio 0." \
               " --gene_expr_recon_dist nb" \
               f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
               f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
               " --log_variational" \
               f" --node_label_method one-hop-norm" \
               " --n_layers_encoder 1" \
               " --n_hidden_encoder None" \
               f" --conv_layer_encoder {conv_layer_encoder}" \
               " --n_epochs 400" \
               " --n_epochs_all_gps 25" \
               " --n_epochs_no_cat_covariates_contrastive 0" \
               " --lr 0.001" \
               " --lambda_edge_recon 5000000." \
               " --lambda_gene_expr_recon 3000." \
               f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
               f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
               f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
               " --lambda_group_lasso 0." \
               f" --lambda_l1_masked 0." \
               " --lambda_l1_addon 0." \
               f" --n_sampled_neighbors 4" \
               f" --timestamp_suffix _{job_id}"

# Hand the job settings assembled above to `submit_python_script` (defined in
# section 1.3). `nice=10000` is spelled out explicitly; it equals the
# function's default value.
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GCN encoder (run 3 due to 2 day limit)
# This cell submits seed 5 / run index 6 with 12 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# `job_id` becomes part of the job name, and therefore of the job file and the
# out/err log file names created by `submit_python_script`; it is also passed
# on as `--timestamp_suffix`. It must differ between the runs of this section,
# so it follows the run number instead of being hard-coded to 1.
job_id = 3
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 5"
    " --run_index 6 "
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 4 due to 2 day limit)
# This cell submits seed 4 / run index 5 with 12 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# `job_id` becomes part of the job name, and therefore of the job file and the
# out/err log file names created by `submit_python_script`; it is also passed
# on as `--timestamp_suffix`. It must differ between the runs of this section,
# so it follows the run number instead of being hard-coded to 1.
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 4"
    " --run_index 5 "
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 5 due to 2 day limit)
# This cell submits two configurations in one job: seeds 3 and 2
# (run indices 4 and 3), both with 8 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192 8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# `job_id` becomes part of the job name, and therefore of the job file and the
# out/err log file names created by `submit_python_script`; it is also passed
# on as `--timestamp_suffix`. It must differ between the runs of this section,
# so it follows the run number instead of being hard-coded to 1.
job_id = 5
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds (two entries each)
    " --adata_new_name None"
    " --n_neighbors_list 8 8"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None"
    " --seeds 3 2"
    " --run_index 4 3"
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 6 due to 2 day limit)
# This cell submits two configurations in one job: seeds 1 and 0
# (run indices 2 and 1), both with 4 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192 8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# `job_id` becomes part of the job name, and therefore of the job file and the
# out/err log file names created by `submit_python_script`; it is also passed
# on as `--timestamp_suffix`. It must differ between the runs of this section,
# so it follows the run number instead of being hard-coded to 1.
job_id = 6
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds (two entries each)
    " --adata_new_name None"
    " --n_neighbors_list 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None"
    " --seeds 1 0"
    " --run_index 2 1"
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

##### 4.3.1.2 GCN with FOV Embedding

In [None]:
# GCN encoder (run 1 due to 2 day limit)
# FOV-embedding variant: "fov" is passed as a second categorical covariate
# next to "batch". This cell submits seed 7 / run index 8 with 16 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
# `job_id` ends up in the job name and in `--timestamp_suffix`.
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds
    " --adata_new_name None"
    " --n_neighbors_list 16"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 7"
    " --run_index 8"
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 2 due to 2 day limit)
# FOV-embedding variant: "fov" is passed as a second categorical covariate
# next to "batch". This cell submits seed 6 / run index 7 with 16 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
# `job_id` ends up in the job name and in `--timestamp_suffix`.
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds
    " --adata_new_name None"
    " --n_neighbors_list 16"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 6"
    " --run_index 7"
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 3 due to 2 day limit)
# FOV-embedding variant: "fov" is passed as a second categorical covariate
# next to "batch". This cell submits seed 5 / run index 6 with 12 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
# `job_id` ends up in the job name and in `--timestamp_suffix`.
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 3
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 5"
    " --run_index 6"
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder (run 4 due to 2 day limit)
# FOV-embedding variant: "fov" is passed as a second categorical covariate
# next to "batch". This cell submits seed 4 / run index 5 with 12 neighbors.

# --- Dataset and covariate configuration ---
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "8192"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gcnconv"

# --- Job bookkeeping ---
# `job_id` ends up in the job name and in `--timestamp_suffix`.
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# --- Command line arguments for `script_name` ---
# Adjacent string literals are concatenated; every fragment starts with a
# space so the pieces join into one argument string. The section comments
# below only group the flags by name.
script_args = (
    # neighbors, batch sizes, seeds
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 4"
    " --run_index 5"
    f" --cell_type_key {cell_type_key}"
    # gene / gp filtering flags
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    # dataset and key names
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    # model settings
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    " --node_label_method one-hop-norm"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    # epochs, learning rate, lambda_* weights
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder, run 5 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "8192 8192"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss weight and logit ratios are all 0. in this cell
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 5
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 8 8",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None None",
    "seeds 3 2",
    "run_index 4 3",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GCN encoder, run 6 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "8192 8192"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss weight and logit ratios are all 0. in this cell
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 6
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 4 4",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None None",
    "seeds 1 0",
    "run_index 2 1",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

##### 4.3.1.3 GCN with FOV Embedding & Contrastive Loss

In [73]:
# Scratch calculation: half of 8192.
# NOTE(review): presumably the origin of the "4096" edge batch sizes used by
# the cells in this subsection -- confirm.
8192 / 2

4096.0

In [74]:
# GCN encoder, run 1 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "4096"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss settings: non-zero weight and positive logit ratio
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 16",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None",
    "seeds 7",
    "run_index 8",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308726


In [75]:
# GCN encoder, run 2 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "4096"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss settings: non-zero weight and positive logit ratio
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 16",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None",
    "seeds 6",
    "run_index 7",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308727


In [76]:
# GCN encoder, run 3 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "4096"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss settings: non-zero weight and positive logit ratio
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 3
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 12",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None",
    "seeds 5",
    "run_index 6",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308728


In [77]:
# GCN encoder, run 4 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "4096"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss settings: non-zero weight and positive logit ratio
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 4
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 12",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None",
    "seeds 4",
    "run_index 5",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308729


In [78]:
# GCN encoder, run 5 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "4096 4096"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss settings: non-zero weight and positive logit ratio
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 5
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 8 8",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None None",
    "seeds 3 2",
    "run_index 4 3",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308730


In [79]:
# GCN encoder, run 6 (submitted as its own job due to the 2 day limit)

# Experiment identity
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
conv_layer_encoder = "gcnconv"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"

# Batch size and categorical covariate settings (space-separated lists)
edge_batch_size_str = "4096 4096"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive loss settings: non-zero weight and positive logit ratio
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.

# Slurm job bookkeeping
job_id = 6
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One entry per command line option; each is prefixed with " --" when joined,
# so the resulting string starts with a space.
script_args = "".join(f" --{option}" for option in (
    "adata_new_name None",
    "n_neighbors_list 4 4",
    f"edge_batch_size_list {edge_batch_size_str}",
    "node_batch_size_list None None",
    "seeds 1 0",
    "run_index 2 1",
    f"cell_type_key {cell_type_key}",
    "no-filter_genes",
    "nichenet_keep_target_genes_ratio 1.0",
    "nichenet_max_n_target_genes_per_gp 250",
    "include_mebocost_gps",
    f"species {species}",
    "gp_filter_mode subset",
    "combine_overlap_gps",
    "overlap_thresh_source_genes 0.9",
    "overlap_thresh_target_genes 0.9",
    "overlap_thresh_genes 0.9",
    f"dataset {dataset}",
    f"reference_batches {reference_batches}",
    "counts_key counts",
    f"cat_covariates_keys {cat_covariates_keys}",
    f"cat_covariates_no_edges {cat_covariates_no_edges}",
    "spatial_key spatial",
    "adj_key spatial_connectivities",
    "gp_targets_mask_key nichecompass_gp_targets",
    "gp_sources_mask_key nichecompass_gp_sources",
    "gp_names_key nichecompass_gp_names",
    f"model_label {conv_layer_encoder}_{task}",
    "active_gp_names_key nichecompass_active_gp_names",
    "latent_key nichecompass_latent",
    "n_addon_gp 10",
    "active_gp_thresh_ratio 0.",
    "gene_expr_recon_dist nb",
    f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "log_variational",
    "node_label_method one-hop-norm",
    "n_layers_encoder 1",
    "n_hidden_encoder None",
    f"conv_layer_encoder {conv_layer_encoder}",
    "n_epochs 400",
    "n_epochs_all_gps 25",
    "n_epochs_no_cat_covariates_contrastive 0",
    "lr 0.001",
    "lambda_edge_recon 5000000.",
    "lambda_gene_expr_recon 3000.",
    f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "lambda_group_lasso 0.",
    "lambda_l1_masked 0.",
    "lambda_l1_addon 0.",
    "n_sampled_neighbors 4",
    f"timestamp_suffix _{job_id}",
))

# Write the job file and submit it via the helper defined in section 1.3
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308731


##### 4.3.1.4 GATv2 W/O FOV Embedding

In [None]:
# GATv2 encoder without FOV embedding: run 1 (the runs are spread over several
# jobs due to the 2 day limit)

# Dataset and model configuration
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"
conv_layer_encoder = "gatv2conv"
node_label_method = "one-hop-norm"
edge_batch_size_str = "512"

# Categorical covariates (only 'batch' in this section)
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive settings (all set to 0 for this run)
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Job identification and script location; job_id becomes part of the job name
# and of the timestamp suffix passed to the script
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command line arguments for the training script, joined with single spaces
# and preceded by one leading space
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 16",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None",
    "--seeds 7",
    "--run_index 8",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

# Write the job file and submit it to Slurm
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "512"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is part of the job name (and thereby of the job and log file names
# written by submit_python_script) and of the timestamp suffix, so it has to
# differ between runs; it matches the run number of this cell.
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Command line arguments passed to the training script (each fragment starts
# with a space so that the fragments can simply be concatenated)
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 16"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 6"
    " --run_index 7"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and submit it to Slurm
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder (run 3 due to 2 day limit)
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "512"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is part of the job name (and thereby of the job and log file names
# written by submit_python_script) and of the timestamp suffix, so it has to
# differ between runs; it matches the run number of this cell.
job_id = 3
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Command line arguments passed to the training script (each fragment starts
# with a space so that the fragments can simply be concatenated)
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 5"
    " --run_index 6"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and submit it to Slurm
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder (run 4 due to 2 day limit)
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "512"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is part of the job name (and thereby of the job and log file names
# written by submit_python_script) and of the timestamp suffix, so it has to
# differ between runs; it matches the run number of this cell.
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Command line arguments passed to the training script (each fragment starts
# with a space so that the fragments can simply be concatenated)
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 4"
    " --run_index 5"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and submit it to Slurm
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder (run 5 due to 2 day limit)
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
# This job passes two entries per list-valued argument, so the edge batch
# size is given twice as well
edge_batch_size_str = "512 512"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is part of the job name (and thereby of the job and log file names
# written by submit_python_script) and of the timestamp suffix, so it has to
# differ between runs; it matches the run number of this cell.
job_id = 5
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Command line arguments passed to the training script (each fragment starts
# with a space so that the fragments can simply be concatenated)
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 8 8"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None"
    " --seeds 3 2"
    " --run_index 4 3"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and submit it to Slurm
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder (run 6 due to 2 day limit)
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
# This job passes two entries per list-valued argument, so the edge batch
# size is given twice as well
edge_batch_size_str = "512 512"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is part of the job name (and thereby of the job and log file names
# written by submit_python_script) and of the timestamp suffix, so it has to
# differ between runs; it matches the run number of this cell.
job_id = 6
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Command line arguments passed to the training script (each fragment starts
# with a space so that the fragments can simply be concatenated)
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None"
    " --seeds 1 0"
    " --run_index 2 1"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

# Write the job file and submit it to Slurm
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

##### 4.3.1.5 GATv2 with FOV Embedding

In [None]:
# GATv2 encoder with FOV embedding: run 1 (the runs are spread over several
# jobs due to the 2 day limit)

# Dataset and model configuration
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"
conv_layer_encoder = "gatv2conv"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"

# Categorical covariates ('batch' and 'fov'); the space-separated values of
# the three settings below have one entry per covariate key
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive settings (all set to 0 for this run)
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Job identification and script location; job_id becomes part of the job name
# and of the timestamp suffix passed to the script
job_id = 1
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command line arguments for the training script, joined with single spaces
# and preceded by one leading space
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 16",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None",
    "--seeds 7",
    "--run_index 8",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

# Write the job file and submit it to Slurm
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder with FOV embedding: run 2 (the runs are spread over several
# jobs due to the 2 day limit)

# Dataset and model configuration
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
species = "human"
cell_type_key = "cell_type"
reference_batches = "batch1 batch2 batch3"
conv_layer_encoder = "gatv2conv"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"

# Categorical covariates ('batch' and 'fov'); the space-separated values of
# the three settings below have one entry per covariate key
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Contrastive settings (all set to 0 for this run)
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.

# Job identification and script location; job_id becomes part of the job name
# and of the timestamp suffix passed to the script
job_id = 2
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# Command line arguments for the training script, joined with single spaces
# and preceded by one leading space
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 16",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None",
    "--seeds 6",
    "--run_index 7",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

# Write the job file and submit it to Slurm
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# GATv2 encoder (run 3 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script. Space-separated strings below are interpolated
# verbatim into the command-line argument string.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# All contrastive settings are 0 in this cell (presumably disabling the
# contrastive term -- confirm against the training script).
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 3
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 5"
    " --run_index 6"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 4 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script. Space-separated strings below are interpolated
# verbatim into the command-line argument string.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# All contrastive settings are 0 in this cell (presumably disabling the
# contrastive term -- confirm against the training script).
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 4"
    " --run_index 5"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 5 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script. This cell passes two entries per list-valued
# argument (neighbors, batch sizes, seeds, run indices). Space-separated
# strings below are interpolated verbatim into the argument string.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256 256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# All contrastive settings are 0 in this cell (presumably disabling the
# contrastive term -- confirm against the training script).
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 5
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 8 8"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None"
    " --seeds 3 2"
    " --run_index 4 3"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 6 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script. This cell passes two entries per list-valued
# argument (neighbors, batch sizes, seeds, run indices). Space-separated
# strings below are interpolated verbatim into the argument string.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256 256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# All contrastive settings are 0 in this cell (presumably disabling the
# contrastive term -- confirm against the training script).
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 6
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 4 4"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None None"
    " --seeds 1 0"
    " --run_index 2 1"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

##### 4.3.1.6 GATv2 with FOV Embedding & Contrastive Loss

In [None]:
# GATv2 encoder (run 1 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script, with a non-zero contrastive loss weight.
# Space-separated strings below are interpolated verbatim into the
# command-line argument string.
# NOTE(review): job_name_prefix does not encode the contrastive setting, so
# job/log file names and --timestamp_suffix may coincide with the
# same-numbered run of the non-contrastive GATv2 cells -- confirm intended.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# Contrastive loss weight and positive-logit ratio are non-zero here
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 16"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 7"
    " --run_index 8"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script, with a non-zero contrastive loss weight.
# Space-separated strings below are interpolated verbatim into the
# command-line argument string.
# NOTE(review): job_name_prefix does not encode the contrastive setting, so
# job/log file names and --timestamp_suffix may coincide with the
# same-numbered run of the non-contrastive GATv2 cells -- confirm intended.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# Contrastive loss weight and positive-logit ratio are non-zero here
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 16"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 6"
    " --run_index 7"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 3 due to 2 day limit)
# Configures and submits one Slurm training job for the sample integration
# method benchmarking script, with a non-zero contrastive loss weight.
# Space-separated strings below are interpolated verbatim into the
# command-line argument string.
# NOTE(review): job_name_prefix does not encode the contrastive setting, so
# job/log file names and --timestamp_suffix may coincide with the
# same-numbered run of the non-contrastive GATv2 cells -- confirm intended.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
# Contrastive loss weight and positive-logit ratio are non-zero here
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
# job_id is appended to the job name and reused as the timestamp suffix
job_id = 3
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
# Every fragment starts with a space so that adjacent literals concatenate
# into a single well-formed argument string.
script_args = (
    " --adata_new_name None"
    " --n_neighbors_list 12"
    f" --edge_batch_size_list {edge_batch_size_str}"
    " --node_batch_size_list None"
    " --seeds 5"
    " --run_index 6"
    f" --cell_type_key {cell_type_key}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {conv_layer_encoder}_{task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --n_addon_gp 10"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    # Use the variable defined above instead of repeating the literal
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    f" --conv_layer_encoder {conv_layer_encoder}"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    " --lambda_l1_masked 0."
    " --lambda_l1_addon 0."
    " --n_sampled_neighbors 4"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# GATv2 encoder (run 4 due to 2 day limit)
# Builds the command line arguments for the benchmarking training script and
# hands the job to `submit_python_script`. Each list-valued flag carries one
# entry in this run.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

# `job_id` is part of the job name (and therefore of the job / log file names)
# and is also passed to the script as timestamp suffix
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One list entry per command line flag. The joined string starts with a space
# and separates the flags by single spaces.
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 12",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None",
    "--seeds 4",
    "--run_index 5",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    # Use the variable defined above instead of repeating the literal so that
    # changing `node_label_method` actually takes effect
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder (run 5 due to 2 day limit)
# Builds the command line arguments for the benchmarking training script and
# hands the job to `submit_python_script`. Each list-valued flag carries two
# entries in this run.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256 256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

# `job_id` is part of the job name (and therefore of the job / log file names)
# and is also passed to the script as timestamp suffix
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 5
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One list entry per command line flag. The joined string starts with a space
# and separates the flags by single spaces.
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 8 8",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None None",
    "--seeds 3 2",
    "--run_index 4 3",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    # Use the variable defined above instead of repeating the literal so that
    # changing `node_label_method` actually takes effect
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

In [None]:
# GATv2 encoder (run 6 due to 2 day limit)
# Builds the command line arguments for the benchmarking training script and
# hands the job to `submit_python_script`. Each list-valued flag carries two
# entries in this run.
task = "sample_integration_method_benchmarking"
dataset = "nanostring_cosmx_human_nsclc"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
node_label_method = "one-hop-norm"
edge_batch_size_str = "256 256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

# `job_id` is part of the job name (and therefore of the job / log file names)
# and is also passed to the script as timestamp suffix
job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 6
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

# One list entry per command line flag. The joined string starts with a space
# and separates the flags by single spaces.
script_args = " " + " ".join([
    "--adata_new_name None",
    "--n_neighbors_list 4 4",
    f"--edge_batch_size_list {edge_batch_size_str}",
    "--node_batch_size_list None None",
    "--seeds 1 0",
    "--run_index 2 1",
    f"--cell_type_key {cell_type_key}",
    "--no-filter_genes",
    "--nichenet_keep_target_genes_ratio 1.0",
    "--nichenet_max_n_target_genes_per_gp 250",
    "--include_mebocost_gps",
    f"--species {species}",
    "--gp_filter_mode subset",
    "--combine_overlap_gps",
    "--overlap_thresh_source_genes 0.9",
    "--overlap_thresh_target_genes 0.9",
    "--overlap_thresh_genes 0.9",
    f"--dataset {dataset}",
    f"--reference_batches {reference_batches}",
    "--counts_key counts",
    f"--cat_covariates_keys {cat_covariates_keys}",
    f"--cat_covariates_no_edges {cat_covariates_no_edges}",
    "--spatial_key spatial",
    "--adj_key spatial_connectivities",
    "--gp_targets_mask_key nichecompass_gp_targets",
    "--gp_sources_mask_key nichecompass_gp_sources",
    "--gp_names_key nichecompass_gp_names",
    f"--model_label {conv_layer_encoder}_{task}",
    "--active_gp_names_key nichecompass_active_gp_names",
    "--latent_key nichecompass_latent",
    "--n_addon_gp 10",
    "--active_gp_thresh_ratio 0.",
    "--gene_expr_recon_dist nb",
    f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    "--log_variational",
    # Use the variable defined above instead of repeating the literal so that
    # changing `node_label_method` actually takes effect
    f"--node_label_method {node_label_method}",
    "--n_layers_encoder 1",
    "--n_hidden_encoder None",
    f"--conv_layer_encoder {conv_layer_encoder}",
    "--n_epochs 400",
    "--n_epochs_all_gps 25",
    "--n_epochs_no_cat_covariates_contrastive 0",
    "--lr 0.001",
    "--lambda_edge_recon 5000000.",
    "--lambda_gene_expr_recon 3000.",
    f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    "--lambda_group_lasso 0.",
    "--lambda_l1_masked 0.",
    "--lambda_l1_addon 0.",
    "--n_sampled_neighbors 4",
    f"--timestamp_suffix _{job_id}",
])

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)

#### 4.3.2 Spatial Transcriptomics Data Subsamples

##### 4.3.2.1 GCN W/O FOV Embedding

In [None]:
# GCN encoder (run 1 due to 2 day limit)
# For every listed subsample percentage: assemble the command line arguments
# of the benchmarking training script and pass the job on to
# `submit_python_script`. Only the batch covariate is used in this section.
for subsample_pct in [50]:
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gcnconv"

    # The job id ends up in the job name and in the timestamp suffix flag
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One list entry per flag; joined into a single string that starts with a
    # space and has single spaces between flags
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 7",
        "--run_index 8",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GCN encoder (run 2 due to 2 day limit)
# For every listed subsample percentage: assemble the command line arguments
# of the benchmarking training script and pass the job on to
# `submit_python_script`. Only the batch covariate is used in this section.
for subsample_pct in [50]:
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gcnconv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # The job id must be unique per run: it is part of the job name (and thus
    # of the job and log file names) and of the timestamp suffix. Reusing the
    # id of run 1 would make this run overwrite that run's job and log files.
    job_id = 2
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One list entry per flag; joined into a single string that starts with a
    # space and has single spaces between flags
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 6",
        "--run_index 7",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

In [None]:
# GCN encoder (run 3 due to 2 day limit)
# For every listed subsample percentage: assemble the command line arguments
# of the benchmarking training script and pass the job on to
# `submit_python_script`. Only the batch covariate is used in this section.
for subsample_pct in [50]:
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "8192 8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gcnconv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # The job id must be unique per run: it is part of the job name (and thus
    # of the job and log file names) and of the timestamp suffix. Reusing the
    # id of run 1 would make this run overwrite that run's job and log files.
    job_id = 3
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One list entry per flag; joined into a single string that starts with a
    # space and has single spaces between flags
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 12 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None",
        "--seeds 5 4",
        "--run_index 6 5",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

In [None]:
# GCN encoder (run 4 due to 2 day limit)
# For every listed subsample percentage: assemble the command line arguments
# of the benchmarking training script and pass the job on to
# `submit_python_script`. Only the batch covariate is used in this section.
for subsample_pct in [50]:
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "8192 8192 8192 8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gcnconv"

    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # The job id must be unique per run: it is part of the job name (and thus
    # of the job and log file names) and of the timestamp suffix. Reusing the
    # id of run 1 would make this run overwrite that run's job and log files.
    job_id = 4
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One list entry per flag; joined into a single string that starts with a
    # space and has single spaces between flags
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None",
        "--seeds 3 2 1 0",
        "--run_index 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
            job_name_prefix=job_name_prefix,
            job_id=job_id,
            job_folder_path=job_folder_path,
            conda_env_name=conda_env_name,
            script_folder_path=script_folder_path,
            script_name=script_name,
            script_args=script_args,
            nice=10000)

##### 4.3.2.2 GCN with FOV Embedding

In [None]:
# GCN encoder (run 1 due to 2 day limit)
# For every listed subsample percentage: assemble the command line arguments
# of the benchmarking training script and pass the job on to
# `submit_python_script`. Batch and fov are both passed as categorical
# covariates in this section.
for subsample_pct in [50]:
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gcnconv"

    # The job id ends up in the job name and in the timestamp suffix flag
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One list entry per flag; joined into a single string that starts with a
    # space and has single spaces between flags
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 7",
        "--run_index 8",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GCN encoder, run 2 (runs are submitted as separate jobs due to the 2 day
# limit)
for subsample_pct in [50]:
    # Task and dataset selection
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 2
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 6",
        "--run_index 7",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GCN encoder, run 3 (runs are submitted as separate jobs due to the 2 day
# limit)
for subsample_pct in [50]:
    # Task and dataset selection
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 3
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 5",
        "--run_index 6",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GCN encoder, run 4 (runs are submitted as separate jobs due to the 2 day
# limit)
for subsample_pct in [50]:
    # Task and dataset selection
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 4
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 4",
        "--run_index 5",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GCN encoder, run 5 (runs are submitted as separate jobs due to the 2 day
# limit); this job carries four entries in each of the list-valued options
for subsample_pct in [50]:
    # Task and dataset selection
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "8192 8192 8192 8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 5
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None",
        "--seeds 3 2 1 0",
        "--run_index 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GCN encoder: one job per subsample percentage, each carrying eight entries
# in the list-valued options
for subsample_pct in [25, 10, 5, 1]:
    # Task and dataset selection (the dataset name depends on the loop value)
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "8192 8192 8192 8192 8192 8192 8192 8192"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.0
    contrastive_logits_pos_ratio = 0.0
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations; job_id is the same for every iteration,
    # the job name prefix differs through the dataset name
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

##### 4.3.2.2 GCN with FOV Embedding & Contrastive Loss

In [80]:
# GCN encoder, run 1 (runs are submitted as separate jobs due to the 2 day
# limit); contrastive weight and positive logits ratio are non-zero here
for subsample_pct in [50]:
    # Task and dataset selection
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "4096"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 1000000.0
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 7",
        "--run_index 8",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13308732


In [81]:
# GCN encoder, run 2 (runs are submitted as separate jobs due to the 2 day
# limit); contrastive weight and positive logits ratio are non-zero here
for subsample_pct in [50]:
    # Task and dataset selection
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"

    # Values interpolated into the command line of the training script
    edge_batch_size_str = "4096"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 1000000.0
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.0
    conv_layer_encoder = "gcnconv"

    # Job naming and file locations
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 2
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = (
        "/home/aih/sebastian.birk/workspace/projects/"
        f"nichecompass-reproducibility/scripts/{task}")
    script_name = "train_nichecompass_benchmarking_models.py"

    # Options are listed without their leading blank; prefixing one blank and
    # joining on a blank gives the same string as chaining " --opt" fragments
    script_args = " " + " ".join([
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 6",
        "--run_index 7",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ])

    # Hand the assembled command line to the job submission helper
    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13308733


In [82]:
# GCN encoder (run 3 due to 2 day limit)
# Submits a single job that runs the benchmarking training script with a GCN
# encoder on the 50% subsample of the dataset.
for subsample_pct in [50]:
    # Task and dataset identification
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below; strings containing
    # spaces end up as several whitespace-separated tokens
    edge_batch_size_str = "4096"
    conv_layer_encoder = "gcnconv"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 1000000.
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping: the job id becomes part of the job name and is also
    # handed to the script as timestamp suffix
    job_id = 3
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 5",
        "--run_index 6",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13308734


In [83]:
# GCN encoder (run 4 due to 2 day limit)
# Submits a single job that runs the benchmarking training script with a GCN
# encoder on the 50% subsample of the dataset.
for subsample_pct in [50]:
    # Task and dataset identification
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below; strings containing
    # spaces end up as several whitespace-separated tokens
    edge_batch_size_str = "4096"
    conv_layer_encoder = "gcnconv"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 1000000.
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping: the job id becomes part of the job name and is also
    # handed to the script as timestamp suffix
    job_id = 4
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 4",
        "--run_index 5",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13308735


In [84]:
# GCN encoder (run 5 due to 2 day limit)
# Submits a single job that runs the benchmarking training script with a GCN
# encoder on the 50% subsample of the dataset. This job passes four values
# each for the neighbor counts, batch sizes, seeds and run indices.
for subsample_pct in [50]:
    # Task and dataset identification
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below; strings containing
    # spaces end up as several whitespace-separated tokens
    edge_batch_size_str = "4096 4096 4096 4096"
    conv_layer_encoder = "gcnconv"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 1000000.
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping: the job id becomes part of the job name and is also
    # handed to the script as timestamp suffix
    job_id = 5
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None",
        "--seeds 3 2 1 0",
        "--run_index 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13308736


In [85]:
# GCN encoder
# Submits one job per subsample percentage (25, 10, 5 and 1). Each job passes
# eight values each for the neighbor counts, batch sizes, seeds and run
# indices.
for subsample_pct in [25, 10, 5, 1]:
    # Task and dataset identification (the dataset name depends on the
    # subsample percentage of the current iteration)
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below; strings containing
    # spaces end up as several whitespace-separated tokens
    edge_batch_size_str = "4096 4096 4096 4096 4096 4096 4096 4096"
    conv_layer_encoder = "gcnconv"
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 1000000.
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping: the job id becomes part of the job name and is also
    # handed to the script as timestamp suffix. The job name additionally
    # contains the dataset name, so it differs between iterations.
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13308737
Submitted batch job 13308738
Submitted batch job 13308739
Submitted batch job 13308740


##### 4.3.2.3 GATv2 W/O FOV Embedding

In [None]:
# GATv2 encoder (run 1 due to 2 day limit)
# Submits a single job that runs the benchmarking training script with a
# GATv2 encoder on the 50% subsample of the dataset, using only "batch" as
# categorical covariate and a contrastive loss weight of 0.
for subsample_pct in [50]:
    # Task and dataset identification
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below
    edge_batch_size_str = "512"
    conv_layer_encoder = "gatv2conv"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping: the job id becomes part of the job name and is also
    # handed to the script as timestamp suffix
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 7",
        "--run_index 8",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 2 due to 2 day limit)
# Submits a single job that runs the benchmarking training script with a
# GATv2 encoder on the 50% subsample of the dataset, using only "batch" as
# categorical covariate and a contrastive loss weight of 0.
for subsample_pct in [50]:
    # Task and dataset identification
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below
    edge_batch_size_str = "512"
    conv_layer_encoder = "gatv2conv"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping. The job id has to be unique among the runs of this
    # dataset/encoder/task combination: it is the only part of the job name
    # that differs between them, and the job name determines the job file
    # and the out/err log file names. It is also passed to the script as
    # timestamp suffix. Run 2 therefore uses job id 2 (previously 1, which
    # made this run share the job file and log files of the run with job id 1).
    job_id = 2
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 6",
        "--run_index 7",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 3 due to 2 day limit)
# Submits a single job that runs the benchmarking training script with a
# GATv2 encoder on the 50% subsample of the dataset, using only "batch" as
# categorical covariate and a contrastive loss weight of 0.
for subsample_pct in [50]:
    # Task and dataset identification
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Values interpolated into the command line below
    edge_batch_size_str = "512"
    conv_layer_encoder = "gatv2conv"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.

    # Job bookkeeping. The job id has to be unique among the runs of this
    # dataset/encoder/task combination: it is the only part of the job name
    # that differs between them, and the job name determines the job file
    # and the out/err log file names. It is also passed to the script as
    # timestamp suffix. Run 3 therefore uses job id 3 (previously 1, which
    # made this run share the job file and log files of the run with job id 1).
    job_id = 3
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # One entry per command line option, in the order they are passed
    cli_options = [
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 5",
        "--run_index 6",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ]
    # Every option (including the first) is preceded by exactly one space
    script_args = "".join(f" {option}" for option in cli_options)

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 4 due to 2 day limit)
# Submits one training job (seed 4 / run index 5, 12 neighbors) for the
# 50pct subsample with "batch" as the only categorical covariate.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "512"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # Use the run number as job id. The job name, the job file, the out/err
    # log paths and the timestamp suffix are all derived from it, so reusing
    # the same id for several split runs of this dataset would make them
    # share those files.
    job_id = 4
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order.
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 4",
        "--run_index 5",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder (run 5 due to 2 day limit)
# Submits one job covering the remaining four runs (seeds 3..0 / run
# indices 4..1, with 8 8 4 4 neighbors) for the 50pct subsample with "batch"
# as the only categorical covariate.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "512 512 512 512"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    # Use the run number as job id. The job name, the job file, the out/err
    # log paths and the timestamp suffix are all derived from it, so reusing
    # the same id for several split runs of this dataset would make them
    # share those files.
    job_id = 5
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order. The list-valued
    # flags carry one entry per run (four runs here).
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None",
        "--seeds 3 2 1 0",
        "--run_index 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder
# Submits one job per smaller subsample (25, 10, 5 and 1 pct) with "batch" as
# the only categorical covariate. Each job covers all eight runs
# (seeds 7..0 / run indices 8..1).
for subsample_pct in [25, 10, 5, 1]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "256 256 256 256 256 256 256 256"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    cat_covariates_keys = "batch"
    cat_covariates_no_edges = "True"
    cat_covariates_embeds_nums = "3"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping (the dataset name keeps job names distinct
    # across loop iterations)
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order.
    # The list-valued flags carry one entry per run. The neighbor list has
    # eight entries to line up with the eight seeds / run indices, using the
    # same seed-to-neighbors pairing as the split 50pct runs
    # (seeds 7,6 -> 16; 5,4 -> 12; 3,2 -> 8; 1,0 -> 4).
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 16 16 12 12 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None None None None None",
        "--seeds 7 6 5 4 3 2 1 0",
        "--run_index 8 7 6 5 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

##### 4.3.2.1 GATv2 with FOV Embedding

In [None]:
# GATv2 encoder with batch + FOV covariates (run 1 due to 2 day limit)
# Submits one training job (seed 7 / run index 8, 16 neighbors) for the
# 50pct subsample.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "256"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    # Two covariates; the per-covariate settings below are given in the
    # same order ("batch" first, "fov" second)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 1
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order.
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 7",
        "--run_index 8",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder with batch + FOV covariates (run 2 due to 2 day limit)
# Submits one training job (seed 6 / run index 7, 16 neighbors) for the
# 50pct subsample.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "256"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    # Two covariates; the per-covariate settings below are given in the
    # same order ("batch" first, "fov" second)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 2
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order.
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 16",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 6",
        "--run_index 7",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder with batch + FOV covariates (run 3 due to 2 day limit)
# Submits one training job (seed 5 / run index 6, 12 neighbors) for the
# 50pct subsample.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "256"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    # Two covariates; the per-covariate settings below are given in the
    # same order ("batch" first, "fov" second)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 3
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order.
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 5",
        "--run_index 6",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder with batch + FOV covariates (run 4 due to 2 day limit)
# Submits one training job (seed 4 / run index 5, 12 neighbors) for the
# 50pct subsample.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "256"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    # Two covariates; the per-covariate settings below are given in the
    # same order ("batch" first, "fov" second)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 4
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order.
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 12",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None",
        "--seeds 4",
        "--run_index 5",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder with batch + FOV covariates (run 5 due to 2 day limit)
# Submits one job covering the remaining four runs (seeds 3..0 / run
# indices 4..1, with 8 8 4 4 neighbors) for the 50pct subsample.
for subsample_pct in [50]:
    # Dataset and covariate configuration
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    reference_batches = "batch1 batch2 batch3"
    cell_type_key = "cell_type"
    species = "human"
    edge_batch_size_str = "256 256 256 256"
    cat_covariates_embeds_injection = "gene_expr_decoder"
    # Two covariates; the per-covariate settings below are given in the
    # same order ("batch" first, "fov" second)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    lambda_cat_covariates_contrastive = 0.
    contrastive_logits_pos_ratio = 0.
    contrastive_logits_neg_ratio = 0.
    conv_layer_encoder = "gatv2conv"

    # Slurm job bookkeeping
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_id = 5
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line flags handed to the training script; each entry is
    # emitted with one leading space, in exactly this order. The list-valued
    # flags carry one entry per run (four runs here).
    script_args = "".join(f" {flag}" for flag in (
        "--adata_new_name None",
        "--n_neighbors_list 8 8 4 4",
        f"--edge_batch_size_list {edge_batch_size_str}",
        "--node_batch_size_list None None None None",
        "--seeds 3 2 1 0",
        "--run_index 4 3 2 1",
        f"--cell_type_key {cell_type_key}",
        "--no-filter_genes",
        "--nichenet_keep_target_genes_ratio 1.0",
        "--nichenet_max_n_target_genes_per_gp 250",
        "--include_mebocost_gps",
        f"--species {species}",
        "--gp_filter_mode subset",
        "--combine_overlap_gps",
        "--overlap_thresh_source_genes 0.9",
        "--overlap_thresh_target_genes 0.9",
        "--overlap_thresh_genes 0.9",
        f"--dataset {dataset}",
        f"--reference_batches {reference_batches}",
        "--counts_key counts",
        f"--cat_covariates_keys {cat_covariates_keys}",
        f"--cat_covariates_no_edges {cat_covariates_no_edges}",
        "--spatial_key spatial",
        "--adj_key spatial_connectivities",
        "--gp_targets_mask_key nichecompass_gp_targets",
        "--gp_sources_mask_key nichecompass_gp_sources",
        "--gp_names_key nichecompass_gp_names",
        f"--model_label {conv_layer_encoder}_{task}",
        "--active_gp_names_key nichecompass_active_gp_names",
        "--latent_key nichecompass_latent",
        "--n_addon_gp 10",
        "--active_gp_thresh_ratio 0.",
        "--gene_expr_recon_dist nb",
        f"--cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"--cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "--log_variational",
        "--node_label_method one-hop-norm",
        "--n_layers_encoder 1",
        "--n_hidden_encoder None",
        f"--conv_layer_encoder {conv_layer_encoder}",
        "--n_epochs 400",
        "--n_epochs_all_gps 25",
        "--n_epochs_no_cat_covariates_contrastive 0",
        "--lr 0.001",
        "--lambda_edge_recon 5000000.",
        "--lambda_gene_expr_recon 3000.",
        f"--lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"--contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"--contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "--lambda_group_lasso 0.",
        "--lambda_l1_masked 0.",
        "--lambda_l1_addon 0.",
        "--n_sampled_neighbors 4",
        f"--timestamp_suffix _{job_id}",
    ))

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder: submit one Slurm job per subsampled NSCLC dataset. Every job
# passes 8 seeds / run indices; the contrastive loss weight and both logits
# ratios are set to 0. in this configuration.

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256 256 256 256 256 256 256 256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [25, 10, 5, 1]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 16 16 12 12 8 8 4 4",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None",
        " --seeds 7 6 5 4 3 2 1 0",
        " --run_index 8 7 6 5 4 3 2 1",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

##### 4.3.2.1 GATv2 with FOV Embedding & Contrastive Loss

In [53]:
# GATv2 encoder with contrastive loss on the 50% subsample, job 1 of the
# split submission (runs are submitted as separate jobs due to the 2 day
# limit). This job covers seed 7 / run index 8 with 16 neighbors.

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 1
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 16",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None",
        " --seeds 7",
        " --run_index 8",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13297818


In [54]:
# GATv2 encoder with contrastive loss on the 50% subsample, job 2 of the
# split submission (runs are submitted as separate jobs due to the 2 day
# limit). This job covers seed 6 / run index 7 with 16 neighbors.

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 2
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 16",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None",
        " --seeds 6",
        " --run_index 7",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13297819


In [None]:
# GATv2 encoder with contrastive loss on the 50% subsample, job 3 of the
# split submission (runs are submitted as separate jobs due to the 2 day
# limit). This job covers seed 5 / run index 6 with 12 neighbors.

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 3
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 12",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None",
        " --seeds 5",
        " --run_index 6",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder with contrastive loss on the 50% subsample, job 4 of the
# split submission (runs are submitted as separate jobs due to the 2 day
# limit). This job covers seed 4 / run index 5 with 12 neighbors.

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 12",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None",
        " --seeds 4",
        " --run_index 5",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [None]:
# GATv2 encoder with contrastive loss on the 50% subsample, job 5 of the
# split submission (runs are submitted as separate jobs due to the 2 day
# limit). This job bundles the remaining four runs: seeds 3-0 / run indices
# 4-1 with 8, 8, 4 and 4 neighbors.

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256 256 256 256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 5
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [50]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 8 8 4 4",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None",
        " --seeds 3 2 1 0",
        " --run_index 4 3 2 1",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

In [14]:
# tmp: one-off submission of a single GATv2 run with contrastive loss on the
# 25% subsample (job 4: seed 4 / run index 5 with 12 neighbors)

# Settings shared by all subsample percentages
task = "sample_integration_method_benchmarking"
reference_batches = "batch1 batch2 batch3"
cell_type_key = "cell_type"
species = "human"
edge_batch_size_str = "256"
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"

for subsample_pct in [25]:
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"

    # Each fragment carries its own leading space, so a plain join yields the
    # full command line argument string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 12",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None",
        " --seeds 4",
        " --run_index 5",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000,
    )

Submitted batch job 13309729


In [11]:
# Temporary ("tmp") ad-hoc submission: trains NicheCompass benchmarking models
# with the GATv2 encoder on the 25% subsample of the NanoString CosMx human
# NSCLC dataset (`--run_index 4 3`, `--seeds 3 2`, `--n_neighbors_list 8 8`).
for subsample_pct in [25]:
    # Task, dataset and encoder that identify this submission
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    conv_layer_encoder = "gatv2conv"

    # Dataset description forwarded to the training script
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Categorical covariates configuration (one value per covariate key)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"

    # Loader and contrastive loss settings
    edge_batch_size_str = "256 256"
    lambda_cat_covariates_contrastive = 1000000.0
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Slurm job bookkeeping
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line of the training script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 8 8",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None",
        " --seeds 3 2",
        " --run_index 4 3",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args,
                         nice=10000)

Submitted batch job 13309692


In [12]:
# Temporary ("tmp") ad-hoc submission: trains NicheCompass benchmarking models
# with the GATv2 encoder on the 25% subsample of the NanoString CosMx human
# NSCLC dataset (`--run_index 2 1`, `--seeds 1 0`, `--n_neighbors_list 4 4`).
for subsample_pct in [25]:
    # Task, dataset and encoder that identify this submission
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    conv_layer_encoder = "gatv2conv"

    # Dataset description forwarded to the training script
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Categorical covariates configuration (one value per covariate key)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"

    # Loader and contrastive loss settings
    edge_batch_size_str = "256 256"
    lambda_cat_covariates_contrastive = 1000000.0
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Slurm job bookkeeping
    job_id = 2
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line of the training script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 4 4",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None",
        " --seeds 1 0",
        " --run_index 2 1",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args,
                         nice=10000)

Submitted batch job 13309693


In [72]:
# GATv2 encoder: trains all eight NicheCompass benchmarking runs
# (`--run_index 8 ... 1`, `--seeds 7 ... 0`) on the 5% subsample of the
# NanoString CosMx human NSCLC dataset within a single Slurm job.
for subsample_pct in [5]:
    # Task, dataset and encoder that identify this submission
    task = "sample_integration_method_benchmarking"
    dataset = f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct"
    conv_layer_encoder = "gatv2conv"

    # Dataset description forwarded to the training script
    species = "human"
    cell_type_key = "cell_type"
    reference_batches = "batch1 batch2 batch3"

    # Categorical covariates configuration (one value per covariate key)
    cat_covariates_keys = "batch fov"
    cat_covariates_no_edges = "True False"
    cat_covariates_embeds_nums = "3 30"
    cat_covariates_embeds_injection = "gene_expr_decoder"

    # Loader and contrastive loss settings (one edge batch size per run)
    edge_batch_size_str = "256 256 256 256 256 256 256 256"
    lambda_cat_covariates_contrastive = 1000000.0
    contrastive_logits_pos_ratio = 0.0625
    contrastive_logits_neg_ratio = 0.

    # Slurm job bookkeeping
    job_id = 1
    job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "train_nichecompass_benchmarking_models.py"

    # Command line of the training script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        " --adata_new_name None",
        " --n_neighbors_list 16 16 12 12 8 8 4 4",
        f" --edge_batch_size_list {edge_batch_size_str}",
        " --node_batch_size_list None None None None None None None None",
        " --seeds 7 6 5 4 3 2 1 0",
        " --run_index 8 7 6 5 4 3 2 1",
        f" --cell_type_key {cell_type_key}",
        " --no-filter_genes",
        " --nichenet_keep_target_genes_ratio 1.0",
        " --nichenet_max_n_target_genes_per_gp 250",
        " --include_mebocost_gps",
        f" --species {species}",
        " --gp_filter_mode subset",
        " --combine_overlap_gps",
        " --overlap_thresh_source_genes 0.9",
        " --overlap_thresh_target_genes 0.9",
        " --overlap_thresh_genes 0.9",
        f" --dataset {dataset}",
        f" --reference_batches {reference_batches}",
        " --counts_key counts",
        f" --cat_covariates_keys {cat_covariates_keys}",
        f" --cat_covariates_no_edges {cat_covariates_no_edges}",
        " --spatial_key spatial",
        " --adj_key spatial_connectivities",
        " --gp_targets_mask_key nichecompass_gp_targets",
        " --gp_sources_mask_key nichecompass_gp_sources",
        " --gp_names_key nichecompass_gp_names",
        f" --model_label {conv_layer_encoder}_{task}",
        " --active_gp_names_key nichecompass_active_gp_names",
        " --latent_key nichecompass_latent",
        " --n_addon_gp 10",
        " --active_gp_thresh_ratio 0.",
        " --gene_expr_recon_dist nb",
        f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        " --log_variational",
        " --node_label_method one-hop-norm",
        " --n_layers_encoder 1",
        " --n_hidden_encoder None",
        f" --conv_layer_encoder {conv_layer_encoder}",
        " --n_epochs 400",
        " --n_epochs_all_gps 25",
        " --n_epochs_no_cat_covariates_contrastive 0",
        " --lr 0.001",
        " --lambda_edge_recon 5000000.",
        " --lambda_gene_expr_recon 3000.",
        f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        " --lambda_group_lasso 0.",
        " --lambda_l1_masked 0.",
        " --lambda_l1_addon 0.",
        " --n_sampled_neighbors 4",
        f" --timestamp_suffix _{job_id}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args,
                         nice=10000)

Submitted batch job 13308724


### 4.4 Metrics Computation

#### 4.4.1 NicheCompass

In [None]:
# GCN encoder: submit one metrics computation job per seqFISH mouse
# organogenesis dataset (full dataset followed by its subsamples)
job_id = 1

# Settings shared by all submitted jobs
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis",
        *(f"seqfish_mouse_organogenesis_subsample_{pct}pct"
          for pct in (50, 25, 10, 5, 1)),
]:
    # Dataset-specific input file and job name
    file_name = f"{dataset}_nichecompass_gcnconv.h5ad"
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

In [None]:
# GATv2 encoder: submit one metrics computation job per seqFISH mouse
# organogenesis dataset (full dataset followed by its subsamples)
job_id = 1

# Settings shared by all submitted jobs
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis",
        *(f"seqfish_mouse_organogenesis_subsample_{pct}pct"
          for pct in (50, 25, 10, 5, 1)),
]:
    # Dataset-specific input file and job name
    file_name = f"{dataset}_nichecompass_gatv2conv.h5ad"
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

In [None]:
# GCN encoder: submit one metrics computation job per imputed seqFISH mouse
# organogenesis dataset (full dataset followed by its subsamples)
job_id = 1

# Settings shared by all submitted jobs
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis_imputed",
        *(f"seqfish_mouse_organogenesis_imputed_subsample_{pct}pct"
          for pct in (50, 25, 10, 5, 1)),
]:
    # Dataset-specific input file and job name
    file_name = f"{dataset}_nichecompass_gcnconv.h5ad"
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

In [None]:
# GATv2 encoder: submit one metrics computation job per imputed seqFISH mouse
# organogenesis dataset (full dataset followed by its subsamples)
job_id = 1

# Settings shared by all submitted jobs
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis_imputed",
        *(f"seqfish_mouse_organogenesis_imputed_subsample_{pct}pct"
          for pct in (50, 25, 10, 5, 1)),
]:
    # Dataset-specific input file and job name
    file_name = f"{dataset}_nichecompass_gatv2conv.h5ad"
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

In [86]:
# GCN encoder W/O FoV embedding: metrics computation for run 8
# (`--run_numbers 8`) read from the "part8" result file.
# Datasets are toggled by (un)commenting entries of the list below.
job_id = 8
for dataset in [
        #"nanostring_cosmx_human_nsclc",
        "nanostring_cosmx_human_nsclc_subsample_50pct",
        #"nanostring_cosmx_human_nsclc_subsample_25pct",
        #"nanostring_cosmx_human_nsclc_subsample_10pct",
        #"nanostring_cosmx_human_nsclc_subsample_5pct",
        #"nanostring_cosmx_human_nsclc_subsample_1pct"
]:
    # Task and annotation keys
    task = "sample_integration_method_benchmarking"
    cell_type_key = "cell_type"
    batch_key = "batch"

    # Input file and metrics to compute
    file_name = f"{dataset}_nichecompass_gcnconv_part8.h5ad"
    metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"

    # Slurm job bookkeeping
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "compute_benchmarking_metrics.py"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        " --run_numbers 8",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

Submitted batch job 13317676


In [46]:
# GCN encoder with FoV embedding: metrics computation for `--run_numbers 8`
# read from the "fov_part2" result file.
# Datasets are toggled by (un)commenting entries of the list below.
job_id = 1
for dataset in [
        "nanostring_cosmx_human_nsclc",
        #"nanostring_cosmx_human_nsclc_subsample_50pct",
        #"nanostring_cosmx_human_nsclc_subsample_25pct",
        #"nanostring_cosmx_human_nsclc_subsample_10pct",
        #"nanostring_cosmx_human_nsclc_subsample_5pct",
        #"nanostring_cosmx_human_nsclc_subsample_1pct"
]:
    # Task and annotation keys
    task = "sample_integration_method_benchmarking"
    cell_type_key = "cell_type"
    batch_key = "batch"

    # Input file and metrics to compute
    file_name = f"{dataset}_nichecompass_gcnconv_fov_part2.h5ad"
    metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"

    # Slurm job bookkeeping
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "compute_benchmarking_metrics.py"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        " --run_numbers 8",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

Submitted batch job 13310053


In [73]:
# GCN encoder with contrastive loss: metrics computation for run 8
# (`--run_numbers 8`) read from the "cont_part8" result file.
# Datasets are toggled by (un)commenting entries of the list below.
job_id = 8
for dataset in [
        #"nanostring_cosmx_human_nsclc",
        "nanostring_cosmx_human_nsclc_subsample_50pct",
        #"nanostring_cosmx_human_nsclc_subsample_25pct",
        #"nanostring_cosmx_human_nsclc_subsample_10pct",
        #"nanostring_cosmx_human_nsclc_subsample_5pct",
        #"nanostring_cosmx_human_nsclc_subsample_1pct"
]:
    # Task and annotation keys
    task = "sample_integration_method_benchmarking"
    cell_type_key = "cell_type"
    batch_key = "batch"

    # Input file and metrics to compute
    file_name = f"{dataset}_nichecompass_gcnconv_cont_part8.h5ad"
    metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"

    # Slurm job bookkeeping
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "compute_benchmarking_metrics.py"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        " --run_numbers 8",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

Submitted batch job 13316864


In [67]:
# GATv2 encoder W/O FoV embedding: metrics computation for `--run_numbers 7`
# read from the "part2" result file.
# Datasets are toggled by (un)commenting entries of the list below.
job_id = 3
for dataset in [
        "nanostring_cosmx_human_nsclc",
        #"nanostring_cosmx_human_nsclc_subsample_50pct",
        #"nanostring_cosmx_human_nsclc_subsample_25pct",
        #"nanostring_cosmx_human_nsclc_subsample_10pct",
        #"nanostring_cosmx_human_nsclc_subsample_5pct",
        #"nanostring_cosmx_human_nsclc_subsample_1pct"
]:
    # Task and annotation keys
    task = "sample_integration_method_benchmarking"
    cell_type_key = "cell_type"
    batch_key = "batch"

    # Input file and metrics to compute
    file_name = f"{dataset}_nichecompass_gatv2conv_part2.h5ad"
    metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"

    # Slurm job bookkeeping
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "compute_benchmarking_metrics.py"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        " --run_numbers 7",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

Submitted batch job 13308702


In [95]:
# GATv2 encoder with FoV embedding: metrics computation for run 8
# (`--run_numbers 8`) read from the "fov_part8" result file.
# Datasets are toggled by (un)commenting entries of the list below.
# NOTE: every entry is currently commented out, so the loop body does not run
# and no job is submitted until at least one dataset is re-enabled.
job_id = 8
for dataset in [
        #"nanostring_cosmx_human_nsclc",
        #"nanostring_cosmx_human_nsclc_subsample_50pct",
        #"nanostring_cosmx_human_nsclc_subsample_25pct",
        #"nanostring_cosmx_human_nsclc_subsample_10pct",
        #"nanostring_cosmx_human_nsclc_subsample_5pct",
        #"nanostring_cosmx_human_nsclc_subsample_1pct"
]:
    # Task and annotation keys
    task = "sample_integration_method_benchmarking"
    cell_type_key = "cell_type"
    batch_key = "batch"

    # Input file and metrics to compute
    file_name = f"{dataset}_nichecompass_gatv2conv_fov_part8.h5ad"
    metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"

    # Slurm job bookkeeping
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "compute_benchmarking_metrics.py"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        " --run_numbers 8",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

Submitted batch job 13317730


In [96]:
# GATv2 encoder with contrastive loss: metrics computation for all eight runs
# (`--run_numbers 1 2 3 4 5 6 7 8`), one Slurm job per enabled dataset.
# Datasets are toggled by (un)commenting entries of the list below.
job_id = 1
for dataset in [
        #"nanostring_cosmx_human_nsclc",
        #"nanostring_cosmx_human_nsclc_subsample_50pct",
        #"nanostring_cosmx_human_nsclc_subsample_25pct",
        "nanostring_cosmx_human_nsclc_subsample_10pct",
        "nanostring_cosmx_human_nsclc_subsample_5pct",
        "nanostring_cosmx_human_nsclc_subsample_1pct"
]:
    # Task and annotation keys
    task = "sample_integration_method_benchmarking"
    cell_type_key = "cell_type"
    batch_key = "batch"

    # Input file and metrics to compute
    file_name = f"{dataset}_nichecompass_gatv2conv_cont.h5ad"
    metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"

    # Slurm job bookkeeping
    job_name_prefix = f"nichecompass_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    job_folder_path = f"../scripts/{task}/slurm_jobs"
    script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
    script_name = "compute_benchmarking_metrics.py"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key nichecompass_latent",
        " --run_numbers 1 2 3 4 5 6 7 8",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

Submitted batch job 13317748
Submitted batch job 13317749
Submitted batch job 13317750


#### 4.4.2 GraphST

In [None]:
# GraphST: submit one metrics computation job per seqFISH mouse organogenesis
# dataset (full dataset followed by its subsamples)
job_id = 1

# Settings shared by all submitted jobs
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis",
        *(f"seqfish_mouse_organogenesis_subsample_{pct}pct"
          for pct in (50, 25, 10, 5, 1)),
]:
    # Dataset-specific input file and job name
    file_name = f"{dataset}_graphst.h5ad"
    job_name_prefix = f"graphst_sample_integration_method_benchmarking_{dataset}_metrics_computation"

    # Command line of the metrics script; every fragment carries its own
    # leading space so that plain concatenation yields the final string
    script_args = "".join([
        f" --dataset {dataset}",
        f" --task {task}",
        f" --file_name {file_name}",
        f" --cell_type_key {cell_type_key}",
        f" --batch_key {batch_key}",
        " --latent_key graphst_latent",
        f" --metrics {metrics}",
    ])

    # Write the Slurm job file and submit it
    submit_python_script(job_name_prefix=job_name_prefix,
                         job_id=job_id,
                         job_folder_path=job_folder_path,
                         conda_env_name=reproducibility_conda_env_name,
                         script_folder_path=script_folder_path,
                         script_name=script_name,
                         script_args=script_args)

In [None]:
# PASTE alignment
# Submit one benchmarking metrics computation job per seqFISH mouse
# organogenesis dataset for the GraphST results stored in the
# '*_graphst_paste.h5ad' files. The job name prefix is the same as for the
# '*_graphst.h5ad' jobs, so the job ID (2 instead of 1) keeps the job names
# apart.
job_id = 2

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis",
        "seqfish_mouse_organogenesis_subsample_50pct",
        "seqfish_mouse_organogenesis_subsample_25pct",
        "seqfish_mouse_organogenesis_subsample_10pct",
        "seqfish_mouse_organogenesis_subsample_5pct",
        "seqfish_mouse_organogenesis_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_graphst_paste.h5ad"
    job_name_prefix = f"graphst_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

In [None]:
# Submit one benchmarking metrics computation job per imputed seqFISH mouse
# organogenesis dataset (full dataset and subsamples) for the GraphST results.
job_id = 1

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis_imputed",
        "seqfish_mouse_organogenesis_imputed_subsample_50pct",
        "seqfish_mouse_organogenesis_imputed_subsample_25pct",
        "seqfish_mouse_organogenesis_imputed_subsample_10pct",
        "seqfish_mouse_organogenesis_imputed_subsample_5pct",
        "seqfish_mouse_organogenesis_imputed_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_graphst.h5ad"
    job_name_prefix = f"graphst_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

In [None]:
# PASTE alignment
# Submit the benchmarking metrics computation job for the GraphST results
# stored in the '*_graphst_paste.h5ad' file of the imputed seqFISH mouse
# organogenesis dataset. Only the full dataset is active; the subsamples are
# commented out below.
job_id = 2

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis_imputed",
        #"seqfish_mouse_organogenesis_imputed_subsample_50pct",
        #"seqfish_mouse_organogenesis_imputed_subsample_25pct",
        #"seqfish_mouse_organogenesis_imputed_subsample_10pct",
        #"seqfish_mouse_organogenesis_imputed_subsample_5pct",
        #"seqfish_mouse_organogenesis_imputed_subsample_1pct"
]:
    # Dataset-specific settings
    file_name = f"{dataset}_graphst_paste.h5ad"
    job_name_prefix = f"graphst_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

In [None]:
# Submit one benchmarking metrics computation job per NanoString CosMx human
# NSCLC subsample (10%, 5% and 1%) for the GraphST results.
job_id = 1

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "nanostring_cosmx_human_nsclc_subsample_10pct",
        "nanostring_cosmx_human_nsclc_subsample_5pct",
        "nanostring_cosmx_human_nsclc_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_graphst.h5ad"
    job_name_prefix = f"graphst_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

In [None]:
import scanpy as sc

# Make the observation names of the GraphST PASTE results unique and
# overwrite the original files in place.
for subsample_pct in [1, 5, 10]:
    h5ad_path = (
        "../artifacts/sample_integration_method_benchmarking/"
        f"nanostring_cosmx_human_nsclc_subsample_{subsample_pct}pct_graphst_paste.h5ad"
    )
    adata = sc.read_h5ad(h5ad_path)
    adata.obs_names_make_unique()
    adata.write(h5ad_path)

In [None]:
# PASTE alignment
# Submit one benchmarking metrics computation job per NanoString CosMx human
# NSCLC subsample (10%, 5% and 1%) for the GraphST results stored in the
# '*_graphst_paste.h5ad' files. The job name prefix is the same as for the
# '*_graphst.h5ad' jobs, so the job ID (2 instead of 1) keeps the job names
# apart.
job_id = 2

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "nanostring_cosmx_human_nsclc_subsample_10pct",
        "nanostring_cosmx_human_nsclc_subsample_5pct",
        "nanostring_cosmx_human_nsclc_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_graphst_paste.h5ad"
    job_name_prefix = f"graphst_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key graphst_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

#### 4.4.3 scVI

In [None]:
# Submit one benchmarking metrics computation job per seqFISH mouse
# organogenesis dataset (full dataset and subsamples) for the scVI results.
job_id = 1

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis",
        "seqfish_mouse_organogenesis_subsample_50pct",
        "seqfish_mouse_organogenesis_subsample_25pct",
        "seqfish_mouse_organogenesis_subsample_10pct",
        "seqfish_mouse_organogenesis_subsample_5pct",
        "seqfish_mouse_organogenesis_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_scvi.h5ad"
    job_name_prefix = f"scvi_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

In [None]:
# Submit one benchmarking metrics computation job per imputed seqFISH mouse
# organogenesis dataset (full dataset and subsamples) for the scVI results.
job_id = 1

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "seqfish_mouse_organogenesis_imputed",
        "seqfish_mouse_organogenesis_imputed_subsample_50pct",
        "seqfish_mouse_organogenesis_imputed_subsample_25pct",
        "seqfish_mouse_organogenesis_imputed_subsample_10pct",
        "seqfish_mouse_organogenesis_imputed_subsample_5pct",
        "seqfish_mouse_organogenesis_imputed_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_scvi.h5ad"
    job_name_prefix = f"scvi_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

In [None]:
import scanpy as sc

# Load each subsampled scVI result and make its observation names unique in
# memory. Writing the result back to disk is disabled (commented out), so the
# files themselves are left untouched.
for i in [50, 25, 10, 5, 1]:
    adata = sc.read_h5ad(
        "../artifacts/sample_integration_method_benchmarking/"
        f"nanostring_cosmx_human_nsclc_subsample_{i}pct_scvi.h5ad"
    )
    adata.obs_names_make_unique()
    #adata.write(f"../artifacts/sample_integration_method_benchmarking/nanostring_cosmx_human_nsclc_subsample_{i}pct_scvi.h5ad")

In [None]:
# Make the observation names of the scVI result for the full NanoString CosMx
# human NSCLC dataset unique and overwrite the original file in place.
# Relies on `sc` (scanpy) having been imported by an earlier cell.
h5ad_path = (
    "../artifacts/sample_integration_method_benchmarking/"
    "nanostring_cosmx_human_nsclc_scvi.h5ad"
)
adata = sc.read_h5ad(h5ad_path)
adata.obs_names_make_unique()
adata.write(h5ad_path)

In [None]:
import scanpy as sc

# Make the observation names of the subsampled NicheCompass 'gcnconv_fov'
# results unique and overwrite the original files in place.
for i in [50, 25, 10, 5, 1]:
    h5ad_path = (
        "../artifacts/sample_integration_method_benchmarking/"
        f"nanostring_cosmx_human_nsclc_subsample_{i}pct_nichecompass_gcnconv_fov.h5ad"
    )
    adata = sc.read_h5ad(h5ad_path)
    adata.obs_names_make_unique()
    adata.write(h5ad_path)

In [None]:
# Make the observation names of the NicheCompass 'gcnconv_fov' result for the
# full NanoString CosMx human NSCLC dataset unique and overwrite the original
# file in place.
# Relies on `sc` (scanpy) having been imported by an earlier cell.
h5ad_path = (
    "../artifacts/sample_integration_method_benchmarking/"
    "nanostring_cosmx_human_nsclc_nichecompass_gcnconv_fov.h5ad"
)
adata = sc.read_h5ad(h5ad_path)
adata.obs_names_make_unique()
adata.write(h5ad_path)

In [None]:
# Submit one benchmarking metrics computation job per NanoString CosMx human
# NSCLC dataset (full dataset and subsamples) for the scVI results.
job_id = 1

# Settings that are identical for every submitted job
task = "sample_integration_method_benchmarking"
cell_type_key = "cell_type"
batch_key = "batch"
metrics = "gcs mlami cas clisis nasw cnmi cari casw clisi basw bgc blisi"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "compute_benchmarking_metrics.py"

for dataset in [
        "nanostring_cosmx_human_nsclc",
        "nanostring_cosmx_human_nsclc_subsample_50pct",
        "nanostring_cosmx_human_nsclc_subsample_25pct",
        "nanostring_cosmx_human_nsclc_subsample_10pct",
        "nanostring_cosmx_human_nsclc_subsample_5pct",
        "nanostring_cosmx_human_nsclc_subsample_1pct",
]:
    # Dataset-specific settings
    file_name = f"{dataset}_scvi.h5ad"
    job_name_prefix = f"scvi_sample_integration_method_benchmarking_{dataset}_metrics_computation"
    script_args = (
        f" --dataset {dataset}"
        f" --task {task}"
        f" --file_name {file_name}"
        f" --cell_type_key {cell_type_key}"
        f" --batch_key {batch_key}"
        " --latent_key scvi_latent"
        f" --metrics {metrics}"
    )

    submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=reproducibility_conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

## 5. NicheCompass Reference Model

### 5.1 seqFISH Mouse Organogenesis Imputed

In [None]:
# Submit one NicheCompass reference model training job per combination of the
# hyperparameter values listed in the nested loops below. Every list
# currently holds a single value, so exactly one job is submitted. 'job_id' is
# incremented after each submission and is also passed to the training script
# as timestamp suffix.
job_id = 1
for lambda_edge_recon, lambda_gene_expr_recon in zip([5000000], [3000]):
    for active_gp_thresh_ratio in [0.]:
        for lambda_l1_masked, lambda_l1_addon in zip([0], [0]):
            for n_addon_gp in [10]:
                for n_svg in [3000]:
                    for conv_layer_encoder in ["gcnconv"]:
                        for n_neighbors in [4]:
                            # Settings that are fixed across the grid
                            task = "reference"
                            dataset = "seqfish_mouse_organogenesis_imputed"
                            reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6"
                            n_sampled_neighbors = 4
                            n_hvg = 0
                            species = "mouse"
                            node_label_method = "one-hop-norm"
                            edge_batch_size = "256" # "512"
                            cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
                            cat_covariates_keys = "batch"
                            cat_covariates_no_edges = "True"
                            cat_covariates_embeds_nums = "3"
                            lambda_cat_covariates_contrastive = 0.
                            contrastive_logits_pos_ratio = 0.
                            contrastive_logits_neg_ratio = 0.

                            job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
                            job_folder_path = f"../scripts/{task}/slurm_jobs"
                            script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                            script_name = "train_nichecompass_reference_model.py"
                            # Command line arguments passed to the training script
                            script_args = f" --dataset {dataset}" \
                                          f" --reference_batches {reference_batches}" \
                                          f" --n_neighbors {n_neighbors}" \
                                          " --filter_genes" \
                                          f" --n_hvg {n_hvg}" \
                                          f" --n_svg {n_svg}" \
                                          " --nichenet_keep_target_genes_ratio 1.0" \
                                          " --nichenet_max_n_target_genes_per_gp 250" \
                                          " --include_mebocost_gps" \
                                          f" --species {species}" \
                                          " --gp_filter_mode subset" \
                                          " --combine_overlap_gps" \
                                          " --overlap_thresh_source_genes 0.9" \
                                          " --overlap_thresh_target_genes 0.9" \
                                          " --overlap_thresh_genes 0.9" \
                                          " --counts_key counts" \
                                          " --condition_key batch" \
                                          f" --cat_covariates_keys {cat_covariates_keys}" \
                                          f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
                                          " --spatial_key spatial" \
                                          " --adj_key spatial_connectivities" \
                                          " --mapping_entity_key mapping_entity" \
                                          " --gp_targets_mask_key nichecompass_gp_targets" \
                                          " --gp_sources_mask_key nichecompass_gp_sources" \
                                          " --gp_names_key nichecompass_gp_names" \
                                          f" --model_label {task}" \
                                          " --active_gp_names_key nichecompass_active_gp_names" \
                                          " --latent_key nichecompass_latent" \
                                          f" --n_addon_gp {n_addon_gp}" \
                                          f" --active_gp_thresh_ratio {active_gp_thresh_ratio}" \
                                          " --gene_expr_recon_dist nb" \
                                          f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
                                          f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
                                          " --log_variational" \
                                          f" --node_label_method {node_label_method}" \
                                          " --n_layers_encoder 1" \
                                          " --n_hidden_encoder None" \
                                          f" --conv_layer_encoder {conv_layer_encoder}" \
                                          " --n_epochs 100" \
                                          " --n_epochs_all_gps 25" \
                                          " --n_epochs_no_cat_covariates_contrastive 0" \
                                          " --lr 0.001" \
                                          f" --lambda_edge_recon {lambda_edge_recon}" \
                                          f" --lambda_gene_expr_recon {lambda_gene_expr_recon}" \
                                          f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
                                          f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
                                          f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
                                          " --lambda_group_lasso 0." \
                                          f" --lambda_l1_masked {lambda_l1_masked}" \
                                          f" --lambda_l1_addon {lambda_l1_addon}" \
                                          f" --edge_batch_size {edge_batch_size}" \
                                          " --node_batch_size None" \
                                          f" --n_sampled_neighbors {n_sampled_neighbors}" \
                                          f" --timestamp_suffix _{job_id}"
                            # NOTE: the '--lambda_l1_addon' value is inserted
                            # without a trailing '.', so that float grid
                            # values (e.g. 0.5) do not turn into an
                            # unparsable token such as '0.5.'.

                            submit_python_script(
                                    job_name_prefix=job_name_prefix,
                                    job_id=job_id,
                                    job_folder_path=job_folder_path,
                                    conda_env_name=conda_env_name,
                                    script_folder_path=script_folder_path,
                                    script_name=script_name,
                                    script_args=script_args,
                                    nice=10000)

                            job_id += 1

In [None]:
# tmp
# Temporary variant of the seqFISH mouse organogenesis imputed reference model
# job: only 'batch3' and 'batch4' are used as reference batches, no
# categorical covariates keys are passed ('None') and job IDs start at 9998.
# One job is submitted per combination of the hyperparameter values listed in
# the nested loops below (currently a single combination).
job_id = 9998
for lambda_edge_recon, lambda_gene_expr_recon in zip([5000000], [3000]):
    for active_gp_thresh_ratio in [0.01]:
        for lambda_l1_masked, lambda_l1_addon in zip([0], [0]):
            for n_addon_gp in [100]:
                for n_svg in [5000]:
                    for conv_layer_encoder in ["gcnconv"]:
                        for n_neighbors in [16]:
                            # Settings that are fixed across the grid
                            task = "reference"
                            dataset = "seqfish_mouse_organogenesis_imputed"
                            reference_batches = "batch3 batch4"
                            n_sampled_neighbors = 4
                            n_hvg = 0
                            species = "mouse"
                            node_label_method = "one-hop-norm"
                            edge_batch_size = "256" # "512"
                            cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
                            cat_covariates_keys = "None"
                            cat_covariates_no_edges = "True"
                            cat_covariates_embeds_nums = "3"
                            lambda_cat_covariates_contrastive = 0.
                            contrastive_logits_pos_ratio = 0.
                            contrastive_logits_neg_ratio = 0.

                            job_name_prefix = f"{dataset}_nichecompass_{task}"
                            job_folder_path = f"../scripts/{task}/slurm_jobs"
                            script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
                            script_name = "train_nichecompass_reference_model.py"
                            # Command line arguments passed to the training script
                            script_args = f" --dataset {dataset}" \
                                          f" --reference_batches {reference_batches}" \
                                          f" --n_neighbors {n_neighbors}" \
                                          " --filter_genes" \
                                          f" --n_hvg {n_hvg}" \
                                          f" --n_svg {n_svg}" \
                                          " --nichenet_keep_target_genes_ratio 1.0" \
                                          " --nichenet_max_n_target_genes_per_gp 250" \
                                          " --include_mebocost_gps" \
                                          f" --species {species}" \
                                          " --gp_filter_mode subset" \
                                          " --combine_overlap_gps" \
                                          " --overlap_thresh_source_genes 0.9" \
                                          " --overlap_thresh_target_genes 0.9" \
                                          " --overlap_thresh_genes 0.9" \
                                          " --counts_key counts" \
                                          " --condition_key batch" \
                                          f" --cat_covariates_keys {cat_covariates_keys}" \
                                          f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
                                          " --spatial_key spatial" \
                                          " --adj_key spatial_connectivities" \
                                          " --mapping_entity_key mapping_entity" \
                                          " --gp_targets_mask_key nichecompass_gp_targets" \
                                          " --gp_sources_mask_key nichecompass_gp_sources" \
                                          " --gp_names_key nichecompass_gp_names" \
                                          f" --model_label {task}" \
                                          " --active_gp_names_key nichecompass_active_gp_names" \
                                          " --latent_key nichecompass_latent" \
                                          f" --n_addon_gp {n_addon_gp}" \
                                          f" --active_gp_thresh_ratio {active_gp_thresh_ratio}" \
                                          " --gene_expr_recon_dist nb" \
                                          f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
                                          f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
                                          " --log_variational" \
                                          f" --node_label_method {node_label_method}" \
                                          " --n_layers_encoder 1" \
                                          " --n_hidden_encoder None" \
                                          f" --conv_layer_encoder {conv_layer_encoder}" \
                                          " --n_epochs 100" \
                                          " --n_epochs_all_gps 25" \
                                          " --n_epochs_no_cat_covariates_contrastive 0" \
                                          " --lr 0.001" \
                                          f" --lambda_edge_recon {lambda_edge_recon}" \
                                          f" --lambda_gene_expr_recon {lambda_gene_expr_recon}" \
                                          f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
                                          f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
                                          f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
                                          " --lambda_group_lasso 0." \
                                          f" --lambda_l1_masked {lambda_l1_masked}" \
                                          f" --lambda_l1_addon {lambda_l1_addon}" \
                                          f" --edge_batch_size {edge_batch_size}" \
                                          " --node_batch_size None" \
                                          f" --n_sampled_neighbors {n_sampled_neighbors}" \
                                          f" --timestamp_suffix _{job_id}"
                            # NOTE: the '--lambda_l1_addon' value is inserted
                            # without a trailing '.', so that float grid
                            # values (e.g. 0.5) do not turn into an
                            # unparsable token such as '0.5.'.

                            submit_python_script(
                                    job_name_prefix=job_name_prefix,
                                    job_id=job_id,
                                    job_folder_path=job_folder_path,
                                    conda_env_name=conda_env_name,
                                    script_folder_path=script_folder_path,
                                    script_name=script_name,
                                    script_args=script_args,
                                    nice=10000)

                            job_id += 1

### 5.2 STARmap PLUS Mouse Central Nervous System

In [None]:
# Submit a single NicheCompass reference model training job for the STARmap
# PLUS mouse CNS dataset, using batches 1 to 20 as reference batches.
task = "reference"
dataset = "starmap_plus_mouse_cns"
job_id = 1
# Space-separated batch names: "batch1 batch2 ... batch20"
reference_batches = " ".join(f"batch{batch_idx}" for batch_idx in range(1, 21))
n_neighbors = 8 # 8, 12
n_sampled_neighbors = 4
species = "mouse"
node_label_method = "one-hop-norm"
edge_batch_size = 512 # 2048
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "20"
lambda_cat_covariates_contrastive = 0. # 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0. # 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"
# Command line arguments passed to the training script
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 1.0"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {task}"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gatv2conv"
    " --n_epochs 35"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --n_sampled_neighbors {n_sampled_neighbors}"
    f" --timestamp_suffix _{job_id}"
)

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000)

In [None]:
# Reference-model training job for the STARmap PLUS mouse CNS dataset.
# All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "starmap_plus_mouse_cns"
job_id = 2
species = "mouse"
reference_batches = "batch1 batch2 batch3"

# Neighborhood graph and data loader settings
n_neighbors = 8  # other values noted: 8, 12
n_sampled_neighbors = 4
edge_batch_size = 512  # other value noted: 2048
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0  # other values noted: 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.0  # other values noted: 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location
job_name_prefix = "_".join([dataset, "nichecompass", task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gatv2conv",
        "n_epochs 100",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.3 NanoString CosMx SMI Human Non-Small-Cell Lung Cancer (NSCLC)

In [17]:
# Reference-model training job for the NanoString CosMx human NSCLC dataset.
# All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "nanostring_cosmx_human_nsclc"
job_id = 2
species = "human"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"

# Neighborhood graph and data loader settings
n_neighbors = 4
n_sampled_neighbors = 4
edge_batch_size = 512  # other values noted: 4096, 2048, 512
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch fov patient"
cat_covariates_no_edges = "True False True"
cat_covariates_embeds_nums = "3 30 5"
cat_covariates_embeds_injection = "gene_expr_decoder"  # other value noted: "encoder gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 1000000.0
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location
job_name_prefix = "_".join([dataset, "nichecompass", task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "n_addon_gp 100",
        "active_gp_thresh_ratio 0.03",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gatv2conv",
        "n_epochs 400",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        "lambda_l1_addon 1000.",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308502


#### 5.3.2 Modified Reference

In [71]:
# Reference-model training job for the "modified" NanoString CosMx human
# NSCLC dataset. All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "nanostring_cosmx_human_nsclc_modified"
job_id = 5
species = "human"
reference_batches = "batch1 batch2"

# Neighborhood graph and data loader settings
n_neighbors = 4
n_sampled_neighbors = 4
edge_batch_size = 512  # other values noted: 4096, 2048, 512
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 30"
cat_covariates_embeds_injection = "gene_expr_decoder"  # other value noted: "encoder gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location
job_name_prefix = "_".join([dataset, "nichecompass", task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "n_addon_gp 10",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gatv2conv",
        "n_epochs 400",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        "lambda_l1_addon 0.",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13308707


In [None]:
# Reference-model training job for the "filtered" NanoString CosMx human
# NSCLC dataset, using two reference batches with batch and fov covariates.
# All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "nanostring_cosmx_human_nsclc_filtered"
job_id = 2
species = "human"
reference_batches = "batch1 batch2"

# Neighborhood graph and data loader settings
n_neighbors = 8  # other values noted: 4, 8, 12
n_sampled_neighbors = 4
edge_batch_size = 1024  # other values noted: 4096, 2048, 512
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch fov"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "3 20"
cat_covariates_embeds_injection = "gene_expr_decoder"  # other value noted: "encoder gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location; the job name also carries the node label method.
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {node_label_method}_{task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gatv2conv",
        "n_epochs 100",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Reference-model training job for the "filtered" NanoString CosMx human
# NSCLC dataset, using eight reference batches with batch as the only covariate.
# All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "nanostring_cosmx_human_nsclc_filtered"
job_id = 3
species = "human"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"

# Neighborhood graph and data loader settings
n_neighbors = 8  # other values noted: 4, 8, 12
n_sampled_neighbors = 4
edge_batch_size = 1024  # other values noted: 4096, 2048, 512
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"
cat_covariates_embeds_injection = "gene_expr_decoder"  # other value noted: "encoder gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location; the job name also carries the node label method.
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {node_label_method}_{task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gatv2conv",
        "n_epochs 100",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Reference-model training job for the "filtered" NanoString CosMx human
# NSCLC dataset, using eight reference batches with batch, fov and patient
# covariates. All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "nanostring_cosmx_human_nsclc_filtered"
job_id = 4
species = "human"
reference_batches = "batch1 batch2 batch3 batch4 batch5 batch6 batch7 batch8"

# Neighborhood graph and data loader settings
n_neighbors = 8  # other values noted: 4, 8, 12
n_sampled_neighbors = 4
edge_batch_size = 1024  # other values noted: 4096, 2048, 512
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch fov patient"
cat_covariates_no_edges = "True False True"
cat_covariates_embeds_nums = "3 20 5"
cat_covariates_embeds_injection = "gene_expr_decoder"  # other value noted: "encoder gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0
contrastive_logits_pos_ratio = 0.0
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location; the job name also carries the node label method.
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {node_label_method}_{task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gatv2conv",
        "n_epochs 100",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.4 Vizgen MERFISH Human Ovarian Cancer

In [None]:
# Reference-model training job for the Vizgen MERFISH human ovarian cancer
# dataset, with patient and batch covariates.
# All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "vizgen_merfish_human_ovarian_cancer"
job_id = 1
species = "human"
reference_batches = "batch1 batch2 batch3 batch4"

# Neighborhood graph and data loader settings
n_neighbors = 8  # other values noted: 8, 12
n_sampled_neighbors = 4
edge_batch_size = 1024  # other value noted: 2048
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "patient batch"
cat_covariates_no_edges = "True True"
cat_covariates_embeds_nums = "2 4"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0  # other values noted: 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.0  # other values noted: 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location; the job name also carries the node label method.
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {node_label_method}_{task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gcnconv",
        "n_epochs 100",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

In [None]:
# Reference-model training job for the Vizgen MERFISH human ovarian cancer
# dataset, with batch as the only covariate.
# All names are assigned at notebook (global) scope.

# Run identity
task = "reference"
dataset = "vizgen_merfish_human_ovarian_cancer"
job_id = 2
species = "human"
reference_batches = "batch1 batch2 batch3 batch4"

# Neighborhood graph and data loader settings
n_neighbors = 8  # other values noted: 8, 12
n_sampled_neighbors = 4
edge_batch_size = 1024  # other value noted: 2048
node_label_method = "one-hop-norm"

# Categorical covariate settings
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "4"
cat_covariates_embeds_injection = "gene_expr_decoder"

# Loss weights that are interpolated into the command line
lambda_cat_covariates_contrastive = 0.0  # other values noted: 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.0  # other values noted: 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.0
lambda_l1_masked = 0.0

# Job naming and script location; the job name also carries the node label method.
job_name_prefix = "_".join([dataset, "nichecompass", node_label_method, task])
job_folder_path = "../scripts/" + task + "/slurm_jobs"
script_folder_path = (
    "/home/aih/sebastian.birk/workspace/projects/"
    "nichecompass-reproducibility/scripts/" + task
)
script_name = "train_nichecompass_reference_model.py"

# Each entry is "<flag>[ <value>]"; the join prefixes every entry with " --",
# so the resulting string starts with a space and keeps the entry order.
script_args = "".join(
    f" --{arg}"
    for arg in [
        f"dataset {dataset}",
        f"reference_batches {reference_batches}",
        f"n_neighbors {n_neighbors}",
        "no-filter_genes",
        "nichenet_keep_target_genes_ratio 1.0",
        "nichenet_max_n_target_genes_per_gp 250",
        "include_mebocost_gps",
        f"species {species}",
        "gp_filter_mode subset",
        "combine_overlap_gps",
        "overlap_thresh_source_genes 0.9",
        "overlap_thresh_target_genes 0.9",
        "overlap_thresh_genes 0.9",
        "counts_key counts",
        f"cat_covariates_keys {cat_covariates_keys}",
        f"cat_covariates_no_edges {cat_covariates_no_edges}",
        "spatial_key spatial",
        "adj_key spatial_connectivities",
        "mapping_entity_key mapping_entity",
        "gp_targets_mask_key nichecompass_gp_targets",
        "gp_sources_mask_key nichecompass_gp_sources",
        "gp_names_key nichecompass_gp_names",
        f"model_label {node_label_method}_{task}",
        "active_gp_names_key nichecompass_active_gp_names",
        "latent_key nichecompass_latent",
        "active_gp_thresh_ratio 0.",
        "gene_expr_recon_dist nb",
        f"cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
        f"cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
        "log_variational",
        f"node_label_method {node_label_method}",
        "n_layers_encoder 1",
        "n_hidden_encoder None",
        "conv_layer_encoder gcnconv",
        "n_epochs 100",
        "n_epochs_all_gps 25",
        "n_epochs_no_cat_covariates_contrastive 0",
        "lr 0.001",
        "lambda_edge_recon 5000000.",
        "lambda_gene_expr_recon 3000.",
        f"lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
        f"contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
        f"contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
        "lambda_group_lasso 0.",
        f"lambda_l1_masked {lambda_l1_masked}",
        f"edge_batch_size {edge_batch_size}",
        "node_batch_size None",
        f"n_sampled_neighbors {n_sampled_neighbors}",
        f"timestamp_suffix _{job_id}",
    ]
)

# Hand the assembled job description to the notebook's submission helper.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.5 Vizgen MERFISH Human Lung Cancer

In [None]:
# Submit a single NicheCompass reference-model training job for this section.
# NOTE(review): `dataset` names the mouse liver data while `species` is
# "human" and the section heading says human lung cancer -- confirm which of
# the three is intended before submitting.
# NOTE(review): `cat_covariates_keys` and `cat_covariates_no_edges` are used
# below but not assigned in this cell; they must already exist in the
# notebook session (otherwise this raises NameError).

# Job identity and data selection.
task = "reference"
dataset = "vizgen_merfish_mouse_liver"
species = "human"
reference_batches = "batch1 batch2"
job_id = 1

# Values interpolated into the training script arguments.
n_neighbors = 4  # other values noted by the author: 8, 12
node_label_method = "one-hop-norm"
edge_batch_size = 4096  # other value noted by the author: 2048
cat_covariates_embeds_injection = "encoder gene_expr_decoder"
cat_covariates_embeds_nums = "2"
lambda_cat_covariates_contrastive = 250000.  # other values noted: 0., 10000, 100000, 500000
contrastive_logits_pos_ratio = 0.125  # other values noted: 0., 0.125, 0.0625, 0.03125
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Slurm job naming and location of the script to run.
script_name = "train_nichecompass_reference_model.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

# Command-line argument string; every fragment carries its own leading space.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {node_label_method}_{task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.05",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gcnconv",
    " --n_epochs 100",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 500000.",
    " --lambda_gene_expr_recon 300.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.5 Spatial ATAC-RNA-Seq Mouse Embryo & Brain

In [None]:
# Hyperparameter sweep for the spatial ATAC-RNA-seq mouse brain reference
# model. One Slurm job is submitted per combination of the looped values;
# job_id starts at 1 and is incremented after every submission.
job_id = 1

# Settings that are identical for every job of the sweep.
task = "reference"
dataset = "spatial_atac_rna_seq_mouse_brain_batch2"
reference_batches = "None"
n_neighbors = 8
species = "mouse"
node_label_method = "one-hop-sum"
conv_layer_encoder = "gatv2conv"
edge_batch_size = "256"
n_sampled_neighbors = 4
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
cat_covariates_embeds_injection = "None"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = 0
n_hvg = 0
n_addon_gp = 20

# Slurm job naming and location of the script to run.
script_name = "train_nichecompass_reference_model.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

# The two zip() loops vary their paired values together rather than crossing
# them; all other loops are fully crossed.
for lambda_edge_recon, lambda_gene_expr_recon in zip((5000000, 5000000), (3000, 5000)):
    for lambda_chrom_access_recon in (1000, 3000, 5000):
        for active_gp_thresh_ratio in (0.03, 0.05, 0.1):
            for lambda_l1_masked, lambda_l1_addon in zip((0, 50), (0, 50)):
                for n_svg in (3000,):
                    for n_svp in (0, 3000, 15000):
                        # Every fragment carries its own leading space.
                        script_args = "".join([
                            f" --dataset {dataset}",
                            f" --reference_batches {reference_batches}",
                            f" --n_neighbors {n_neighbors}",
                            " --filter_genes",
                            f" --n_hvg {n_hvg}",
                            f" --n_svg {n_svg}",
                            f" --n_svp {n_svp}",
                            " --nichenet_keep_target_genes_ratio 1.0",
                            " --nichenet_max_n_target_genes_per_gp 250",
                            " --include_mebocost_gps",
                            " --include_collectri_gps",
                            " --include_brain_marker_gps",
                            f" --species {species}",
                            " --gp_filter_mode subset",
                            " --combine_overlap_gps",
                            " --overlap_thresh_source_genes 0.9",
                            " --overlap_thresh_target_genes 0.9",
                            " --overlap_thresh_genes 0.9",
                            " --counts_key counts",
                            " --condition_key batch",
                            f" --cat_covariates_keys {cat_covariates_keys}",
                            f" --cat_covariates_no_edges {cat_covariates_no_edges}",
                            " --spatial_key spatial",
                            " --adj_key spatial_connectivities",
                            " --mapping_entity_key mapping_entity",
                            " --gp_targets_mask_key nichecompass_gp_targets",
                            " --gp_sources_mask_key nichecompass_gp_sources",
                            " --gp_names_key nichecompass_gp_names",
                            " --include_atac_modality",
                            " --filter_peaks",
                            " --min_cell_peak_thresh_ratio 0.01",
                            f" --model_label {node_label_method}_{task}",
                            " --active_gp_names_key nichecompass_active_gp_names",
                            " --latent_key nichecompass_latent",
                            f" --n_addon_gp {n_addon_gp}",
                            f" --active_gp_thresh_ratio {active_gp_thresh_ratio}",
                            " --gene_expr_recon_dist nb",
                            f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
                            f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
                            " --log_variational",
                            f" --node_label_method {node_label_method}",
                            " --n_layers_encoder 1",
                            " --n_hidden_encoder None",
                            f" --conv_layer_encoder {conv_layer_encoder}",
                            " --n_epochs 100",
                            " --n_epochs_all_gps 25",
                            " --n_epochs_no_cat_covariates_contrastive 0",
                            " --lr 0.001",
                            f" --lambda_edge_recon {lambda_edge_recon}",
                            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                            f" --lambda_chrom_access_recon {lambda_chrom_access_recon}",
                            f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
                            f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
                            f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
                            " --lambda_group_lasso 0.",
                            f" --lambda_l1_masked {lambda_l1_masked}",
                            # The trailing "." is part of the emitted value (e.g. "50.").
                            f" --lambda_l1_addon {lambda_l1_addon}.",
                            f" --edge_batch_size {edge_batch_size}",
                            " --node_batch_size None",
                            f" --n_sampled_neighbors {n_sampled_neighbors}",
                            f" --timestamp_suffix _{job_id}",
                        ])

                        submit_python_script(
                            job_name_prefix=job_name_prefix,
                            job_id=job_id,
                            job_folder_path=job_folder_path,
                            conda_env_name=conda_env_name,
                            script_folder_path=script_folder_path,
                            script_name=script_name,
                            script_args=script_args,
                            nice=10000,
                        )

                        job_id += 1

In [None]:
# Same loop template as a hyperparameter sweep, but every loop holds a single
# value, so exactly one job (job_id 2) is submitted and job_id ends at 3.
# This configuration passes --no-include_atac_modality.
job_id = 2

# Settings that do not depend on the looped values.
task = "reference"
dataset = "spatial_atac_rna_seq_mouse_brain_batch2"
reference_batches = "None"
n_neighbors = 8
species = "mouse"
node_label_method = "one-hop-norm"
conv_layer_encoder = "gatv2conv"
edge_batch_size = "256"
n_sampled_neighbors = 8
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
cat_covariates_embeds_injection = "None"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = 0
n_hvg = 0
n_addon_gp = 100

# Slurm job naming and location of the script to run.
script_name = "train_nichecompass_reference_model.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"

for lambda_edge_recon, lambda_gene_expr_recon in zip((5000000,), (3000,)):
    for lambda_chrom_access_recon in (1000,):
        for active_gp_thresh_ratio in (0.03,):
            for lambda_l1_masked, lambda_l1_addon in zip((0,), (0,)):
                for n_svg in (3000,):
                    for n_svp in (0,):
                        # Every fragment carries its own leading space.
                        script_args = "".join([
                            f" --dataset {dataset}",
                            f" --reference_batches {reference_batches}",
                            f" --n_neighbors {n_neighbors}",
                            " --filter_genes",
                            f" --n_hvg {n_hvg}",
                            f" --n_svg {n_svg}",
                            f" --n_svp {n_svp}",
                            " --nichenet_keep_target_genes_ratio 1.0",
                            " --nichenet_max_n_target_genes_per_gp 250",
                            " --include_mebocost_gps",
                            " --include_collectri_gps",
                            " --include_brain_marker_gps",
                            f" --species {species}",
                            " --gp_filter_mode subset",
                            " --combine_overlap_gps",
                            " --overlap_thresh_source_genes 0.9",
                            " --overlap_thresh_target_genes 0.9",
                            " --overlap_thresh_genes 0.9",
                            " --counts_key counts",
                            " --condition_key batch",
                            f" --cat_covariates_keys {cat_covariates_keys}",
                            f" --cat_covariates_no_edges {cat_covariates_no_edges}",
                            " --spatial_key spatial",
                            " --adj_key spatial_connectivities",
                            " --mapping_entity_key mapping_entity",
                            " --gp_targets_mask_key nichecompass_gp_targets",
                            " --gp_sources_mask_key nichecompass_gp_sources",
                            " --gp_names_key nichecompass_gp_names",
                            " --no-include_atac_modality",
                            " --filter_peaks",
                            " --min_cell_peak_thresh_ratio 0.01",
                            f" --model_label {node_label_method}_{task}",
                            " --active_gp_names_key nichecompass_active_gp_names",
                            " --latent_key nichecompass_latent",
                            f" --n_addon_gp {n_addon_gp}",
                            f" --active_gp_thresh_ratio {active_gp_thresh_ratio}",
                            " --gene_expr_recon_dist nb",
                            f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
                            f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
                            " --log_variational",
                            f" --node_label_method {node_label_method}",
                            " --n_layers_encoder 1",
                            " --n_hidden_encoder None",
                            f" --conv_layer_encoder {conv_layer_encoder}",
                            " --n_epochs 100",
                            " --n_epochs_all_gps 25",
                            " --n_epochs_no_cat_covariates_contrastive 0",
                            " --lr 0.001",
                            f" --lambda_edge_recon {lambda_edge_recon}",
                            f" --lambda_gene_expr_recon {lambda_gene_expr_recon}",
                            f" --lambda_chrom_access_recon {lambda_chrom_access_recon}",
                            f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
                            f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
                            f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
                            " --lambda_group_lasso 0.",
                            f" --lambda_l1_masked {lambda_l1_masked}",
                            # The trailing "." is part of the emitted value (here "0.").
                            f" --lambda_l1_addon {lambda_l1_addon}.",
                            f" --edge_batch_size {edge_batch_size}",
                            " --node_batch_size None",
                            f" --n_sampled_neighbors {n_sampled_neighbors}",
                            f" --timestamp_suffix _{job_id}",
                        ])

                        submit_python_script(
                            job_name_prefix=job_name_prefix,
                            job_id=job_id,
                            job_folder_path=job_folder_path,
                            conda_env_name=conda_env_name,
                            script_folder_path=script_folder_path,
                            script_name=script_name,
                            script_args=script_args,
                            nice=10000,
                        )

                        job_id += 1

### 5.6 Xenium Human Breast Cancer

In [None]:
# Submit a single NicheCompass reference-model training job for the Xenium
# human breast cancer dataset.

# Job identity and data selection.
task = "reference"
dataset = "xenium_human_breast_cancer"
job_id = 3
reference_batches = "batch1 batch2"
species = "human"

# Values interpolated into the training script arguments.
n_neighbors = 8  # other values noted by the author: 4, 8, 12
n_sampled_neighbors = 4
edge_batch_size = 512  # author's note: 4096 (4), 1024 (8, 12)
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "2"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.
lambda_l1_addon = 100.

# Slurm job naming and location of the script to run.
job_name_prefix = f"{dataset}_nichecompass_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Command-line argument string; every fragment carries its own leading space.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --active_gp_thresh_ratio 0.01",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    " --node_label_method one-hop-norm",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gatv2conv",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    # Forward the value configured above; it was previously assigned but
    # never passed on, so the training script fell back to its own default.
    f" --lambda_l1_addon {lambda_l1_addon}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --n_sampled_neighbors {n_sampled_neighbors}",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

### 5.7 nanoString CosMx SMI Human Liver

In [33]:
# Submit a single NicheCompass reference-model training job for the
# nanoString CosMx human liver dataset.
# NOTE(review): lambda_cat_covariates_contrastive is 0. while
# contrastive_logits_pos_ratio is 1000000. and contrastive_logits_neg_ratio is
# 0.0625. The NSCLC modified cell in this notebook uses 1000000. / 0.0625 / 0.
# for these three settings, so the values here may be shifted by one position
# -- confirm before reusing this configuration.

# Job identity and data selection.
task = "reference"
dataset = "nanostring_cosmx_human_liver"
job_id = 5
reference_batches = "batch1"
species = "human"

# Values interpolated into the training script arguments.
n_neighbors = 4
n_sampled_neighbors = 4
node_label_method = "one-hop-norm"
edge_batch_size = 512  # other values noted by the author: 4096, 2048, 512
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative noted: "encoder gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "2"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 1000000.
contrastive_logits_neg_ratio = 0.0625
lambda_l1_masked = 0.

# Slurm job naming and location of the script to run.
script_name = "train_nichecompass_reference_model.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
job_name_prefix = f"{dataset}_nichecompass_{task}"

# Command-line argument string; every fragment carries its own leading space.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 100",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gatv2conv",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    " --lambda_l1_addon 1000.",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --n_sampled_neighbors {n_sampled_neighbors}",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

Submitted batch job 13297798


## 6. NicheCompass Reference Query Mapping

### 6.1 nanoString CosMx Human NSCLC

In [7]:
# Submit a single job that maps query batches onto a previously trained
# NicheCompass reference model for the nanoString CosMx human NSCLC dataset.

# Job identity and data selection.
task = "reference_query"
dataset = "nanostring_cosmx_human_nsclc"
job_id = 8
query_batches = "batch4"

# Timestamp string passed as --load_timestamp for the reference model.
load_timestamp = "03092023_020519_8"

# Values interpolated into the mapping script arguments.
n_neighbors = 4
n_sampled_neighbors = 4
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.
edge_batch_size = 512

# Slurm job naming and location of the script to run.
job_name_prefix = f"{dataset}_nichecompass_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

# Command-line argument string; every fragment carries its own leading space.
# Built inside brackets so the statement no longer ends in a dangling
# backslash that only parsed because a blank line happened to follow it.
script_args = "".join([
    f" --dataset {dataset}",
    f" --query_batches {query_batches}",
    f" --n_neighbors {n_neighbors}",
    " --spatial_key spatial",
    " --mapping_entity_key mapping_entity",
    " --gp_names_key nichecompass_gp_names",
    " --reference_model_label reference",
    f" --load_timestamp {load_timestamp}",
    " --query_model_label query",
    " --reference_query_model_label reference_query_mapping",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --n_sampled_neighbors {n_sampled_neighbors}",
])

# `nice` is not passed here, so submit_python_script uses its default.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
)

Submitted batch job 13309671


### 6.1 nanoString CosMx Human NSCLC Modified

In [None]:
# Submit a single NicheCompass reference-model training job for the modified
# nanoString CosMx human NSCLC dataset, using three categorical covariates
# (batch, fov, patient).

# Job identity and data selection.
task = "reference"
dataset = "nanostring_cosmx_human_nsclc_modified"
job_id = 2
reference_batches = "batch1 batch2 batch4 batch5 batch6 batch7 batch8"
species = "human"

# Values interpolated into the training script arguments.
n_neighbors = 4  # other values noted by the author: 4, 8, 12
n_sampled_neighbors = 4
node_label_method = "one-hop-norm"
edge_batch_size = 512  # other values noted by the author: 4096, 2048, 512
cat_covariates_embeds_injection = "gene_expr_decoder"  # alternative noted: "encoder gene_expr_decoder"
# The three space-separated lists below have one entry per covariate key.
cat_covariates_keys = "batch fov patient"
cat_covariates_no_edges = "True False True"
cat_covariates_embeds_nums = "3 30 5"
lambda_cat_covariates_contrastive = 1000000.
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# Slurm job naming and location of the script to run.
script_name = "train_nichecompass_reference_model.py"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
job_name_prefix = f"{dataset}_nichecompass_{task}"

# Command-line argument string; every fragment carries its own leading space.
script_args = "".join([
    f" --dataset {dataset}",
    f" --reference_batches {reference_batches}",
    f" --n_neighbors {n_neighbors}",
    " --no-filter_genes",
    " --nichenet_keep_target_genes_ratio 1.0",
    " --nichenet_max_n_target_genes_per_gp 250",
    " --include_mebocost_gps",
    f" --species {species}",
    " --gp_filter_mode subset",
    " --combine_overlap_gps",
    " --overlap_thresh_source_genes 0.9",
    " --overlap_thresh_target_genes 0.9",
    " --overlap_thresh_genes 0.9",
    " --counts_key counts",
    f" --cat_covariates_keys {cat_covariates_keys}",
    f" --cat_covariates_no_edges {cat_covariates_no_edges}",
    " --spatial_key spatial",
    " --adj_key spatial_connectivities",
    " --mapping_entity_key mapping_entity",
    " --gp_targets_mask_key nichecompass_gp_targets",
    " --gp_sources_mask_key nichecompass_gp_sources",
    " --gp_names_key nichecompass_gp_names",
    f" --model_label {task}",
    " --active_gp_names_key nichecompass_active_gp_names",
    " --latent_key nichecompass_latent",
    " --n_addon_gp 10",
    " --active_gp_thresh_ratio 0.",
    " --gene_expr_recon_dist nb",
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}",
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}",
    " --log_variational",
    f" --node_label_method {node_label_method}",
    " --n_layers_encoder 1",
    " --n_hidden_encoder None",
    " --conv_layer_encoder gatv2conv",
    " --n_epochs 50",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    " --lambda_l1_addon 0.",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --n_sampled_neighbors {n_sampled_neighbors}",
    f" --timestamp_suffix _{job_id}",
])

submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
    nice=10000,
)

#### 6.1.2 Query Mapping

In [4]:
# Submit a single job that maps query batches onto a previously trained
# NicheCompass reference model for the modified nanoString CosMx human NSCLC
# dataset.

# Job identity and data selection.
task = "reference_query"
dataset = "nanostring_cosmx_human_nsclc_modified"
job_id = 1
query_batches = "batch3"

# Timestamp string passed as --load_timestamp for the reference model.
load_timestamp = "01092023_182943_6"

# Values interpolated into the mapping script arguments.
n_neighbors = 4
n_sampled_neighbors = 4
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.
edge_batch_size = 512

# Slurm job naming and location of the script to run.
job_name_prefix = f"{dataset}_nichecompass_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"

# Command-line argument string; every fragment carries its own leading space.
# Built inside brackets so the statement no longer ends in a dangling
# backslash that only parsed because a blank line happened to follow it.
script_args = "".join([
    f" --dataset {dataset}",
    f" --query_batches {query_batches}",
    f" --n_neighbors {n_neighbors}",
    " --spatial_key spatial",
    " --mapping_entity_key mapping_entity",
    " --gp_names_key nichecompass_gp_names",
    " --reference_model_label reference",
    f" --load_timestamp {load_timestamp}",
    " --query_model_label query",
    " --reference_query_model_label reference_query_mapping",
    " --n_epochs 400",
    " --n_epochs_all_gps 25",
    " --n_epochs_no_cat_covariates_contrastive 0",
    " --lr 0.001",
    " --lambda_edge_recon 5000000.",
    " --lambda_gene_expr_recon 3000.",
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}",
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}",
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}",
    " --lambda_group_lasso 0.",
    f" --lambda_l1_masked {lambda_l1_masked}",
    f" --edge_batch_size {edge_batch_size}",
    " --node_batch_size None",
    f" --n_sampled_neighbors {n_sampled_neighbors}",
])

# `nice` is not passed here, so submit_python_script uses its default.
submit_python_script(
    job_name_prefix=job_name_prefix,
    job_id=job_id,
    job_folder_path=job_folder_path,
    conda_env_name=conda_env_name,
    script_folder_path=script_folder_path,
    script_name=script_name,
    script_args=script_args,
)

Submitted batch job 13308468


In [6]:
# Query mapping for dataset `nanostring_cosmx_human_nsclc_modified` (job 3):
# runs `map_query_on_nichecompass_reference_model.py` for query batch `batch3`
# via the Slurm submission helper `submit_python_script`.
task = "reference_query"
dataset = "nanostring_cosmx_human_nsclc_modified"
job_id = 3
query_batches = "batch3"
n_neighbors = 4
n_sampled_neighbors = 4
# Forwarded as --load_timestamp; presumably identifies the reference training
# run whose model is loaded (TODO confirm against the mapping script).
load_timestamp = "01092023_182316_1"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"
# Command-line arguments for the script. Implicit concatenation inside
# parentheses replaces the backslash continuations, so the expression no longer
# ends on a dangling `\` that only parsed because a blank line followed it.
# Every fragment starts with a space so the pieces join into one argument
# string.
script_args = (
    f" --dataset {dataset}"
    f" --query_batches {query_batches}"
    f" --n_neighbors {n_neighbors}"
    " --spatial_key spatial"
    " --mapping_entity_key mapping_entity"
    " --gp_names_key nichecompass_gp_names"
    " --reference_model_label reference"
    f" --load_timestamp {load_timestamp}"
    " --query_model_label query"
    " --reference_query_model_label reference_query_mapping"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --n_sampled_neighbors {n_sampled_neighbors}"
)

# Submit with the helper's default resources (time limit, partition, gres, qos,
# nice).
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

Submitted batch job 13308470


### 6.1 nanoString CosMx Human Liver

In [9]:
# Query mapping for dataset `nanostring_cosmx_human_liver`: runs
# `map_query_on_nichecompass_reference_model.py` for query batch `batch2`
# via the Slurm submission helper `submit_python_script`.
task = "reference_query"
dataset = "nanostring_cosmx_human_liver"
job_id = 3
query_batches = "batch2"
n_neighbors = 4
n_sampled_neighbors = 4
# Forwarded as --load_timestamp; presumably identifies the reference training
# run whose model is loaded (TODO confirm against the mapping script).
load_timestamp = "01092023_182427_3"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.
edge_batch_size = 512

job_name_prefix = f"{dataset}_nichecompass_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"
# Command-line arguments for the script. Implicit concatenation inside
# parentheses replaces the backslash continuations, so the expression no longer
# ends on a dangling `\` that only parsed because a blank line followed it.
# Every fragment starts with a space so the pieces join into one argument
# string.
script_args = (
    f" --dataset {dataset}"
    f" --query_batches {query_batches}"
    f" --n_neighbors {n_neighbors}"
    " --spatial_key spatial"
    " --mapping_entity_key mapping_entity"
    " --gp_names_key nichecompass_gp_names"
    " --reference_model_label reference"
    f" --load_timestamp {load_timestamp}"
    " --query_model_label query"
    " --reference_query_model_label reference_query_mapping"
    " --n_epochs 400"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 5000000."
    " --lambda_gene_expr_recon 3000."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --n_sampled_neighbors {n_sampled_neighbors}"
)

# Submit with the helper's default resources (time limit, partition, gres, qos,
# nice).
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args)

Submitted batch job 13308480


### 6.1 seqFISH Mouse Organogenesis Imputed

#### 6.1.1 Reference Model Training

In [None]:
# Reference model training for dataset `seqfish_mouse_organogenesis_imputed`:
# runs `train_nichecompass_reference_model.py` on batches 1-4 via the Slurm
# submission helper `submit_python_script`.

# --- Job identity and data selection ---
task = "reference_query"
dataset = "seqfish_mouse_organogenesis_imputed"
job_id = 1
reference_batches = "batch1 batch2 batch3 batch4"
species = "mouse"

# --- Graph / preprocessing settings ---
n_neighbors = 12
n_hvg = 3000
node_label_method = "one-hop-norm"
edge_batch_size = 4096

# --- Categorical covariate settings ---
# Alternative injection noted by the author: "encoder gene_expr_decoder".
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"

# --- Loss weights ---
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.

# --- Slurm job and script locations ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}_reference_only"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Command-line arguments for the training script; every fragment starts with a
# space so the pieces join into one argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --filter_genes"
    f" --n_hvg {n_hvg}"
    " --nichenet_keep_target_genes_ratio 0.1"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    " --condition_key batch"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}_reference_only"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0.05"
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --timestamp_suffix _{job_id}"
)

# Submit to the `interactive_gpu_p` partition with a 12:00:00 time limit
# instead of the helper's defaults.
submit_python_script(job_name_prefix=job_name_prefix,
                     job_id=job_id,
                     job_folder_path=job_folder_path,
                     conda_env_name=conda_env_name,
                     script_folder_path=script_folder_path,
                     script_name=script_name,
                     script_args=script_args,
                     t="12:00:00",
                     p="interactive_gpu_p",
                     gres="gpu:1",
                     qos="interactive_gpu",
                     nice=9999)

#### 6.1.2 Query Mapping

In [None]:
# Query mapping for dataset `seqfish_mouse_organogenesis_imputed`: runs
# `map_query_on_nichecompass_reference_model.py` for query batches 5 and 6
# via the Slurm submission helper `submit_python_script`.
task = "reference_query"
dataset = "seqfish_mouse_organogenesis_imputed"
job_id = 1
query_batches = "batch5 batch6"
n_neighbors = 12
node_label_method = "one-hop-norm"
# Forwarded as --load_timestamp; presumably identifies the reference training
# run whose model is loaded (TODO confirm against the mapping script).
load_timestamp = "01072023_165203_1"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.
edge_batch_size = 4096

job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"
# Command-line arguments for the script. Implicit concatenation inside
# parentheses replaces the backslash continuations, so the expression no longer
# ends on a dangling `\` that only parsed because a blank line followed it.
# Every fragment starts with a space so the pieces join into one argument
# string.
script_args = (
    f" --dataset {dataset}"
    f" --query_batches {query_batches}"
    f" --n_neighbors {n_neighbors}"
    " --spatial_key spatial"
    " --mapping_entity_key mapping_entity"
    " --gp_names_key nichecompass_gp_names"
    f" --reference_model_label {node_label_method}_{task}_reference_only"
    f" --load_timestamp {load_timestamp}"
    f" --query_model_label {node_label_method}_{task}_query_only"
    f" --reference_query_model_label {node_label_method}_{task}_query_mapping"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
)

# Submit to the `interactive_gpu_p` partition with a 12:00:00 time limit
# instead of the helper's defaults.
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        t="12:00:00",
        p="interactive_gpu_p",
        gres="gpu:1",
        qos="interactive_gpu",
        nice=9999)

### 6.2 STARmap PLUS Mouse Central Nervous System

#### 6.2.1 Reference Model Training

In [None]:
# Reference model training for dataset `starmap_plus_mouse_cns`: runs
# `train_nichecompass_reference_model.py` on batches 1 and 2 via the Slurm
# submission helper `submit_python_script`.

# --- Job identity and data selection ---
task = "reference_query"
dataset = "starmap_plus_mouse_cns"
job_id = 4
reference_batches = "batch1 batch2"
species = "mouse"

# --- Graph / preprocessing settings ---
n_neighbors = 8  # other values noted by the author: 8, 12
n_sampled_neighbors = 4
node_label_method = "one-hop-norm"
edge_batch_size = 1024

# --- Categorical covariate settings ---
cat_covariates_embeds_injection = "gene_expr_decoder"
cat_covariates_keys = "batch"
cat_covariates_no_edges = "True"
cat_covariates_embeds_nums = "3"

# --- Loss weights ---
# Other values noted by the author: 0., 10000, 100000, 500000
lambda_cat_covariates_contrastive = 250000.
# Other values noted by the author: 0., 0.125, 0.0625, 0.03125
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 0.

# --- Slurm job and script locations ---
job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_reference_model.py"

# Command-line arguments for the training script; every fragment starts with a
# space so the pieces join into one argument string.
script_args = (
    f" --dataset {dataset}"
    f" --reference_batches {reference_batches}"
    f" --n_neighbors {n_neighbors}"
    " --no-filter_genes"
    " --nichenet_keep_target_genes_ratio 0.1"
    " --nichenet_max_n_target_genes_per_gp 250"
    " --include_mebocost_gps"
    f" --species {species}"
    " --gp_filter_mode subset"
    " --combine_overlap_gps"
    " --overlap_thresh_source_genes 0.9"
    " --overlap_thresh_target_genes 0.9"
    " --overlap_thresh_genes 0.9"
    " --counts_key counts"
    f" --cat_covariates_keys {cat_covariates_keys}"
    f" --cat_covariates_no_edges {cat_covariates_no_edges}"
    " --spatial_key spatial"
    " --adj_key spatial_connectivities"
    " --mapping_entity_key mapping_entity"
    " --gp_targets_mask_key nichecompass_gp_targets"
    " --gp_sources_mask_key nichecompass_gp_sources"
    " --gp_names_key nichecompass_gp_names"
    f" --model_label {node_label_method}_{task}_reference_only"
    " --active_gp_names_key nichecompass_active_gp_names"
    " --latent_key nichecompass_latent"
    " --active_gp_thresh_ratio 0."
    " --gene_expr_recon_dist nb"
    f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}"
    f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}"
    " --log_variational"
    f" --node_label_method {node_label_method}"
    " --n_layers_encoder 1"
    " --n_hidden_encoder None"
    " --conv_layer_encoder gcnconv"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
    f" --n_sampled_neighbors {n_sampled_neighbors}"
    f" --timestamp_suffix _{job_id}"
)

# Submit with the helper's default time limit, partition, gres and qos; only
# `nice` is overridden.
submit_python_script(job_name_prefix=job_name_prefix,
                     job_id=job_id,
                     job_folder_path=job_folder_path,
                     conda_env_name=conda_env_name,
                     script_folder_path=script_folder_path,
                     script_name=script_name,
                     script_args=script_args,
                     nice=9999)

#### 6.2.2 Query Mapping

In [None]:
# Query mapping for dataset `starmap_plus_mouse_cns`: runs
# `map_query_on_nichecompass_reference_model.py` for query batch `batch3`
# via the Slurm submission helper `submit_python_script`.
task = "reference_query"
dataset = "starmap_plus_mouse_cns"
job_id = 1
query_batches = "batch3"
n_neighbors = 12
node_label_method = "one-hop-norm"
# Forwarded as --load_timestamp; presumably identifies the reference training
# run whose model is loaded (TODO confirm against the mapping script).
load_timestamp = "06072023_114143_1"
lambda_cat_covariates_contrastive = 0.
contrastive_logits_pos_ratio = 0.
contrastive_logits_neg_ratio = 0.
lambda_l1_masked = 5.
edge_batch_size = 1024

job_name_prefix = f"{dataset}_nichecompass_{node_label_method}_{task}_query_mapping"
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "map_query_on_nichecompass_reference_model.py"
# Command-line arguments for the script. Implicit concatenation inside
# parentheses replaces the backslash continuations, so the expression no longer
# ends on a dangling `\` that only parsed because a blank line followed it.
# Every fragment starts with a space so the pieces join into one argument
# string.
script_args = (
    f" --dataset {dataset}"
    f" --query_batches {query_batches}"
    f" --n_neighbors {n_neighbors}"
    " --spatial_key spatial"
    " --mapping_entity_key mapping_entity"
    " --gp_names_key nichecompass_gp_names"
    f" --reference_model_label {node_label_method}_{task}_reference_only"
    f" --load_timestamp {load_timestamp}"
    f" --query_model_label {node_label_method}_{task}_query_only"
    f" --reference_query_model_label {node_label_method}_{task}_query_mapping"
    " --n_epochs 100"
    " --n_epochs_all_gps 25"
    " --n_epochs_no_cat_covariates_contrastive 0"
    " --lr 0.001"
    " --lambda_edge_recon 500000."
    " --lambda_gene_expr_recon 300."
    f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}"
    f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}"
    f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}"
    " --lambda_group_lasso 0."
    f" --lambda_l1_masked {lambda_l1_masked}"
    f" --edge_batch_size {edge_batch_size}"
    " --node_batch_size None"
)

# Submit to the `interactive_gpu_p` partition with a 12:00:00 time limit
# instead of the helper's defaults.
submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        t="12:00:00",
        p="interactive_gpu_p",
        gres="gpu:1",
        qos="interactive_gpu",
        nice=9999)

## 7. Extra

### 7.1 Visium Mouse Brain

In [None]:
# GATv2 encoder
task = "sample_integration_method_benchmarking"
dataset = "visium_mouse_brain"
reference_batches = "batch1 batch2"
cell_type_key = "cell_type"
species = "mouse"
edge_batch_size_str = "512" # out of memory 
cat_covariates_embeds_injection = "gene_expr_decoder" # "encoder gene_expr_decoder"
cat_covariates_keys = "batch data"
cat_covariates_no_edges = "True False"
cat_covariates_embeds_nums = "2 2"
lambda_cat_covariates_contrastive = 1000000
contrastive_logits_pos_ratio = 0.0625
contrastive_logits_neg_ratio = 0.
conv_layer_encoder = "gatv2conv"

job_name_prefix = f"{dataset}_nichecompass_{conv_layer_encoder}_{task}"
job_id = 4
job_folder_path = f"../scripts/{task}/slurm_jobs"
script_folder_path = f"/home/aih/sebastian.birk/workspace/projects/nichecompass-reproducibility/scripts/{task}"
script_name = "train_nichecompass_benchmarking_models.py"
script_args =  " --adata_new_name None" \
               " --n_neighbors_list 6" \
               f" --edge_batch_size_list {edge_batch_size_str}" \
               " --node_batch_size_list None" \
               " --seeds 1" \
               " --run_index 2" \
               f" --cell_type_key {cell_type_key}" \
               " --filter_genes" \
               f" --n_svg 5000" \
               " --nichenet_keep_target_genes_ratio 1.0" \
               " --nichenet_max_n_target_genes_per_gp 250" \
               " --include_mebocost_gps" \
               f" --species {species}" \
               " --gp_filter_mode subset" \
               " --combine_overlap_gps" \
               " --overlap_thresh_source_genes 0.9" \
               " --overlap_thresh_target_genes 0.9" \
               " --overlap_thresh_genes 0.9" \
               f" --dataset {dataset}" \
               f" --reference_batches {reference_batches}" \
               " --counts_key counts" \
               f" --cat_covariates_keys {cat_covariates_keys}" \
               f" --cat_covariates_no_edges {cat_covariates_no_edges}" \
               " --spatial_key spatial" \
               " --adj_key spatial_connectivities" \
               " --gp_targets_mask_key nichecompass_gp_targets" \
               " --gp_sources_mask_key nichecompass_gp_sources" \
               " --gp_names_key nichecompass_gp_names" \
               f" --model_label {conv_layer_encoder}_{task}" \
               " --active_gp_names_key nichecompass_active_gp_names" \
               " --latent_key nichecompass_latent" \
               " --n_addon_gp 10" \
               " --active_gp_thresh_ratio 0." \
               " --gene_expr_recon_dist nb" \
               f" --cat_covariates_embeds_injection {cat_covariates_embeds_injection}" \
               f" --cat_covariates_embeds_nums {cat_covariates_embeds_nums}" \
               " --log_variational" \
               " --node_label_method one-hop-norm" \
               " --n_layers_encoder 1" \
               " --n_fc_layers_encoder 2" \
               " --n_hidden_encoder None" \
               f" --conv_layer_encoder {conv_layer_encoder}" \
               " --n_epochs 400" \
               " --n_epochs_all_gps 25" \
               " --n_epochs_no_cat_covariates_contrastive 0" \
               " --lr 0.001" \
               " --lambda_edge_recon 5000000." \
               " --lambda_gene_expr_recon 3000." \
               f" --lambda_cat_covariates_contrastive {lambda_cat_covariates_contrastive}" \
               f" --contrastive_logits_pos_ratio {contrastive_logits_pos_ratio}" \
               f" --contrastive_logits_neg_ratio {contrastive_logits_neg_ratio}" \
               " --lambda_group_lasso 0." \
               f" --lambda_l1_masked 0." \
               " --lambda_l1_addon 0." \
               f" --n_sampled_neighbors 6" \
               f" --timestamp_suffix _{job_id}"

submit_python_script(
        job_name_prefix=job_name_prefix,
        job_id=job_id,
        job_folder_path=job_folder_path,
        conda_env_name=conda_env_name,
        script_folder_path=script_folder_path,
        script_name=script_name,
        script_args=script_args,
        nice=10000)