# Simulation code for Root Cause Discovery

We perform 3 simulations: the first two are in a low-dimensional setting, while the last mimics our real data analysis (a high-dimensional setting). All simulations use the following model:

+ $X \leftarrow \mu_x + BX + \text{error} + \delta$, 
+ $X = (I-B)^{-1} (\mu_x + \text{error} + \delta)$, 

with different kinds of B.

In [8]:
import os
import subprocess
from math import ceil

def submit(command, ncores, total_mem, joblog_dir, jobname="submit"):
    """Write a one-off SLURM batch script for ``command`` and submit it.

    The script is written to ``{jobname}.sh`` in the current working
    directory, handed to ``sbatch``, and then deleted. The file is removed
    even when ``sbatch`` cannot be launched, so failed submissions do not
    leave stale scripts behind.

    Parameters
    ----------
    command : str
        Shell command line the job runs; it is echoed to the job log first.
    ncores : int
        Value used for ``--cpus-per-task``.
    total_mem : int or float
        Total memory for the job in GB. It is divided evenly across
        ``ncores`` and rounded up to whole GB for ``--mem-per-cpu``.
    joblog_dir : str
        Directory that receives ``slurm-%j.out``. Created if it is missing.
    jobname : str, optional
        SLURM job name; also the stem of the temporary script file.

    Returns
    -------
    None
    """
    mem = ceil(total_mem / ncores)  # memory per core, rounded up to whole GB
    filename = f"{jobname}.sh"

    script_lines = [
        "#!/bin/bash",
        "#",
        f"#SBATCH --job-name={jobname}",
        "#",
        "#SBATCH --time=48:00:00",
        f"#SBATCH --cpus-per-task={ncores}",
        f"#SBATCH --mem-per-cpu={mem}G",
        "#SBATCH --partition=candes,owners",
        f"#SBATCH --output={os.path.join(joblog_dir, 'slurm-%j.out')}",
        "",
        "#save job info on joblog:",
        # SLURM exports the job id as SLURM_JOB_ID ($JOB_ID is the SGE name
        # and would expand to an empty string here).
        "echo \"Job $SLURM_JOB_ID started on:   \" `hostname -s`",
        "echo \"Job $SLURM_JOB_ID started on:   \" `date `",
        "",
        "# load the job environment:",
        "module load python/3.9",
        "",
        "# run code",
        f"echo \"{command}\"",
        f"{command}",
        "",
        "#echo job info on joblog:",
        "echo \"Job $SLURM_JOB_ID ended on:   \" `hostname -s`",
        "echo \"Job $SLURM_JOB_ID ended on:   \" `date `",
        "#echo \" \"",
    ]

    # SLURM does not create the --output directory itself; make sure it
    # exists so the job log is not lost.
    if joblog_dir:
        os.makedirs(joblog_dir, exist_ok=True)

    try:
        with open(filename, "w") as io:
            io.write("\n".join(script_lines) + "\n")
        # submit job
        subprocess.run(["sbatch", filename])
    finally:
        # clean up, also when writing or launching sbatch raised
        if os.path.exists(filename):
            os.remove(filename)
    return None

## Simulation 1

In [None]:
# put in file sim1.py
# usage: python3 sim1.py s_B int_mean replicate outdir
#
# Runs one replicate of Simulation 1: a single setting (B, error variances, b)
# is drawn once, then `sim_total` data sets are generated from it. For each
# data set the z-score and Cholesky-based root-cause scores are computed and
# saved to <outdir>/sim<k>.npz.
import os
import sys

# `from ../python/<module> import *` is not valid Python syntax. Instead, make
# the sibling `python/` directory importable and import the modules by name.
sys.path.insert(
    0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "python")
)
from root_cause_discovery_funcs import *
from simulation_setting_func import *
import numpy as np
import pandas as pd
import warnings  # ignore the warnings
from collections import Counter

# parameters for simulation
if len(sys.argv) < 5:
    sys.exit("usage: python3 sim1.py s_B int_mean replicate outdir")
s_B = float(sys.argv[1])      # 0.2, 0.4, or 0.6
int_mean = int(sys.argv[2])   # 10, 15, or 20
replicate = int(sys.argv[3])  # 1 to 10
outdir = sys.argv[4]          # output directory
# exist_ok avoids a race when several jobs share an output directory, and
# makedirs also creates missing parent directories.
os.makedirs(outdir, exist_ok=True)

# other fixed parameters
p = 100
n = 200
m = 100
int_sd = 1
nshuffles = 10
B_value_min = -1
B_value_max = 1
err_min = 1
err_max = 5
var_X_min = 10
var_X_max = 50
sim_total = 10
dag_type = "random"

# generate the setting once per replicate, seeded by the replicate number
np.random.seed(replicate)
B, sigma2_error, b = generate_setting(
    dag_type, s_B, B_value_min, B_value_max,
    err_min, err_max, var_X_min, var_X_max, p,
)

# try 10 simulations within 1 replicate
for sim in range(sim_total):
    # NOTE(review): the data seed depends only on `sim`, not on `replicate`,
    # so every replicate reuses seeds 0..sim_total-1 -- confirm this is intended.
    np.random.seed(sim)
    X_obs, X_int_all, RC = generate_data(n, m, p, B, sigma2_error, b, int_mean, int_sd)

    # z score method
    Zscores = zscore(X_obs, X_int_all)

    # cholesky score method: one row of scores per row of X_int_all
    RC_scores_chol = np.zeros(X_int_all.shape)
    for i, X_int in enumerate(X_int_all):
        # set thresholds based on Zscores[i]: 0.1, 0.3, ... capped at 5 and
        # kept strictly below the largest z-score of this sample
        upper = min(5, max(Zscores[i, :]) - 0.0001)
        thresholds = np.arange(0.1, upper, 0.2)
        RC_scores_chol[i, :] = root_cause_discovery_main(
            X_obs, X_int, thresholds, nshuffles=nshuffles, verbose=False
        )

    # save simulation result
    outfile = os.path.join(outdir, f"sim{sim}.npz")
    np.savez(outfile, array1=RC, array2=Zscores, array3=RC_scores_chol)
    # to load
    # result = np.load("sim0.npz")
    # result["array1"]

print('Done!')

Submit jobs

In [9]:
# Submit a single Simulation 1 job: s_B = 0.2, int_mean = 10, replicate 1.
s_B, int_mean, replicate = 0.2, 10, 1

# script to run, where its results go, and where SLURM writes the job log
python_exe = "/home/users/bbchu/RootCauseDiscovery/simulations/sim1.py"
outdir = "/home/users/bbchu/RootCauseDiscovery/simulations/replicate1"
joblog_dir = "/home/users/bbchu/RootCauseDiscovery/simulations/joblogs"

# command line: python3 <script> <s_B> <int_mean> <replicate> <outdir>
cmd = " ".join(
    ["python3", python_exe, str(s_B), str(int_mean), str(replicate), outdir]
)

# 1 core, 12 GB
jobname = "rep" + str(replicate)
submit(cmd, ncores=1, total_mem=12, joblog_dir=joblog_dir, jobname=jobname)

Submitted batch job 47370703
