In [1]:
import import_ipynb
from base.datasets import load_synthetic_path, load_data, load_coeffs, \
                          sythetic_sanity_check, data_convert
from base.train_model_syn import train_model, test_model

import os
import time
import pandas as pd
import random
import torch

importing Jupyter notebook from /scratch/Eugene/codes/grt_hist/StatEcoNet/StatEcoNet/base/datasets.ipynb
importing Jupyter notebook from /scratch/Eugene/codes/grt_hist/StatEcoNet/StatEcoNet/base/train_model_syn.ipynb
importing Jupyter notebook from /scratch/Eugene/codes/grt_hist/StatEcoNet/StatEcoNet/base/models.ipynb


# Define experiment setup

In [2]:
# Define the dataset.
# The training cell loops over every (nSites, nVisits, rho) combination listed
# here and loads one synthetic dataset per combination.
nSites_list = [1000] # site counts to run; noted alternatives: 100, 1000
nVisits_list = [10] # visit counts to run; noted alternatives: 3, 10
rho_list = [1] # rho settings to run; noted alternatives: 0, 1

# Define the model.
# Each id listed here is trained and tested on every dataset selected above.
model_list = [2] # 0:OD-LR, 1:OD-1NN, 2:StatEcoNet

In [3]:
# Define other parameters
# max_iter is forwarded to train_model; presumably the iteration cap, since the
# run log shows training can stop earlier ("early stop point") -- TODO confirm.
max_iter = 2000
# Number of trials run for each (dataset, model) combination.
repeat = 1
# When True, the training cell writes predictions, model weights and the
# test-metric CSV to disk.
save_flag = True

In [4]:
# Optimal hyper-parameters: one row per (nSites, nVisits, rho, model_id),
# looked up by the training loop before each model is trained.
opts = pd.read_csv("opt/opt_syn_NN.csv")

# Set the save path.
# makedirs creates every missing parent ("output/", "output/prediction/") in
# one call, and exist_ok=True makes re-running this cell a no-op instead of
# relying on a separate isdir check before each mkdir.
save_base_path = "output/prediction/synthetic/"
os.makedirs(save_base_path, exist_ok=True)
# Test metrics CSV; it lives in "output/", which the call above has created.
test_file = "output/NN_syn_test.csv"

# Train/evaluate the model

In [5]:
# Train and test every selected model on every selected synthetic dataset.
#
# For each (nSites, nVisits, rho) combination the data is loaded once, then each
# model in model_list is trained `repeat` times with its tuned hyper-parameters
# from `opts` and evaluated on the test split. One row of test metrics per trial
# is appended to NN_record; when save_flag is True the predictions, the model
# weights and the metrics CSV are also written to disk.
NN_record = pd.DataFrame(columns=['nSites', 'nVisits', 'rho', 'model.id', 'trial',
                                  'best.iter', 'opt.learning.time',
                                  'test.auroc', 'test.auprc',
                                  'test.occ.corr', 'test.det.corr'])

i = 0  # running row index into NN_record (one row per trial)
start_init = time.time()
for nSites in nSites_list:
    for nVisits in nVisits_list:
        for rho in rho_list:
            # Load data
            data_size = str(nSites) + "x" + str(nVisits)
            dir_path, brt_path, coeff_path = load_synthetic_path(data_size, rho)

            (x_dim, w_dim, k,
             train_occCovars, train_detCovars,
             train_occProbs, train_detProbs, train_Y,
             valid_occCovars, valid_detCovars,
             valid_occProbs, valid_detProbs, valid_Y,
             test_occCovars, test_detCovars,
             test_occProbs, test_detProbs, test_Y) = load_data(dir_path)

            occCoeffs, detCoeffs = load_coeffs(coeff_path)
            sythetic_sanity_check(rho, x_dim, w_dim, occCoeffs, detCoeffs,
                                  train_occCovars, train_detCovars,
                                  train_occProbs, train_detProbs)

            (x_train, w_train, y_train,
             x_valid, w_valid, y_valid,
             x_test, w_test, y_test) = data_convert(
                train_occCovars, train_detCovars, train_Y,
                valid_occCovars, valid_detCovars, valid_Y,
                test_occCovars, test_detCovars, test_Y)

            for model_id in model_list:
                print("nSites", nSites, "nVisits", nVisits, "rho", rho,
                      "model", model_id)
                opt = opts[(opts.nSites == nSites) & (opts.nVisits == nVisits) &
                           (opts.rho == rho) & (opts.model_id == model_id)]

                # .item() below only works on exactly one matching row; fail
                # here with a message that names the offending combination
                # instead of an opaque size error.
                if len(opt) != 1:
                    raise ValueError(
                        "Expected exactly one hyper-parameter row in opts for "
                        "nSites={}, nVisits={}, rho={}, model_id={}; found {}"
                        .format(nSites, nVisits, rho, model_id, len(opt)))

                lr = opt['lr'].item()
                bs = int(opt['batch_size'].item())
                nN = int(opt['nNeurons'].item())
                nL = int(opt['nLayers'].item())
                mixed = opt['mixed_weight'].item()

                for trial in range(1, repeat + 1):
                    # The seed is drawn at random for each trial; print it so
                    # that a particular run can be reproduced later.
                    my_seed = random.randint(1, 1000)
                    print("(", trial, "/", repeat, ")")
                    print("seed", my_seed)

                    # Find the optimal iteration
                    df_train, df_valid, \
                        best_iter, best_model, opt_learing_time = \
                        train_model(my_seed, model_id, nL, nN, lr, "train",
                                    x_dim, w_dim, k, max_iter, mixed,
                                    x_train, w_train, y_train,
                                    x_valid, w_valid, y_valid,
                                    train_occProbs, train_detProbs,
                                    valid_occProbs, valid_detProbs, bs)
                    df_valid['dataset'] = 'valid'

                    # Test the trained model
                    psi_hat, p_hat, y_hat, auroc, auprc, occCorr, detCorr = \
                        test_model(best_model, x_test, w_test, y_test,
                                   test_occProbs, test_detProbs, k)

                    # Keep the test metrics in memory for every trial, so they
                    # are not lost when save_flag is False.
                    record = [nSites, nVisits, rho, model_id, trial,
                              best_iter, opt_learing_time]
                    record.extend([auroc, auprc, occCorr, detCorr])
                    NN_record.loc[i] = record

                    # Save prediction outcomes
                    if save_flag:
                        save_path = save_base_path + data_size + 'x' + \
                                    str(rho) + "_m" + str(model_id) + \
                                    "_w" + str(mixed) + "_t" + str(trial)
                        print("The model predictions are saved at", save_path)

                        # psi_hat / p_hat are converted with .numpy();
                        # presumably CPU tensors returned by test_model --
                        # TODO confirm. y_hat is written as returned.
                        pd.DataFrame(psi_hat.numpy().flatten()).to_csv(
                            save_path + "_psi_hat.csv", header=False,
                            index=False)
                        pd.DataFrame(p_hat.numpy().flatten()).to_csv(
                            save_path + "_p_hat.csv", header=False,
                            index=False)
                        pd.DataFrame(y_hat).to_csv(
                            save_path + "_y_hat.csv", header=False,
                            index=False)
                        torch.save(best_model.state_dict(),
                                   save_path + '_final_model.pt')

                        # Rewrite the metrics file after every trial so that
                        # partial results survive an interrupted run.
                        NN_record.to_csv(test_file, index=False)

                    i = i + 1

# Only claim the metrics file was written when it actually was.
if save_flag:
    print("Test evaluation metrics are saved at", test_file)
end = time.time()
elapse = end - start_init
print(elapse)

data path: ../data/Synthetic/1000x10/rho1/
nSites 1000 nVisits 10 rho 1 model 2
( 1 / 1 )


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))

No more improvement. This is the early stop point.

The model predictions are saved at output/prediction/synthetic/1000x10x1_m2_w0.01_t1
Test evaluation metrics are saved at output/NN_syn_test.csv
55.063953161239624
