# Gaussian process with Spike and Slab ARD prior - Stage 1 Savitsky experiment
## Zero-temperature-MF VI algorithm (continuous spike) - non-additive kernels

### (1) Importing and defining all required functions

In [1]:
"""
Importing required libraries and defining key functions
"""
# Requirements for algorithms
import numpy as np
import scipy.special as sp
from scipy.spatial.distance import cdist 

# Requirements For plots and diagnostics
import matplotlib.pyplot as plt
import scipy.stats as sps
import time
from sklearn.metrics import roc_auc_score
from datetime import date
import inspect
import pickle
from IPython.display import clear_output

# Algorithm functions
# NOTE(review): the working directory is switched before the imports below,
# presumably so that GP_funcs_ZTMFSS is found via the current directory on
# sys.path -- confirm. The absolute paths are specific to one machine and must
# be edited before running elsewhere.
import os
os.chdir('C:/Users/hughw/Documents/MSC project/GP algorithms/Master function files')
from GP_funcs_ZTMFSS import kernel_funcs
from GP_funcs_ZTMFSS import model_funcs
from GP_funcs_ZTMFSS import draw_GP
from GP_funcs_ZTMFSS import fit
from GP_funcs_ZTMFSS import diagnostics
from GP_funcs_ZTMFSS import simulations
from functools import partial
# Switch to the results directory: the final cell calls np.save with a relative
# file name, so the output file is written under this directory.
os.chdir('C:/Users/hughw/Documents/MSC project/Simulation results')

### (2) Setting simulation parameters and models

In [8]:
"""
Simulation controls
"""
# ---------------------------------------------------------------------------
# Simulation settings (forwarded to draw_GP.draw_parametric_savitsky)
# ---------------------------------------------------------------------------
n = 100              # rows [:n] of the drawn data form the training split
ntest = 20           # rows [n:] of the drawn data form the test split
p = 1000             # also the last dimension of the Lambda / L storage arrays
q = 6                # also forwarded to simulations.do_simulation_VBEMSSGP
correlation = False
nruns = 100          # first dimension of every storage array

# ---------------------------------------------------------------------------
# Model run settings.  Every list below holds one entry per model (7 entries);
# entry 0 differs from entries 1-6 in most of them.
# ---------------------------------------------------------------------------
l_init = 0.01        # only read when building the output file name
beta2 = 0.99         # passed to the training algorithm as "Beta2"
m = 7
nmodels = 7

# Variable-selection thresholds: five for model 0, four for each other model.
VS_threshs = [
    [0.1**2, 0.1**1.5, 0.1**1, 0.1**0.5, 0.1**0],
    [0.5, 0.9, 0.95, 0.99],
    [0.5, 0.9, 0.95, 0.99],
    [0.5, 0.9, 0.95, 0.99],
    [0.5, 0.9, 0.95, 0.99],
    [0.5, 0.9, 0.95, 0.99],
    [0.5, 0.9, 0.95, 0.99],
]

iter_remove = [False] + [True] * 6
sampling_strat = ["unif"] * 7
minibatch_size = [n, 25, 25, 50, 50, n, n]      # passed as "subsample"
GPtol = [1e-10] + [1e-5] * 6                    # passed as "GP_fit_tol"
SS_GP = [False] + [True] * 6
MC_pred = [False] * 7
predict_selected = [False] * 7
post_fit = [False] * 7
train = [True, True, False, True, False, True, False]
step = [0.01] + [0.05] * 6                      # passed as "learn_rate"
hyper_opt = [False] + [True] * 6
model_select = [True, False, True, False, True, False, True]
post_var = [False] + [True] * 6
model_weights = [""] + ["elpd"] * 6
min_VBEM_iter = [1] + [3] * 6
max_VBEM_iter = [1] + [10] * 6
gp_iter = [500] + [100] * 6     # passed as "init_GP_iter" and "max_GP_fit_iter"
opt = ["adam"] + ["amsgrad"] * 6

# Number of thresholds of model 0; sizes the per-threshold storage arrays.
t = len(VS_threshs[0])
kern = kernel_funcs.gaussian
grad_kern = kernel_funcs.grad_gaussian
NN_pred = False
newsumgrads = False
VBtol = 0.1 / p                                 # passed as "VBEM_tol"
temp = 1
# SEE BELOW FOR ARGVALS

# ---------------------------------------------------------------------------
# Storage objects, indexed [run, model, ...]
# ---------------------------------------------------------------------------
_per_model = (nruns, m)          # one scalar per run and model
_per_input = (nruns, m, p)       # one value per run, model and input column
_per_thresh = (nruns, m, t)      # one value per run, model and threshold

Runtime = np.zeros(_per_model)
Lambda = np.zeros(_per_input)
L = np.zeros(_per_input)
L1norm = np.zeros(_per_model)
L2norm = np.zeros(_per_model)
MSE_F = np.zeros(_per_model)
MSE_Y = np.zeros(_per_model)
Acc = np.zeros(_per_thresh)
Weighted_Acc = np.zeros(_per_thresh)
TPR = np.zeros(_per_thresh)
TNR = np.zeros(_per_thresh)
PPV = np.zeros(_per_thresh)
NPV = np.zeros(_per_thresh)
MCC = np.zeros(_per_thresh)
AUC = np.zeros(_per_model)

### (3) Running algorithm iterations, saving and displaying results

In [9]:
np.random.seed(8750)
# One distinct trial seed per run. The number of seeds follows nruns so that it
# always matches the first dimension of the storage arrays.
runlist = np.random.choice(1000, nruns, False)

# Names of the settings forwarded to the training algorithm. The values built
# inside the loop (arg_vals) line up with these names position by position.
arg_names = ["seed", "subsample", "Beta2", "ELBO_sample", "learn_rate", "ltrue", "learn_spike",
             "min_VBEM_iter", "init_GP_iter", "max_VBEM_iter", "GP_fit_tol", "VBEM_tol", "print_VBEM",
             "s0", "sig0", "newsumgrads", "temp", "v0", "v1", "max_GP_fit_iter", "iter_remove",
             "learn_rate_mult", "sampling_strat", "final_prune"]

# The prediction routine does not depend on the trial, so it is built once.
testing_algorithm = partial(diagnostics.get_pred_posterior_GP, reg=0.01, kern=kernel_funcs.gaussian)

for run, trial_seed in enumerate(runlist):

    # ------------------------------------------------------------------
    # Generating data and scaling data
    # ------------------------------------------------------------------
    lselect = []
    np.random.seed(trial_seed)  # Fixing trial seed
    # Timer kept under its own name so the module-level `t` (number of
    # thresholds, used to size the storage arrays) is not overwritten.
    t_start = time.time()
    Y, F, X, e, sigma, select = draw_GP.draw_parametric_savitsky(n, ntest, p, q, correlation)

    Y = Y.reshape(n + ntest, 1)
    F = F.reshape(n + ntest, 1)

    #Y = (Y-Y.mean())/Y.var()**0.5
    #F = (F-F.mean())/F.var()**0.5
    # Columns are standardised over all n+ntest rows, i.e. before the split.
    X = (X - X.mean(0)) / X.var(0)**0.5

    # Getting training and test set
    ytest = Y[n:]
    Xtest = X[n:]
    ftest = F[n:]
    y = Y[:n]
    X = X[:n]
    f = F[:n]
    print("data generated")
    # lselect is always empty here, so this branch never runs; it is kept for
    # experiments that supply true length-scales.
    if lselect:
        print("Length-scales are: ", lselect[select])
    print("Noise variance is: ", sigma**2)
    print("Average data variance is: ", np.mean(np.var(X, 0)))
    print(time.time() - t_start)

    # ------------------------------------------------------------------
    # Running algorithm
    # ------------------------------------------------------------------
    args = []
    arg_vals = []
    for i in range(nmodels):
        args.append(list(arg_names))
        # sampling_strat[i] replaces the previously hard-coded "unif" so the
        # per-model setting is actually used (all entries are "unif" today).
        arg_vals.append([1, minibatch_size[i], beta2, min(1000, n), step[i], [], False,
                         min_VBEM_iter[i], gp_iter[i], max_VBEM_iter[i], GPtol[i], VBtol, False,
                         np.var(y), np.var(y)**0.5, newsumgrads, temp, 1e+4, 1e-4, gp_iter[i],
                         iter_remove[i], 1, sampling_strat[i], False])

    (Runtime[run], Lambda[run], L[run], L1norm[run], L2norm[run], MSE_F[run], MSE_Y[run],
     Acc[run], Weighted_Acc[run], TPR[run], TNR[run], PPV[run], NPV[run], AUC[run],
     MCC[run]) = simulations.do_simulation_VBEMSSGP(
        y, X, ftest, ytest, Xtest, q,
        algorithm_training=fit.VB_EM_GP_SS, algorithm_testing=testing_algorithm,
        post_var=post_var, nmodels=m, args=args, arg_vals=arg_vals, SS_GP=SS_GP,
        hyper_opt=hyper_opt, train=train,
        hyper_arg=["v0", "v1"],
        hyper_vals=[1e+4 * 2**np.linspace(np.log2(100), -np.log2(100), 11),
                    2**np.linspace(np.log2(100), -np.log2(100), 11)],
        order_relevant_vars=False, order_irrelevant_vars=False,
        VS_threshs=VS_threshs, select=select, predict_selected=predict_selected,
        ltrue=lselect, MC_pred=MC_pred, post_fit=post_fit, model_select=model_select,
        model_weighting=model_weights)

    # Running means over the runs completed so far (current run included).
    done = run + 1
    print("RUN {0}".format(run))
    print("Runtime mean is:", Runtime[:done].mean(0))
    # Previously sliced [:run], which left out the current run and averaged
    # an empty slice (NaN plus a RuntimeWarning) on the first run.
    print("Weighted accuracy mean is:", Weighted_Acc[:done].mean(0))
    print("TPR mean is:", TPR[:done].mean(0))
    print("PPV mean is:", PPV[:done].mean(0))
    print("MCC mean is:", MCC[:done].mean(0))
    print("L1norm mean is:", L1norm[:done].mean(0))
    print("L2norm mean is:", L2norm[:done].mean(0))
    print("MSE_F mean is:", MSE_F[:done].mean(0))
    print("MSE_Y mean is:", MSE_Y[:done].mean(0), "\n")

    # Results of the current run only.
    print("Runtime is:", Runtime[run])
    print("Weighted accuracy is:", Weighted_Acc[run])
    print("TPR is:", TPR[run])
    print("PPV is:", PPV[run])
    print("MCC is:", MCC[run])
    print("L1norm is:", L1norm[run])
    print("L2norm is:", L2norm[run])
    print("MSE_F is:", MSE_F[run])
    print("MSE_Y is:", MSE_Y[run], "\n")
    

data generated
Noise variance is:  0.0025000000000000005
Average data variance is:  0.9969892827714567
0.002991199493408203
run time is : 29.81582474708557


  return kern(cdist(X,X, metric = "seuclidean", V = 1/l**2),s)
  return kern(cdist(X,Xtest, metric = "seuclidean", V = 1/l**2),s).T
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  MCC[j,i]=(TP*TN-FP*FN)/np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))


run time is : 18.263569831848145
run time is : 18.751848459243774
run time is : 11.052721738815308
run time is : 13.156780242919922
run time is : 11.400537729263306


KeyboardInterrupt: 

In [24]:
# Metric names and the matching storage arrays, in the same order.
namelist = ["Runtime", "MSE_F", "MSE_Y", "Acc", "Weighted_Acc", "TPR", "TNR", "PPV", "NPV", "AUC", "MCC"]
objlist = [Runtime, MSE_F, MSE_Y, Acc, Weighted_Acc, TPR, TNR, PPV, NPV, AUC, MCC]

# Row indices (runs) included in the summary: 0..99.
# Alternative kept from the original: iters = np.random.choice(1000,100,False)
iters = np.arange(100)

# Per-metric mean across the selected runs (axis 0).
for name, values in zip(namelist, objlist):
    print("Mean {0} is:".format(name), np.mean(values[iters], 0))

print("\n")
# Per-metric median across the selected runs.
for name, values in zip(namelist, objlist):
    print("Median {0} is:".format(name), np.median(values[iters], 0))

print("\n")
quant = 0.25
# For these three metrics the (1 - quant) quantile is computed, while every
# other metric uses the quant quantile; the printed label shows quant in both
# cases.
upper_quantile_metrics = ["Runtime", "MSE_F", "MSE_Y"]
for name, values in zip(namelist, objlist):
    level = 1 - quant if name in upper_quantile_metrics else quant
    print("{1} quantile {0} is:".format(name, quant), np.quantile(values[iters], level, 0))

Mean Runtime is: [31.81874273]
Mean MSE_F is: [0.13405923]
Mean MSE_Y is: [0.13595038]
Mean Acc is: [[0.98541 0.9982  0.99545 0.9949  0.994  ]]
Mean Weighted_Acc is: [[0.94710429 0.95270959 0.62083333 0.575      0.5       ]]
Mean TPR is: [[0.90833333 0.90666667 0.24166667 0.15       0.        ]]
Mean TNR is: [[0.98587525 0.99875252 1.         1.         1.        ]]
Mean PPV is: [[0.53157822 0.84333514 1.                nan        nan]]
Mean NPV is: [[0.99943302 0.99943593 0.99544364 0.9948955  0.994     ]]
Mean AUC is: [0.94689805]
Mean MCC is: [[0.6447404  0.87098273 0.48326039 0.36650283 0.        ]]


Median Runtime is: [31.89494574]
Median MSE_F is: [0.1198013]
Median MSE_Y is: [0.11660882]
Median Acc is: [[0.978 0.998 0.995 0.995 0.994]]
Median Weighted_Acc is: [[0.90610329 0.91616365 0.58333333 0.58333333 0.5       ]]
Median TPR is: [[0.83333333 0.83333333 0.16666667 0.16666667 0.        ]]
Median TNR is: [[0.97887324 0.99899396 1.         1.         1.        ]]
Median PPV is: 

In [25]:
# Bundle every result array under its own name so the whole experiment is
# stored in a single file.
Output = {"Runtime": Runtime, "Lambda": Lambda, "L": L, "L1norm": L1norm, "L2norm": L2norm,
          "MSE_F": MSE_F, "MSE_Y": MSE_Y, "Acc": Acc, "Weighted_Acc": Weighted_Acc,
          "TPR": TPR, "TNR": TNR, "PPV": PPV, "NPV": NPV, "AUC": AUC, "MCC": MCC}

# File name encoding the run date and the main settings; the per-model settings
# are taken from index 1 of their lists.
# The kernel tag is the first five characters of the kernel function's name.
# It used to be sliced out of str(kern) at fixed offsets, which only yields the
# name for one particular repr layout.
String = "Stage1_Savitsky_ZT_{0}_l0={1}_b2={2}_newgrads={3}_predselect={4}_MCpred={5}_hyperopt={11}_minibatch={12}_n={6}_p={7}_q={8}_kern={9}_runs={10}".format(
        date.today(), l_init, beta2, newsumgrads, predict_selected[1], MC_pred[1], n, p, q,
        kern.__name__[:5], nruns, hyper_opt[1], minibatch_size[1])

# np.save appends ".npy" and pickles the dict into an object array; read it
# back with np.load(path, allow_pickle=True).item().
np.save(String, Output) # saving