# Gaussian process with Spike and Slab ARD prior - Stage 1 GP experiment
## Zero-temperature-pruning VI algorithm (continuous spike) - non additive kernels

### (1) Importing and defining all required functions

In [27]:
"""
Importing in libraries for SGD-SS-GP
"""
# Requirements for algorithms
import numpy as np
import scipy.special as sp
from functools import partial
from scipy.spatial.distance import cdist

# Requirements for simulations/monitoring algorithms
import time
from IPython.display import clear_output
import inspect
from sklearn.metrics import roc_auc_score

# Requirements For plots and diagnostics
import matplotlib.pyplot as plt
import scipy.stats as sps
from mpl_toolkits import mplot3d

# Make plots inline
%matplotlib inline

"""
Importing algorithm functions
"""
import os
os.chdir('C:/Users/hughw/Documents/MSC project/GP algorithms/Master function files')
from GP_funcs_ZTMFSS import kernel_funcs
from GP_funcs_ZTMFSS import model_funcs
from GP_funcs_ZTMFSS import draw_GP
from GP_funcs_ZTMFSS import fit
from GP_funcs_ZTMFSS import diagnostics
from GP_funcs_ZTMFSS import simulations
from functools import partial
os.chdir('C:/Users/hughw/Documents/MSC project/Simulation results')

### (2) Setting simulation parameters and models

In [49]:
"""
Simulation controls
"""
# Simulation settings
n=300
ntest=100
p=100
q=5
corr=0.5 # used for GP draw
r2=2/3 # used for GP draw
lsmean=0.25
sigma2=1
ltrue=np.ones(p)*lsmean/q**0.5
strue=1

nruns = 100


# Model run settings
nmodels = 1
VS_threshs = [[0.01,0.05,0.1,0.2],
              [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99]]
iter_remove = [False, True,True,True,True, True, True]
sampling_strat = ["unif","unif", "unif", "unif", "unif", "unif", "unif"]
minibatch_size = [n,75, 75,150,150,n,n]
GPtol=[1e-6, 1e-5,1e-5,1e-5,1e-5,1e-5,1e-5]
SS_GP=[False, True,True,True,True,True,True]
MC_pred = [False,False,False,False,False,False,False]
predict_selected = [False,False,False,False,False,False,False]
post_fit = [False,False,False,False,False,False,False]
train = [True,True,False,True, False, True,False] 
step = [0.01,0.025,0.025,0.025,0.025,0.0025,0.0025]
hyper_opt = [False,True,True,True,True,True,True]
model_select = [True,False,True,False,True,False,True]
post_var = [False,True,True,True,True,True,True]
model_weights = ["","elpd", "elpd","elpd", "elpd","elpd", "elpd"]
min_VBEM_iter = [1,5,5,5,5,5,5]
max_VBEM_iter = [1,10,10,10,10,10,10]
gp_iter = [500,100,100,100,100,100,100]
X_mult = [1,1,1,1,1,10,10]
opt = ["adam", "amsgrad", "amsgrad", "amsgrad", "amsgrad", "amsgrad", "amsgrad"]
l_init = 0.01
beta2=0.99
t = len(VS_threshs[0])
kern=kernel_funcs.gaussian
grad_kern=kernel_funcs.grad_gaussian
reg = 0.01
NN_pred = False
newsumgrads = False
VBtol=0.1/p
temp=1
v0=1e+4
v1=1e-4
postfit = "mixed"
# SEE BELOW FOR ARGVALS

# Storage objects
m=nmodels
Runtime=np.zeros((nruns, m))
Lambda = np.zeros((nruns, m, p))
L = np.zeros((nruns, m, p))
L1norm=np.zeros((nruns, m))
L2norm=np.zeros((nruns, m))
MSE_F=np.zeros((nruns, m))
MSE_Y=np.zeros((nruns,m))
Acc=np.zeros((nruns,m,t))
Weighted_Acc=np.zeros((nruns,m,t))
TPR=np.zeros((nruns,m,t))
TNR=np.zeros((nruns,m,t))
PPV=np.zeros((nruns,m,t))
NPV=np.zeros((nruns,m,t))
AUC=np.zeros((nruns,m))
MCC=np.zeros((nruns,m,t))

### (3) Drawing synthetic data, running algorithm, and displaying results

In [50]:
np.random.seed(8750)
runlist = np.random.choice(1000,nruns,False) # Choose 100 random trials
for run in range(len(runlist)):
    
    """
    Generating data and scaling data
    """
    lselect=[]
    np.random.seed(runlist[run]) # Fixing trial seed
    t=time.time()
    Y,F,X,e,lselect,strue,sigma,select=draw_GP.draw_GP_ARD_lm(n,ntest,p,q,sigma2,corr,strue,ltrue,plot_YX=True,kern=kern,cop=False,r2=r2)
    
    Y = Y.reshape(n+ntest,1)
    F = F.reshape(n+ntest,1)
    
    Y = (Y-Y[:n].mean())/Y[:n].var()**0.5
    X = (X-X[:n].mean(0))/X[:n].var(0)**0.5
    F = (F-F[:n].mean())/F[:n].var()**0.5

    # Getting training and test set
    ytest=Y[n:]
    Xtest=X[n:]
    ftest=F[n:]
    y=Y[:n]
    X=X[:n]
    f=F[:n]
    print("data generated")
    if lselect.any():
        print("Length-scales are: ",lselect[select])
    print("Noise variance is: ",sigma**2)
    print("Average data variance is: ", np.mean(np.var(X,0)))
    print("Time taken to draw data : ", time.time()-t)
    
    """
    Running algorithm
    """
    args = []
    arg_vals = []
    for i in range(nmodels):
        args.append(["seed","subsample", "sampling_strat", "min_VBEM_iter", "max_VBEM_iter", "GP_fit_tol", "VBEM_tol", "max_GP_fit_iter", "init_GP_iter", "iter_remove", "print_VBEM", "learn_rate", "optimisation", "X_mult"])
        arg_vals.append([0,minibatch_size[i], sampling_strat[i], min_VBEM_iter[i], max_VBEM_iter[i], GPtol[i], 0.1/p, gp_iter[i],gp_iter[i], iter_remove[i], False, step[i], opt[i], X_mult[i]])

    test_algorithm = partial(diagnostics.get_pred_posterior_GP,reg = reg,kern = kernel_funcs.gaussian)

    Runtime[run], Lambda[run], L[run], L1norm[run], L2norm[run], MSE_F[run], MSE_Y[run], Acc[run], Weighted_Acc[run], TPR[run], TNR[run], PPV[run], NPV[run], AUC[run], MCC[run]= simulations.do_simulation_VBEMSSGP(
                               y, X, ftest, ytest, Xtest, q, algorithm_training = fit.VB_EM_GP_SS, algorithm_testing = test_algorithm, 
                               nmodels = m, args = args, arg_vals = arg_vals, post_fit = post_fit, SS_GP = SS_GP, post_var = post_var,
                               order_relevant_vars = False, order_irrelevant_vars = False, VS_threshs = VS_threshs, 
                               select = select, predict_selected = predict_selected, hyper_opt = hyper_opt, hyper_arg = ["v0","v1"],
                                hyper_vals = [1e+4*2**np.linspace(np.log2(100),-np.log2(100),11),1e-4*2**np.linspace(np.log2(100),-np.log2(100),11)], ltrue=lselect, MC_pred = MC_pred, model_select = model_select, post_fit_subsample=n, train = train,
                                model_weighting = model_weights)
    
    print("RUN {0}".format(run))
    print("Runtime mean is:", Runtime[:run+1].mean(0))
    print("Weighted accuracy mean is:", Weighted_Acc[:run].mean(0))
    print("TPR mean is:", TPR[:run+1].mean(0))
    print("PPV mean is:", PPV[:run+1].mean(0))
    print("MCC mean is:", MCC[:run+1].mean(0))
    print("L1norm mean is:", L1norm[:run+1].mean(0))
    print("L2norm mean is:", L2norm[:run+1].mean(0))
    print("MSE_F mean is:", MSE_F[:run+1].mean(0))
    print("MSE_Y mean is:", MSE_Y[:run+1].mean(0), "\n")
    

R2= 0.6650981093248958
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.027304068902007185
Average data variance is:  1.0
Time taken to draw data :  0.03789377212524414
run time is : 40.623807191848755
RUN 0
Runtime mean is: [40.63793755]
Weighted accuracy mean is: [[nan nan nan nan]]
TPR mean is: [[1.  1.  0.8 0. ]]
PPV mean is: [[0.17241379 0.45454545 0.57142857 0.        ]]
MCC mean is: [[ 0.35896605  0.65256212  0.65638034 -0.02305715]]
L1norm mean is: [1.51441338]
L2norm mean is: [0.43852963]
MSE_F mean is: [0.34777799]
MSE_Y mean is: [0.51430425] 

R2= 0.6721101071837331
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.025248529499040346
Average data variance is:  1.0000000000000002
Time taken to draw data :  0.03383898735046387


  print("Weighted accuracy mean is:", Weighted_Acc[:run].mean(0))
  ret = um.true_divide(


run time is : 26.25614047050476
RUN 1
Runtime mean is: [33.45443285]
Weighted accuracy mean is: [[0.87368421 0.96842105 0.88421053 0.49473684]]
TPR mean is: [[1.  1.  0.7 0.2]]
PPV mean is: [[0.17241379 0.64393939 0.78571429 0.5       ]]
MCC mean is: [[0.35896605 0.78030788 0.71147494 0.29982135]]
L1norm mean is: [1.33954773]
L2norm mean is: [0.37400693]
MSE_F mean is: [0.237427]
MSE_Y mean is: [0.41400265] 

R2= 0.6870532017129913
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.0243844951917865
Average data variance is:  1.0
Time taken to draw data :  0.03289389610290527
run time is : 40.37881541252136
RUN 2
Runtime mean is: [35.76754681]
Weighted accuracy mean is: [[0.87368421 0.98157895 0.84210526 0.59736842]]
TPR mean is: [[1.         1.         0.73333333 0.2       ]]
PPV mean is: [[0.15197957 0.51701223 0.64502165 0.5       ]]
MCC mean is: [[0.32385358 0.67809999 0.64295628 0.29820193]]
L1norm mean is: [1.86521994]
L2no

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  MCC[j,i]=(TP*TN-FP*FN)/np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))


run time is : 21.011659622192383
RUN 49
Runtime mean is: [33.0404142]
Weighted accuracy mean is: [[0.79409237 0.86122449 0.8471536  0.68131042]]
TPR mean is: [[0.9   0.856 0.76  0.392]]
PPV mean is: [[0.13614267 0.30790264 0.46510226        nan]]
MCC mean is: [[0.27544461 0.45899766 0.55217089 0.41920126]]
L1norm mean is: [2.55629626]
L2norm mean is: [0.58583606]
MSE_F mean is: [0.35071986]
MSE_Y mean is: [0.49057808] 

R2= 0.6496737966259072
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.07348664634452057
Average data variance is:  1.0
Time taken to draw data :  0.03308582305908203
run time is : 41.63550806045532
RUN 50
Runtime mean is: [33.20923404]
Weighted accuracy mean is: [[0.79547368 0.86305263 0.84968421 0.68547368]]
TPR mean is: [[0.90196078 0.85490196 0.76078431 0.38823529]]
PPV mean is: [[0.13605319 0.30970847 0.4690545         nan]]
MCC mean is: [[0.27578963 0.46049385 0.55536069 0.41676522]]
L1norm mean is: [2.5

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  MCC[j,i]=(TP*TN-FP*FN)/np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))


run time is : 26.981663465499878
RUN 59
Runtime mean is: [32.72056419]
Weighted accuracy mean is: [[0.8014273  0.87100803 0.85209634 0.69009813]]
TPR mean is: [[0.91333333 0.87       0.76       0.4       ]]
PPV mean is: [[0.13708457 0.32013393 0.48563453        nan]]
MCC mean is: [[0.28026075 0.47491101 0.56431862 0.43405735]]
L1norm mean is: [2.49991609]
L2norm mean is: [0.56918809]
MSE_F mean is: [0.3502473]
MSE_Y mean is: [0.49508092] 

R2= 0.6741326595966766
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.04539169882446855
Average data variance is:  0.9999999999999999
Time taken to draw data :  0.031882524490356445
run time is : 41.485352754592896
RUN 60
Runtime mean is: [32.86449756]
Weighted accuracy mean is: [[0.80131579 0.87236842 0.85096491 0.69017544]]
TPR mean is: [[0.9147541  0.87213115 0.76065574 0.4       ]]
PPV mean is: [[0.13688646 0.3191999  0.48587003        nan]]
MCC mean is: [[0.28027247 0.4748909  0.56504

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  MCC[j,i]=(TP*TN-FP*FN)/np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))


run time is : 41.47565317153931
RUN 66
Runtime mean is: [32.98474634]
Weighted accuracy mean is: [[0.80271132 0.87424242 0.85645933 0.69210526]]
TPR mean is: [[0.91641791 0.87164179 0.77014925 0.40298507]]
PPV mean is: [[0.13636035 0.31836569 0.49204674        nan]]
MCC mean is: [[0.27992674 0.47492237 0.57398817 0.44007235]]
L1norm mean is: [2.47406166]
L2norm mean is: [0.56058365]
MSE_F mean is: [0.35205701]
MSE_Y mean is: [0.50037911] 

R2= 0.6645565215489438
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.012909384544755003
Average data variance is:  1.0
Time taken to draw data :  0.03427863121032715
run time is : 27.07520818710327
RUN 67
Runtime mean is: [32.89804415]
Weighted accuracy mean is: [[0.8017282  0.87368421 0.85679497 0.69214454]]
TPR mean is: [[0.91764706 0.87352941 0.77352941 0.40294118]]
PPV mean is: [[0.13672697 0.31981129 0.49531496        nan]]
MCC mean is: [[0.28084353 0.4770744  0.57784438 0.44275805]]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  MCC[j,i]=(TP*TN-FP*FN)/np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))


run time is : 10.52881932258606
RUN 71
Runtime mean is: [32.74490296]
Weighted accuracy mean is: [[0.80340993 0.87598221 0.85915493 0.68932543]]
TPR mean is: [[0.91944444 0.875      0.775      0.4       ]]
PPV mean is: [[0.13619782 0.32484201 0.49459939        nan]]
MCC mean is: [[0.28022855 0.47962851 0.57691127 0.43457324]]
L1norm mean is: [2.49262884]
L2norm mean is: [0.5634541]
MSE_F mean is: [0.35136797]
MSE_Y mean is: [0.50145836] 

R2= 0.710224208966546
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.021197466980387332
Average data variance is:  1.0
Time taken to draw data :  0.030498981475830078
run time is : 42.12692213058472
RUN 72
Runtime mean is: [32.8736419]
Weighted accuracy mean is: [[0.80226608 0.8748538  0.85884503 0.69057018]]
TPR mean is: [[0.92054795 0.8739726  0.77260274 0.4       ]]
PPV mean is: [[0.13608833 0.32495833 0.49604323        nan]]
MCC mean is: [[0.28032017 0.47963449 0.57693916 0.43715032]]
L

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  MCC[j,i]=(TP*TN-FP*FN)/np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))


run time is : 26.995935916900635
RUN 75
Runtime mean is: [32.71702067]
Weighted accuracy mean is: [[0.80414035 0.87375439 0.85677193 0.68814035]]
TPR mean is: [[0.92368421 0.87368421 0.77368421 0.39736842]]
PPV mean is: [[0.1368574  0.32789544 0.49510293        nan]]
MCC mean is: [[0.28227805 0.48077835 0.57653178 0.43406641]]
L1norm mean is: [2.48445322]
L2norm mean is: [0.56118487]
MSE_F mean is: [0.35037492]
MSE_Y mean is: [0.49868518] 

R2= 0.6479396023056606
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.06444900484460722
Average data variance is:  1.0
Time taken to draw data :  0.035400390625
run time is : 41.749282121658325
RUN 76
Runtime mean is: [32.83452407]
Weighted accuracy mean is: [[0.80443213 0.87430748 0.85817175 0.68954294]]
TPR mean is: [[0.91948052 0.87012987 0.76883117 0.39480519]]
PPV mean is: [[0.13613303 0.32580156 0.4923836         nan]]
MCC mean is: [[0.28003145 0.47779163 0.57289786 0.43139835]]
L1n

In [53]:
namelist = ["Runtime", "MSE_F", "MSE_Y", "Acc", "Weighted_Acc", "TPR", "TNR", "PPV", "NPV", "AUC", "MCC"]
objlist = [Runtime, MSE_F, MSE_Y, Acc, Weighted_Acc, TPR, TNR, PPV, NPV, AUC, MCC]
#iters = np.random.choice(1000,100,False)
iters = np.linspace(0,99,100).astype(int)

for i in range(len(objlist)):
    print("Mean {0} is:".format(namelist[i]), np.mean(objlist[i][iters],0))

print("\n")
for i in range(len(objlist)):
    print("Median {0} is:".format(namelist[i]), np.median(objlist[i][iters],0))

print("\n")
quant = 0.25
for i in range(len(objlist)):
    if namelist[i] in ["Runtime", "MSE_F", "MSE_Y"]:
        print("{1} quantile {0} is:".format(namelist[i], quant), np.quantile(objlist[i][iters],1-quant,0))
    else:
        print("{1} quantile {0} is:".format(namelist[i], quant), np.quantile(objlist[i][iters],quant,0))

Mean Runtime is: [32.26076068]
Mean MSE_F is: [0.34576179]
Mean MSE_Y is: [0.49366939]
Mean Acc is: [[0.6971 0.8705 0.9317 0.9516]]
Mean Weighted_Acc is: [[0.80552632 0.87215789 0.85794737 0.69221053]]
Mean TPR is: [[0.926 0.874 0.776 0.404]]
Mean TNR is: [[0.68505263 0.87031579 0.93989474 0.98042105]]
Mean PPV is: [[0.13701673 0.31683114 0.49173015        nan]]
Mean NPV is: [[0.99440781 0.99239925 0.98775432 0.96921215]]
Mean AUC is: [0.91734737]
Mean MCC is: [[0.28317771 0.47155297 0.57229775 0.43315725]]


Median Runtime is: [28.46202087]
Median MSE_F is: [0.33300526]
Median MSE_Y is: [0.47241939]
Median Acc is: [[0.69 0.87 0.94 0.95]]
Median Weighted_Acc is: [[0.82631579 0.87368421 0.87631579 0.69473684]]
Median TPR is: [[1.  1.  0.8 0.4]]
Median TNR is: [[0.68421053 0.88421053 0.94736842 0.98421053]]
Median PPV is: [[0.13513514 0.26666667 0.44949495        nan]]
Median NPV is: [[1.         1.         0.98907071 0.96907216]]
Median AUC is: [0.94842105]
Median MCC is: [[0.29304023 0

In [54]:
from datetime import date
Output = {"Runtime" : Runtime, "Lambda" : Lambda, "L" : L, "L1norm" : L1norm, "L2norm" : L2norm, "MSE_F" : MSE_F
        , "MSE_Y" : MSE_Y, "Acc" : Acc, "Weighted_Acc" : Weighted_Acc, "TPR" :TPR, "TNR" : TNR, "PPV" : PPV, "NPV" : NPV, "AUC" : AUC, "MCC" : MCC}
String = "Stage1_MLII{13}_{0}_{1}_lsmean={12}_r2={14}_l0={2}_b2={3}_step={4}_MCpred={5}_n={6}_p={7}_q={8}_kern={9}_runs={10}_start={11}".format(
    "GP", date.today(), l_init, beta2, step[0], MC_pred[0],n,p,q,str(kern)[23:28], nruns, "random", lsmean, minibatch_size, r2)
np.save(String, Output) # saving