# Gaussian process with Spike and Slab ARD prior - Stage 1 GP experiment
## LR VI algorithm (continuous spike) - non additive kernels

### (1) Importing and defining all required functions

In [1]:
"""
Importing in libraries for SGD-SS-GP
"""
# Requirements for algorithms
import numpy as np
import scipy.special as sp
from functools import partial
from scipy.spatial.distance import cdist

# Requirements for simulations/monitoring algorithms
import time
from IPython.display import clear_output
import inspect
from sklearn.metrics import roc_auc_score

# Requirements For plots and diagnostics
import matplotlib.pyplot as plt
import scipy.stats as sps
from mpl_toolkits import mplot3d

# Make plots inline
%matplotlib inline

"""
Importing algorithm functions
"""
import os
os.chdir('C:/Users/hughw/Documents/MSC project/GP algorithms/Master function files')
from GP_funcs_FRSS import kernel_funcs
from GP_funcs_FRSS import model_funcs
from GP_funcs_FRSS import draw_GP
from GP_funcs_FRSS import fit
from GP_funcs_FRSS import diagnostics
from GP_funcs_FRSS import simulations
from functools import partial
os.chdir('C:/Users/hughw/Documents/MSC project/Simulation results')

### (2) Setting simulation parameters and models

In [2]:
"""
Simulation controls
"""
# Simulation settings
n=300
ntest=100
p=100
q=5
corr=0.5 # used for GP draw
r2=0.9 # used for GP draw
lsmean=0.25
sigma2=1
ltrue=np.ones(p)*lsmean/q**0.5
strue=1

nruns = 50


# Model run settings
nmodels = 1
VS_threshs = [[0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99],
             [0.5,0.9,0.95,0.99]]
iter_remove = [False,False,False,False,False,False,False]
sampling_strat = ["unif","unif", "unif", "unif", "unif", "unif", "unif"]
minibatch_size = [150,150,75,75,n,n]
GPtol=[ 1e-5,1e-5,1e-5,1e-5,1e-5,1e-5]
SS_GP=[ True,True,True,True,True,True]
MC_pred = [True,True,True,True,True,True,]
predict_selected = [False,False,False,False,False,False]
post_fit = [False,False,False,False,False,False]
train = [True,False,True, False, True,False] 
step = [0.025,0.025,0.025,0.025,0.0025,0.0025]
hyper_opt = [True,True,True,True,True,True]
model_select = [False,False,True,False,True,False,True]
post_var = [False,True,True,True,True,True,True]
model_weights = ["elpd", "elpd","elpd", "elpd","elpd", "elpd"]
min_VBEM_iter = [5,5,5,5,5,5]
max_VBEM_iter = [10,10,10,10,10,10]
gp_iter = [100,100,100,100,100,100]
opt = ["amsgrad", "amsgrad", "amsgrad", "amsgrad", "amsgrad", "amsgrad"]
l_init = 0.01
beta2=0.99
t = len(VS_threshs[0])
kern=kernel_funcs.gaussian
grad_kern=kernel_funcs.grad_gaussian
reg = 0.01
NN_pred = False
newsumgrads = False
VBtol=0.1/p
temp=1
v0=1e+4
v1=1e-4
postfit = "mixed"
# SEE BELOW FOR ARGVALS

# Storage objects
m=nmodels
Runtime=np.zeros((nruns, m))
Lambda = np.zeros((nruns, m, p))
L = np.zeros((nruns, m, p))
L1norm=np.zeros((nruns, m))
L2norm=np.zeros((nruns, m))
MSE_F=np.zeros((nruns, m))
MSE_Y=np.zeros((nruns,m))
Acc=np.zeros((nruns,m,t))
Weighted_Acc=np.zeros((nruns,m,t))
TPR=np.zeros((nruns,m,t))
TNR=np.zeros((nruns,m,t))
PPV=np.zeros((nruns,m,t))
NPV=np.zeros((nruns,m,t))
AUC=np.zeros((nruns,m))
MCC=np.zeros((nruns,m,t))

### (3) Drawing synthetic data, running algorithm, and displaying results

In [None]:
np.random.seed(8750)
runlist = np.random.choice(1000,nruns,False) # Choose 100 random trials
for run in range(len(runlist)):
#for run in [27,30]:
    
    """
    Generating data and scaling data
    """
    lselect=[]
    np.random.seed(runlist[run]) # Fixing trial seed
    t=time.time()
    Y,F,X,e,lselect,strue,sigma,select=draw_GP.draw_GP_ARD_lm(n,ntest,p,q,sigma2,corr,strue,ltrue,plot_YX=True,kern=kern,cop=False,r2=r2)
    
    Y = Y.reshape(n+ntest,1)
    F = F.reshape(n+ntest,1)
    
    Y = (Y-Y[:n].mean())/Y[:n].var()**0.5
    X = (X-X[:n].mean(0))/X[:n].var(0)**0.5
    F = (F-F[:n].mean())/F[:n].var()**0.5

    # Getting training and test set
    ytest=Y[n:]
    Xtest=X[n:]
    ftest=F[n:]
    y=Y[:n]
    X=X[:n]
    f=F[:n]
    print("data generated")
    if lselect.any():
        print("Length-scales are: ",lselect[select])
    print("Noise variance is: ",sigma**2)
    print("Average data variance is: ", np.mean(np.var(X,0)))
    print("Time taken to draw data : ", time.time()-t)
    
    """
    Running algorithm
    """
    args = []
    arg_vals = []
    for i in range(nmodels):
        args.append(["k","L0","seed","subsample","svi_subsample", "sampling_strat", "min_VBEM_iter", "max_VBEM_iter", "GP_fit_tol", "VBEM_tol", "max_GP_fit_iter", "ZT_init_iter", "iter_remove", "print_VBEM", "learn_rate", "optimisation", "final_prune"])
        arg_vals.append([10,1e-2,0,minibatch_size[i], 5,sampling_strat[i], min_VBEM_iter[i], max_VBEM_iter[i], GPtol[i], 0.1/p, gp_iter[i],gp_iter[i], iter_remove[i], False, step[i], opt[i], True])

    test_algorithm = partial(diagnostics.get_pred_posterior_GP,reg = reg,kern = kernel_funcs.gaussian)

    Runtime[run], Lambda[run], L[run], V, L1norm[run], L2norm[run], MSE_F[run], MSE_Y[run], Acc[run], Weighted_Acc[run], TPR[run], TNR[run], PPV[run], NPV[run], AUC[run], MCC[run]= simulations.do_simulation_VBEMSSGP(
                               y, X, ftest, ytest, Xtest, q, algorithm_training = fit.VB_EM_GP_SS, algorithm_testing = test_algorithm, 
                               nmodels = m, args = args, arg_vals = arg_vals, SS_GP = SS_GP, post_var = post_var,
                               order_relevant_vars = False, order_irrelevant_vars = False, VS_threshs = VS_threshs, 
                               select = select, predict_selected = predict_selected, hyper_opt = hyper_opt, hyper_arg = ["v0","v1"],
                                hyper_vals = [1e+4*2**np.linspace(np.log2(100),-np.log2(100),11),1e-4*2**np.linspace(np.log2(100),-np.log2(100),11)], ltrue=lselect, MC_pred = MC_pred, model_select = model_select, post_fit_subsample=n, train = train,
                                model_weighting = model_weights)
    
    print("RUN {0}".format(run))
    print("Runtime mean is:", Runtime[:run+1].mean(0))
    print("Weighted accuracy mean is:", Weighted_Acc[:run].mean(0))
    print("TPR mean is:", TPR[:run+1].mean(0))
    print("PPV mean is:", PPV[:run+1].mean(0))
    print("MCC mean is:", MCC[:run+1].mean(0))
    print("L1norm mean is:", L1norm[:run+1].mean(0))
    print("L2norm mean is:", L2norm[:run+1].mean(0))
    print("MSE_F mean is:", MSE_F[:run+1].mean(0))
    print("MSE_Y mean is:", MSE_Y[:run+1].mean(0), "\n")
    

R2= 0.8993636862772934
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.006067570867112703
Average data variance is:  1.0
Time taken to draw data :  0.0399928092956543
run time is : 62.80036211013794
run time is : 116.27208852767944
run time is : 116.88312315940857
run time is : 116.30704283714294
run time is : 118.93616676330566
run time is : 117.8506920337677
run time is : 63.3923761844635
run time is : 63.44168734550476
run time is : 66.93832111358643
run time is : 64.53914260864258
100
57
16
13
7
6
5
4
1
0
1
RUN 0
Runtime mean is: [1018.52270198]
Weighted accuracy mean is: [[nan nan nan nan]]
TPR mean is: [[1.  1.  1.  0.8]]
PPV mean is: [[0.71428571 1.         1.         1.        ]]
MCC mean is: [[0.83621057 1.         1.         0.88975652]]
L1norm mean is: [0.56567615]
L2norm mean is: [0.21481617]
MSE_F mean is: [0.02522313]
MSE_Y mean is: [0.14490854] 

R2= 0.9021920644185265
data generated
Length-scales are:  [0.1118

  print("Weighted accuracy mean is:", Weighted_Acc[:run].mean(0))
  ret = um.true_divide(


run time is : 63.94723129272461
run time is : 117.13794827461243
run time is : 116.82170486450195
run time is : 116.34641551971436
run time is : 117.3544921875
run time is : 116.29967451095581
run time is : 65.83924055099487
run time is : 64.80109977722168
run time is : 63.754558086395264
run time is : 63.6512188911438
100
54
13
11
5
5
5
3
2
0
1
RUN 1
Runtime mean is: [1016.80560338]
Weighted accuracy mean is: [[0.98947368 1.         1.         0.9       ]]
TPR mean is: [[1.  1.  1.  0.9]]
PPV mean is: [[0.58441558 1.         1.         1.        ]]
MCC mean is: [[0.74438635 1.         1.         0.94487826]]
L1norm mean is: [0.56781646]
L2norm mean is: [0.24230234]
MSE_F mean is: [0.01936317]
MSE_Y mean is: [0.11727905] 

R2= 0.9080835121498346
data generated
Length-scales are:  [0.1118034 0.1118034 0.1118034 0.1118034 0.1118034]
Noise variance is:  0.0054187767092858845
Average data variance is:  1.0
Time taken to draw data :  0.032912254333496094
run time is : 65.2457594871521
run t

In [20]:
    args = []
    arg_vals = []
    for i in range(nmodels):
        args.append(["k","L0","seed","subsample","svi_subsample", "sampling_strat", "min_VBEM_iter", "max_VBEM_iter", "GP_fit_tol", "VBEM_tol", "max_GP_fit_iter", "ZT_init_iter", "iter_remove", "print_VBEM", "learn_rate", "optimisation", "final_prune"])
        arg_vals.append([10,1e-2,0,minibatch_size[i], 1,sampling_strat[i], min_VBEM_iter[i], max_VBEM_iter[i], GPtol[i], 0.1/p, gp_iter[i],gp_iter[i], iter_remove[i], False, step[i], opt[i], True])

best_pair,selection_path,losses,Results =  fit.hyper_opt_SSGP(y, X, fit.VB_EM_GP_SS, test_algorithm, ["v0","v1"],  [1e+4*2**np.linspace(np.log2(100),-np.log2(100),11),1e-4*2**np.linspace(np.log2(100),-np.log2(100),11)], 
                                                                              method =  "ML", training_args=args[0],training_arg_vals=arg_vals[0])

run time is : 13.273494243621826
run time is : 24.671021461486816
run time is : 24.805387020111084
run time is : 25.049259424209595
run time is : 25.03174090385437
run time is : 20.72325897216797
run time is : 13.662130117416382
run time is : 13.686549186706543
run time is : 13.683055877685547
run time is : 13.817994356155396


In [21]:
for i in range(11):
    print(np.sum(Results[i][0]!=0))
    
Results[5][0]

100
51
15
11
5
5
5
3
2
0
0


array([0.30021596, 0.17543065, 0.13085911, 0.12242215, 0.3022712 ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [24]:
logevidences = np.zeros(len(Results))
for i in range(len(Results)):
    logevidences[i] =  diagnostics.get_pred_posterior_GP_NN_CV(y,X,Results[i],0.01,kern=kernel_funcs.gaussian,NN=n, fraction=1,post_var=True, print_=False, use_tree=False, leaf_size=100, seed=0, MC_iters = 1)
    print(i)



0
1
2
3
4
5
6
7
8
9
10


In [38]:
PIP,Ls,Ss,Sigs = np.zeros((len(Results), p)),np.zeros((len(Results), p)),np.zeros((len(Results), )),np.zeros((len(Results), ))
weights = np.zeros(len(Results))
max_logevidence = np.max(logevidences)

# Do weighting
for i in range(len(Results)):
    logevidence = logevidences[i]
    if logevidence >= max_logevidence-500:
        if model_select[0]:
            weights[i] = (logevidence==max_logevidence)*1
        else:
            weights[i] = np.exp(logevidence-max_logevidence)
    PIP[i] = Results[i][4]
    Ls[i] = np.abs(Results[i][0])
    Ss[i] = Results[i][2]
    Sigs[i] = Results[i][3]
weights = weights/weights.sum()
l = [Ls.T @ weights]
s = np.sum(Ss*weights)
sig = np.sum(Sigs*weights)
lmbda = PIP.T @ weights

In [39]:
lmbda

array([1.        , 0.9999808 , 0.99806386, 0.99806384, 1.        ,
       0.04053772, 0.04054718, 0.02478614, 0.04053195, 0.46577035,
       0.024798  , 0.0247825 , 0.02477985, 0.04053404, 0.04054442,
       0.02478292, 0.02478354, 0.02478224, 0.02478824, 0.04053222,
       0.04053262, 0.02479715, 0.04053549, 0.02478259, 0.46577172,
       0.02478486, 0.04053511, 0.040534  , 0.02478523, 0.04053166,
       0.02478298, 0.04053283, 0.28241989, 0.0405363 , 0.02478207,
       0.02478627, 0.02478273, 0.02477585, 0.0247836 , 0.0247749 ,
       0.02478266, 0.02478282, 0.0405294 , 0.04053209, 0.04053172,
       0.04053314, 0.04053201, 0.04053608, 0.04057536, 0.04053448,
       0.02478288, 0.28243073, 0.02478225, 0.02478245, 0.28241819,
       0.02478438, 0.04053252, 0.02478393, 0.04054659, 0.02478303,
       0.02478505, 0.02478249, 0.04053452, 0.04053808, 0.02478838,
       0.02478244, 0.02478343, 0.46577087, 0.02478328, 0.04053272,
       0.02477452, 0.02478244, 0.02478253, 0.04054576, 0.02478

In [None]:
namelist = ["Runtime", "MSE_F", "MSE_Y", "Acc", "Weighted_Acc", "TPR", "TNR", "PPV", "NPV", "AUC", "MCC"]
objlist = [Runtime, MSE_F, MSE_Y, Acc, Weighted_Acc, TPR, TNR, PPV, NPV, AUC, MCC]
#iters = np.random.choice(1000,100,False)
iters = np.linspace(0,99,100).astype(int)

for i in range(len(objlist)):
    print("Mean {0} is:".format(namelist[i]), np.mean(objlist[i][iters],0))

print("\n")
for i in range(len(objlist)):
    print("Median {0} is:".format(namelist[i]), np.median(objlist[i][iters],0))

print("\n")
quant = 0.25
for i in range(len(objlist)):
    if namelist[i] in ["Runtime", "MSE_F", "MSE_Y"]:
        print("{1} quantile {0} is:".format(namelist[i], quant), np.quantile(objlist[i][iters],1-quant,0))
    else:
        print("{1} quantile {0} is:".format(namelist[i], quant), np.quantile(objlist[i][iters],quant,0))

In [None]:
from datetime import date
Output = {"Runtime" : Runtime, "Lambda" : Lambda, "L" : L, "L1norm" : L1norm, "L2norm" : L2norm, "MSE_F" : MSE_F
        , "MSE_Y" : MSE_Y, "Acc" : Acc, "Weighted_Acc" : Weighted_Acc, "TPR" :TPR, "TNR" : TNR, "PPV" : PPV, "NPV" : NPV, "AUC" : AUC, "MCC" : MCC}
String = "Stage1_LR{13}_{0}_{1}_lsmean={12}_r2={14}_l0={2}_b2={3}_step={4}_MCpred={5}_n={6}_p={7}_q={8}_kern={9}_runs={10}_start={11}".format(
    "GP", date.today(), l_init, beta2, step[0], MC_pred[0],n,p,q,str(kern)[23:28], nruns, "random", lsmean, minibatch_size, r2)
np.save(String, Output) # saving