# Gaussian process with Spike and Slab ARD prior - Stage 1 GP experiment
## Testing embedded variable selection algorithms

### (1) Importing and defining all required functions

In [1]:
"""
Importing required libraries and defining key functions
"""
# Requirements for algorithms
import numpy as np
import scipy.special as sp
from scipy.spatial.distance import cdist 

# Requirements For plots and diagnostics
import matplotlib.pyplot as plt
import scipy.stats as sps
import time
from sklearn.metrics import roc_auc_score
from datetime import date
import inspect
import pickle
from IPython.display import clear_output

# Algorithm functions
# NOTE(review): the working directory is switched before the GP_funcs_ZTMFSS
# imports, presumably so that this project-local module can be found. The path is
# an absolute, machine-specific location and must be edited to run elsewhere.
import os
os.chdir('C:/Users/hughw/Documents/MSC project/GP algorithms/Master function files')
from GP_funcs_ZTMFSS import kernel_funcs
from GP_funcs_ZTMFSS import draw_GP
from GP_funcs_ZTMFSS import simulations
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# Leave the kernel in the results folder, so relative paths used after this cell
# resolve there until another os.chdir call changes the directory again.
os.chdir('C:/Users/hughw/Documents/MSC project/Simulation results')

### (2) Setting simulation parameters and models

In [2]:
"""
Importing in rpy2
"""

# The path must be a raw string: in an ordinary literal the "\r" of "\rpy2" is the
# carriage-return escape, so R_USER would silently receive a corrupted path (and
# "\A", "\L", "\s" are invalid escapes that newer Python versions warn about).
# R_USER has to be set before rpy2 is imported, hence the import order below.
os.environ['R_USER'] = r'D:\Anaconda3\Lib\site-packages\rpy2'
import rpy2
print(rpy2.__version__)
import rpy2.robjects as robjects

from rpy2.robjects.packages import importr
# rpy2's package module (kept under both names used in this notebook)
import rpy2.robjects.packages as rpackages

# import R's "base" package
base = importr('base')
base.R_home()

# import R's "utils" package (a single import is enough; both names refer to
# the same rpy2 importer)
utils = rpackages.importr('utils')

# select a mirror for R packages
utils.chooseCRANmirror(ind=1) # select the first mirror in the list

3.4.5


<rpy2.rinterface_lib.sexp.NULLType object at 0x000001202075E7C0> [RTYPES.NILSXP]

In [9]:
"""
Simulation controls
"""
# --- Data-generating settings -------------------------------------------------
n, ntest = 300, 100          # training / test sample sizes
p, q = 100, 5                # number of inputs passed to the generator (p, q)
corr = 0.5                   # correlation setting passed to the generator
r2 = 0.9                     # R-squared setting passed to the generator
sigma_X = 1
strue = 1
lsmean = 0.25
ltrue = np.full(p, lsmean / q**0.5)   # same value lsmean/sqrt(q) for every input
kern = kernel_funcs.gaussian

# --- Experiment size ----------------------------------------------------------
nruns = 100                  # number of simulated trials
m = 9                        # number of competing models (one result column each)

# --- Result storage: one row per run, one column per model ---------------------
Runtime, MSE_F, MSE_Y, TPR, TNR, PPV, NPV, MCC = (
    np.zeros((nruns, m)) for _ in range(8)
)
RF_oob = np.zeros((nruns, 3))

### (3) Running algorithm iterations, saving and displaying results

In [10]:
def _cv_error(make_model, X_shuffle, y_shuffle, features, folds):
    """Sum over folds of the held-out MSE of a model restricted to `features`.

    make_model : zero-argument callable returning a fresh, unfitted regressor.
    X_shuffle, y_shuffle : already-shuffled training inputs and targets.
    features : column indices of X_shuffle to use.
    folds : number of contiguous folds (each of size int(n/folds)).
    """
    n_obs = len(y_shuffle)
    n_per_fold = int(n_obs / folds)
    total = 0.0
    for fold in range(folds):
        start = fold * n_per_fold
        stop = min(start + n_per_fold, n_obs)
        held_out = np.arange(start, stop)
        # Everything before the fold, plus everything after it; for the last fold
        # nothing after it is used for training.
        after = np.arange(stop, n_obs) if fold < folds - 1 else np.arange(0)
        kept = np.concatenate((np.arange(start), after))

        fitted = make_model().fit(X_shuffle[kept][:, features], y_shuffle[kept].ravel())
        preds = fitted.predict(X_shuffle[held_out][:, features])
        # Targets are stored as a column (k, 1) while predictions are (k,);
        # flatten first, otherwise the subtraction broadcasts to a (k, k) matrix
        # and the "MSE" averages every pairwise difference.
        total += np.mean((y_shuffle[held_out].ravel() - preds) ** 2)
    return total


def _select_and_predict(make_model, X, y, Xtest, folds=5,
                        thresholds=("1*mean", "2*mean", "4*mean", "8*mean", "16*mean")):
    """Importance-threshold variable selection with the threshold chosen by CV.

    For each threshold, features are selected with SelectFromModel on the full
    training set and scored by `_cv_error`; the best-scoring feature set is used
    to fit a final model that predicts on Xtest.

    Returns (test predictions, selected column indices, elapsed seconds).
    """
    started = time.time()
    n_obs, n_feat = X.shape

    # Shuffling the data once; the same split is reused for every threshold
    shuffled_indexes = np.random.choice(n_obs, n_obs, False)
    y_shuffle = y[shuffled_indexes]
    X_shuffle = X[shuffled_indexes]

    errcv = np.zeros(len(thresholds))
    candidates = []
    for i, threshold in enumerate(thresholds):
        selector = SelectFromModel(make_model(), threshold=threshold)
        selector.fit(X, y.ravel())
        support = selector.get_support()
        print(str(np.sum(support)), 'selected features')
        features = np.where(support)[0]
        if len(features) == 0:
            # Nothing passed the threshold: fall back to one random feature
            features = np.random.choice(n_feat, 1, False)
        candidates.append(features)
        errcv[i] = _cv_error(make_model, X_shuffle, y_shuffle, features, folds)

    # First threshold attaining the smallest CV error
    chosen = candidates[int(np.argmin(errcv))]
    preds = make_model().fit(X[:, chosen], y.ravel()).predict(Xtest[:, chosen])
    return preds, chosen, time.time() - started


def _selection_scores(chosen_mask, truth):
    """Return (PPV, NPV, TPR, TNR, MCC) of a 0/1 selection mask against `truth`.

    MCC is defined as 0 when any margin of the confusion table is empty.
    """
    truth = np.asarray(truth, dtype=float)
    ppv = np.mean(truth[chosen_mask > 0])
    npv = np.mean(1 - truth[chosen_mask == 0])
    tpr = np.mean(chosen_mask[truth > 0])
    tnr = np.mean(1 - chosen_mask[truth == 0])

    TP = np.sum(chosen_mask[truth > 0])
    TN = np.sum(1 - chosen_mask[truth == 0])
    FP = np.sum(1 - truth[chosen_mask > 0])
    FN = np.sum(truth[chosen_mask == 0])

    # Check the denominator first so no divide-by-zero warning is raised
    denom = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    mcc = 0 if denom == 0 else (TP * TN - FP * FN) / np.sqrt(denom)
    return ppv, npv, tpr, tnr, mcc


np.random.seed(8750)
runlist = np.random.choice(1000,100,False) # Choose 100 random trials
othermodels=False
SBGAM=True

# Tree-based selectors, in result-column order (0: random forest, 1: gradient boosting)
_tree_models = [
    lambda: RandomForestRegressor(n_estimators=100),
    lambda: GradientBoostingRegressor(n_estimators=100, learning_rate=0.01),
]
# Order in which the R scripts return their metrics
_metric_stores = [Runtime, MSE_F, MSE_Y, TPR, TNR, PPV, NPV, MCC]

for run, trial_seed in enumerate(runlist):

    # ---- Generating data and scaling data ----
    np.random.seed(trial_seed) # Fixing trial seed
    t=time.time()
    # NOTE(review): strue is overwritten by the generator's return value and so
    # carries over into the next run - confirm this is intended.
    Y,F,X,e,lselect,strue,sigma,select=draw_GP.draw_GP_ARD_lm(n,ntest,p,q,sigma_X,corr,strue,ltrue,plot_YX=True,kern=kern,cop=False,r2=r2)

    Y = Y.reshape(n+ntest,1)
    F = F.reshape(n+ntest,1)

    # Standardise using statistics of the combined train+test sample
    Y = (Y-Y.mean())/Y.var()**0.5
    F = (F-F.mean())/F.var()**0.5
    X = (X-X.mean(0))/X.var(0)**0.5

    # Getting training and test set (test rows are taken before X is truncated)
    ytest=Y[n:]
    Xtest=X[n:]
    ftest=F[n:]
    y=Y[:n]
    X=X[:n]
    f=F[:n]
    print("data generated")
    print("Noise variance is: ",sigma**2)
    print("Average data variance is: ", np.mean(np.var(X,0)))
    print(time.time()-t)

    # ---- Exporting data to enable running of R scripts ----
    os.chdir('C:/Users/hughw/Documents/MSC project/R scripts')
    exported = {"y": y, "f": f, "X": X, "ytest": ytest, "ftest": ftest,
                "Xtest": Xtest, "select": select.astype(float)}
    for fname, arr in exported.items():
        np.save(fname, arr)

    if othermodels:

        # ---- Random forest (column 0) and Gboost (column 1) with VS+CV ----
        for col, make_model in enumerate(_tree_models):
            preds, chosen, elapsed = _select_and_predict(make_model, X, y, Xtest)
            Runtime[run,col] = elapsed

            # MSE
            MSE_Y[run,col] = simulations.MSE_pc(preds.reshape(ntest,1),ytest)
            MSE_F[run,col] = simulations.MSE_pc(preds.reshape(ntest,1),ftest)

            # VS accuracy
            forest_select = np.zeros(p)
            forest_select[chosen]=1
            (PPV[run,col], NPV[run,col], TPR[run,col],
             TNR[run,col], MCC[run,col]) = _selection_scores(forest_select, select)

        # ---- Running ncvreg (columns 2-4) ----
        r=robjects.r
        output = r.source("ncvreg_code.R")
        for k, store in enumerate(_metric_stores):
            store[run,2:5]=output[0][k]

        # ---- Running sfgam (columns 5-7) ----
        r=robjects.r
        output = r.source("sparseGAM_code.R")
        for k, store in enumerate(_metric_stores):
            store[run,5:8]=output[0][k]

    # ---- Running sbgam (column 8) ----
    if SBGAM:
        r=robjects.r
        output = r.source("sparseBayesGAM_code.R")
        for k, store in enumerate(_metric_stores):
            store[run,8]=np.array(output[0][k])

    # ---- Removing files from directory ----
    for fname in exported:
        os.remove(fname + ".npy")

    # ---- Printing out current results ----
    print("RUN {0}".format(run))
    print("Runtime mean is:", Runtime[:run+1].mean(0))
    print("TPR mean is:", TPR[:run+1].mean(0))
    print("PPV mean is:", PPV[:run+1].mean(0))
    print("MCC mean is:", MCC[:run+1].mean(0))
    print("MSE_F mean is:", MSE_F[:run+1].mean(0))
    print("MSE_Y mean is:", MSE_Y[:run+1].mean(0), "\n")
    

R2= 0.8993636862772934
data generated
Noise variance is:  0.006067570867112703
Average data variance is:  0.9972629072424785
0.07080984115600586
Fold number 1 
Fold number 2 
Fold number 3 
lambda0 =  60 
RUN 0
Runtime mean is: [0.         0.         0.         0.         0.         0.
 0.         0.         1.05049987]
TPR mean is: [0.  0.  0.  0.  0.  0.  0.  0.  0.6]
PPV mean is: [0. 0. 0. 0. 0. 0. 0. 0. 1.]
MCC mean is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.76656954]
MSE_F mean is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.23702995]
MSE_Y mean is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.36967159] 

R2= 0.9021920644185265
data generated
Noise variance is:  0.0056107843331200755
Average data variance is:  1.0170416340411146
0.059836626052856445
Fold number 1 
Fold number 2 
Fold number 3 
lambda0 =  60 
RUN 1
Runtime mean is: [0.         0.         0.   

In [12]:
# Summary of every stored metric over the selected runs: mean, median, and the
# "bad-side" quartile (upper quartile for costs, lower quartile for accuracies).
namelist = ["Runtime", "MSE_F", "MSE_Y", "TPR", "TNR", "PPV", "NPV", "MCC"]
objlist = [Runtime, MSE_F, MSE_Y, TPR, TNR, PPV, NPV, MCC]
iters = np.arange(100)   # rows (runs) included in the summary

for label, values in zip(namelist, objlist):
    print("Mean {0} is:".format(label), np.mean(values[iters], 0))

print("\n")
for label, values in zip(namelist, objlist):
    print("Median {0} is:".format(label), np.median(values[iters], 0))

print("\n")
quant = 0.25
lower_is_better = ["Runtime", "MSE_F", "MSE_Y"]
for label, values in zip(namelist, objlist):
    # For cost-type metrics report the (1 - quant) quantile instead of quant
    level = 1 - quant if label in lower_is_better else quant
    print("{1} quantile {0} is:".format(label, quant), np.quantile(values[iters], level, 0))

Mean Runtime is: [ 0.          0.          0.          0.          0.          0.
  0.          0.         36.37381007]
Mean MSE_F is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.24034256]
Mean MSE_Y is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.34837011]
Mean TPR is: [0.    0.    0.    0.    0.    0.    0.    0.    0.606]
Mean TNR is: [0. 0. 0. 0. 0. 0. 0. 0. 1.]
Mean PPV is: [0. 0. 0. 0. 0. 0. 0. 0. 1.]
Mean NPV is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.97978341]
Mean MCC is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.76041299]


Median Runtime is: [ 0.          0.          0.          0.          0.          0.
  0.          0.         39.97057593]
Median MSE_F is: [0.         0.         0.         0.         0.         0.
 0.         0.         0.19172483]
Median MSE_Y is: [0.         0.         0.         0.       

# REMEMBER TO MULTIPLY RUNTIME BY 60

In [13]:
# Bundle every metric array under its name and write the bundle to a single
# .npy file whose name records lsmean and p.
_result_names = ["Runtime", "MSE_F", "MSE_Y", "TPR", "TNR", "PPV", "NPV", "MCC"]
_result_arrays = [Runtime, MSE_F, MSE_Y, TPR, TNR, PPV, NPV, MCC]
Output = dict(zip(_result_names, _result_arrays))

String = "Stage1_GP_lsmean={0}_p={1}_embedded_sslasso".format(lsmean,p)
np.save(String, Output) # saving