# Notebook for Testing different confidence bands
The code below approximates the quantiles needed for the confidence bands by simulation.

In [1]:
import numpy as np
import math
import sys
import os
import itertools
import time
from tqdm import tqdm
from joblib import Parallel, delayed
from IPython.display import display, HTML
from scipy import stats
from scipy.sparse import csr_matrix
from scipy import special
from numpy.linalg import cholesky
import matplotlib.pyplot as plt
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import splu
src_path = os.path.abspath(os.path.join(os.getcwd(), "../src"))
sys.path.append(src_path)
from forest_v2 import RegressionTreeModel
from forest_v2 import RandomForestModel
import functions as fcts

# Bootstrap CB test with Gaussian multiplier

In [3]:
# --- regression model --------------------------------------------------------
# dimension of the feature space
p = 2

# regression function m (taken from the local `functions` module)
regression_fct = fcts.m_p2_01

# --- forest parameters -------------------------------------------------------
# depth of the trees (values tried: 5, 6)
k = 5

# smaller depth for the evaluation grid; k2 = 4 is sufficient (shown in another notebook)
k2 = 4

# Ehrenfest parameters
B = 12
delta = 7

# --- sampling ----------------------------------------------------------------
# sample size
n_samples = 250
# subsample factor
r = 0.75

# levels beta; the confidence bands are reported for theoretical coverage 1 - beta
beta = np.array([0.1, 0.05, 0.01])

# print floats without scientific notation
np.set_printoptions(suppress=True)

In [4]:
# Make sure the text file that collects the simulation results exists;
# the header line is written only when the file is created here.
if os.path.isfile('Bootstrap results.txt'):
    print("The file 'Bootstrap results.txt' already existed.")
else:
    with open('Bootstrap results.txt', 'w') as f:
        f.write("Simulation results of b confidence bands" + '\n')
    print("The file 'Bootstrap results.txt' was created.")

The file 'Bootstrap results.txt' already existed.


In [5]:
# Evaluation grid for the supremum: along each axis the unit interval is cut
# into g = 2**k2 cells, and every cell contributes two points, one shifted by
# eps to the right of its left edge and one shifted by eps to the left of its
# right edge.
eps = 2.0 ** -20
g = 2 ** k2
cell_width = 1 / g
splits = np.linspace(0, 1, num=g, endpoint=False)
xl = splits + eps
xr = splits + cell_width - eps
axis_grid = np.concatenate([xl, xr])
axis_grid.sort()
# Cartesian product over the p axes: one grid point per row
prod = list(itertools.product(axis_grid, repeat=p))
grid = np.array(prod)

In [7]:
def bootstrap_run(j):
    """Run one bootstrap replication and return its sup-statistic.

    Fresh features are drawn uniformly on [0, 1)^p. The responses are
    residuals resampled with replacement and multiplied by independent
    standard normal multipliers. A new forest is trained on this sample and
    evaluated on ``X_test``.

    The function reads the notebook globals ``rand_seed``, ``i``,
    ``residuals``, ``n_samples``, ``p``, ``n_trees``, ``k``, ``r``,
    ``rf_type``, ``delta``, ``B`` and ``X_test``. Several of them are only
    assigned inside the simulation loop, so the function is meant to be
    called from there.

    Parameters
    ----------
    j : int
        Index of the bootstrap replication; it is part of the seed sequence.

    Returns
    -------
    numpy scalar
        Maximum absolute prediction of the bootstrap forest on ``X_test``.
    """
    # One reproducible stream per (seed, test run i, replication j).
    boot_rng = np.random.default_rng([rand_seed, i, j])

    X_B = boot_rng.random(n_samples*p).reshape(n_samples, p)
    e_hat = boot_rng.choice(residuals, size=n_samples, replace=True)*boot_rng.normal(0, 1, n_samples)

    model_boot = RandomForestModel(n_trees=n_trees, max_depth=k, sample_size_fct=r, tree_type=rf_type, delta=delta, B=B)

    # The forest is understood to draw from numpy's global generator
    # (NOTE(review): inferred from the notebook comments, forest_v2 is not
    # visible here). In a joblib worker process that generator is not seeded
    # by the notebook, so it is seeded here from the replication's own stream.
    # The seed is drawn after X_B and e_hat so that those keep their values.
    forest_seed = int(boot_rng.integers(2**32))
    saved_state = np.random.get_state()
    np.random.seed(forest_seed)
    try:
        model_boot.train(X_B, e_hat)
        boot_pred_grid = model_boot.predict(X_test)
    finally:
        # Put the global generator back so that a sequential (in-process) run
        # does not disturb the stream used for the main forests.
        np.random.set_state(saved_state)

    B_sup = np.max(np.abs(boot_pred_grid))
    return B_sup

In [10]:
# -----------------------------------------------------------------------------
# Settings that are varied between simulation runs
# -----------------------------------------------------------------------------

# number of CPUs used for the bootstrap replications
n_cpus = 2

# number of confidence bands
n_tests = 1000

# number of bootstrap samples
n_boot = 100

# random seed for both data and tree construction (two generators for replicable results)
rand_seed = 123456

# error standard deviations to loop over; values used: [0.75, 1, 1.25]
sigs = [1]

# error distributions to loop over; possible entries: 'norm', 'uni', 'tdx',
# where x is a placeholder for the degrees of freedom (e.g. 'td4', 'td6')
distributions = ['td6']

# numbers of trees to loop over (e.g. [50, 100])
n_trees_vec = [50]

# multiplier to include/exclude the regression function:
# 1 for the normal regression model, 0 to set m=0 in order to test the
# asymptotic distribution without approximation error
m_factor = 1

# forest types to loop over: 'Uni' and/or 'Ehr'
rf_types = ['Uni', 'Ehr']

# -----------------------------------------------------------------------------

# evaluation points and the (scaled) true regression function on them
X_test = grid
m_true_grid = m_factor * regression_fct(X_test)

# Simulation study of the bootstrap confidence bands. For every combination of
# error std, error distribution, number of trees and forest type:
#   1. simulate n_tests data sets and fit a forest to each,
#   2. record the sup-distance between the forest and the true function on the grid,
#   3. compute the bootstrap quantiles of the sup-statistic (bootstrap_run),
#   4. append coverage and average band radius to 'Bootstrap results.txt'.
for sigma in sigs:
    for dist_name in distributions:
        if dist_name == "norm":
            error_dist=stats.norm(loc=0, scale=sigma)
        elif dist_name == "uni":
            # centred uniform distribution of width sigma*sqrt(12), i.e. with std sigma
            error_dist=stats.uniform(loc=-0.5*sigma*np.sqrt(12), scale=sigma*np.sqrt(12))
        elif dist_name[:2]=="td":
            try:
                df=int(dist_name[2:])
            except ValueError as e:
                print(f"An error occured: {e}.", "Distribution not known. Skipped to next distribution.")
                continue
            if df <= 2:
                # the rescaling below needs (df-2)/df > 0, otherwise the scale is 0 or NaN
                print("t-distribution needs more than 2 degrees of freedom. Skipped to next distribution.")
                continue
            # t-distribution rescaled to have std sigma (its variance is df/(df-2))
            error_dist=stats.t(df=df,loc=0,scale = sigma* np.sqrt((df-2)/df))
        else:
            print("Distribution not known. Skipped to next distribution.")
            continue

        for n_trees in n_trees_vec:

            for rf_type in rf_types:

                if rf_type=='Uni':
                    model = RandomForestModel(n_trees=n_trees, max_depth=k,sample_size_fct=r,tree_type="Uni")
                elif rf_type=='Ehr':
                    model = RandomForestModel(n_trees=n_trees, max_depth=k,sample_size_fct=r,tree_type="Ehr",delta=delta,B=B)
                else:
                    print("Forest type not known. Skipped to next one.")
                    continue

                # sup-errors of the forests and the matching bootstrap quantiles, one entry per test run
                sups=[]
                quant_rows=[]

                # restart both random streams so that every configuration is replicable on its own
                np.random.seed(rand_seed)
                data_rng = np.random.default_rng(rand_seed)

                print("Simulation progress for RF type "+rf_type +" and error distribution "+dist_name+" with sigma =",sigma," and n_trees =",n_trees, ":")

                for i in tqdm(range(n_tests)):
                    #data from a different random generator than the one used for the random forests
                    e = error_dist.rvs(size=n_samples,random_state=data_rng)
                    X = data_rng.random(n_samples*p).reshape(n_samples,p)
                    m = m_factor*regression_fct(X)
                    Y=m+e

                    model.clear()
                    model.train(X,Y) #randomness from default numpy generator

                    # in-sample residuals; bootstrap_run reads them (and i) as globals
                    X_preds = model.predict(X)
                    residuals = Y-X_preds

                    m_hat_grid = model.predict(X_test)
                    sup=np.max(abs(m_true_grid-m_hat_grid))
                    sups.append(sup)

                    B_sups = Parallel(n_jobs=n_cpus)(
                        delayed(bootstrap_run)(j)
                        for j in range(n_boot)
                    )

                    # (1-beta)-quantiles of the bootstrap sup-statistic = radii of the bands
                    quant_rows.append(np.quantile(B_sups,1-beta))

                # one row per test run, one column per level in beta
                quants=np.array(quant_rows).reshape(-1,len(beta))

                n_CB=len(sups)
                # a band covers when the true sup-error is below its radius
                cover_num=np.sum(np.array(sups).reshape((n_CB,1))<quants,axis=0)

                avg_cb_rad=np.mean(quants, axis=0)

                if m_factor==1:
                    m_info = " and m = "+regression_fct.__name__
                elif m_factor ==0:
                    m_info = " and m set to zero."
                else:
                    m_info = " and m = "+str(m_factor)+"*"+regression_fct.__name__

                result_txt = ["","","Results  at "+time.ctime(),
                              "for Random Seed = "+str(rand_seed),
                              "and Regression model with:",
                              "p = "+str(p)+m_info,
                              "RF type:"+rf_type,
                              "Parameters:",
                              "Sample size: "+str(n_samples),
                              "k="+str(k),
                              "r="+str(r*n_samples)]
                if error_dist.dist.name == 't':
                    result_txt.append("Error distribution: t-Distribution with "+str(df)+" degrees of freedom")
                else:
                    result_txt.append("Error distribution: " +str(error_dist.dist.name))

                result_txt.append("Error std: "+str(sigma))
                result_txt.append("Number of trees: "+str(n_trees))
                result_txt.append("Number of CBS: "+str(n_CB))
                result_txt+=["","Empirical Coverage for confidence bands with theoretical coverage "+ str(1-beta)+":"]
                result_txt.append("Number: "+str(cover_num)+", percentage: "+ str(cover_num/n_CB))
                result_txt+=["","Average confidence band radius for theoretical coverage "+ str(1-beta)+":"]
                result_txt.append(str(avg_cb_rad))

                # save results of combination in txt file
                with open('Bootstrap results.txt', 'a') as f:
                    for line in result_txt:
                        f.write(line + '\n')

                print("Simulation for "+rf_type+"-RF with distribution "+dist_name+" with sigma =",sigma," and n_trees =",n_trees, " complete.")

Simulation progress for RF type Uni and error distribution td6 with sigma = 1  and n_trees = 50 :


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [5:48:24<00:00, 20.90s/it]


Simulation for Uni-RF with distribution td6 with sigma = 1  and n_trees = 50  complete.
Simulation progress for RF type Ehr and error distribution td6 with sigma = 1  and n_trees = 50 :


100%|████████████████████████████████████████████████████████████████████████████| 1000/1000 [6:59:39<00:00, 25.18s/it]

Simulation for Ehr-RF with distribution td6 with sigma = 1  and n_trees = 50  complete.



