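This notebook tests my refactored HDF code.

I am improving my HDF implementation; the cells below check that the new `HDFOpt` class gives the same cross-validation estimates as the existing `CV_err_logi_fn` and `CV_err_linear_fn` functions.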

In [1]:
import sys
sys.path.append("../../mypkg")

In [2]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from numbers import Number

from easydict import EasyDict as edict
from tqdm import trange, tqdm
from scipy.io import loadmat
from pprint import pprint
import itertools
from scipy.stats import chi2


In [3]:
# This will reload all imports as soon as the code changes
%load_ext autoreload
%autoreload 2

In [4]:
from constants import DATA_ROOT, RES_ROOT, FIG_ROOT, MIDRES_ROOT
from default_paras import def_paras

from hdf_utils.data_gen import gen_covs, gen_simu_psd, gen_simu_psd_dataset
from hdf_utils.fns_sinica import coef_fn, fourier_basis_fn, gen_sini_Xthetas
from hdf_utils.likelihood import obt_lin_tm
from hdf_utils.SIS_ch import SIS_GLIM
from hdf_utils.utils import gen_lam_seq
from hdf_utils.hypo_test import  MS2idxs, obt_test_stat_simple2, obt_test_stat_simple3
from utils.matrix import col_vec_fn, col_vec2mat_fn, conju_grad, svd_inverse, cholesky_inv
from utils.functions import logit_fn
from utils.misc import save_pkl, load_pkl
from splines import obt_bsp_obasis_Rfn, obt_bsp_basis_Rfn_wrapper
from projection import euclidean_proj_l1ball
from optimization_ch.cross_validation import CV_err_linear_fn, CV_err_logi_fn
from optimization_ch.opt import optimization, HDFOpt
from optimization.variable_selection import GIC_fn, GCV_fn
from penalties.scad_pen import SCAD
from models.linear_model import LinearModel
from models.logistic_model import LogisticModel


from joblib import Parallel, delayed

In [5]:
# Apply the project's shared matplotlib style sheet, stored under FIG_ROOT.
plt.style.use(FIG_ROOT/"base.mplstyle")
# Make float64 the default floating dtype for every tensor created below.
# torch.set_default_tensor_type is deprecated (PyTorch >= 2.1); set_default_dtype is its replacement.
torch.set_default_dtype(torch.float64)

# Params

In [7]:
# B-spline basis constructor used throughout this notebook.
# (The `obasis` variant, presumably the orthogonalized basis, is kept here for quick switching.)
#obt_bsp = obt_bsp_obasis_Rfn
obt_bsp = obt_bsp_basis_Rfn_wrapper
# Seed NumPy's global RNG so the cell is repeatable.
np.random.seed(0)
# Start from the project defaults; edict gives attribute-style access (paras.key).
paras = edict(def_paras.copy())



# Others
paras.num_rep = 200 # presumably the number of simulation repetitions; not used in the cells shown here
paras.init_noise_sd = -1 # the sd of the noise added to the true value for initial values, if -1, make init 0
paras.SIS_ratio = 0.2 # the ratio to keep with SIS procedure
paras.SIS_pen = 0.02
paras.linear_theta_update="cholesky_inv"

# Candidate sets of the two tuning parameters
# lambda: penalty term
# N: number of basis functions
paras.can_lams = [0.01, 0.1, 0.2, 0.3, 0.4, 0.6, 1, 2, 8]
paras.can_Ns = [4, 6, 8, 10, 12]


# generating dataset
paras.n = 200 # num of data obs to be generated
paras.npts = 100 # num of pts to evaluate X(s)
paras.freqs = np.linspace(2, 45, paras.npts) # freqs: npts equally spaced values from 2 to 45
paras.d = 68 # num of ROIs
paras.q = 3 # num of other covariates
paras.sigma2 = 1 # variance of the error
# variance used for estimation, note that the value does not affect any results 
# as long as I tune the parameter properly
paras.norminal_sigma2 = 1 
# Passed as `types_` to gen_simu_psd_dataset; presumably one type spec per covariate (q = 3) -- confirm in hdf_utils.data_gen.
paras.types_ = ["int", "c", 2]
# Passed as `is_std` to gen_simu_psd_dataset.
paras.is_psd_std = False

# b-spline
def _bsp_basis_tensor(num_basis):
    """Evaluate the B-spline basis with `num_basis` functions on paras.x.

    Returns the matrix produced by `obt_bsp` as a torch tensor cast to the
    current default dtype. Reads `paras.x` and `paras.ord` at call time.
    """
    return torch.tensor(obt_bsp(paras.x, num_basis, paras.ord)).to(torch.get_default_dtype())


# Evaluation grid on [0, 1] with one point per observed location of X(s).
paras.x = np.linspace(0, 1, paras.npts)
# One basis matrix per candidate N, in the same order as paras.can_Ns.
# A comprehension is used so that no loop variable named `N` leaks into the
# notebook namespace (later cells use `N` for the selected tuning parameter).
paras.basis_mats = [_bsp_basis_tensor(num_basis) for num_basis in paras.can_Ns]
# Basis used by the SIS screening step, fixed at 8 basis functions.
paras.SIS_basis_mat = _bsp_basis_tensor(8)

# Ground-truth parameters
paras.alp_GT = np.array([5, -1, 2])

# The functional coefficients are built as Fourier-basis expansions evaluated on paras.x.
cs = [0.0, 0.0, 0.0] # for sinica paper
paras.fourier_basis = fourier_basis_fn(paras.x)[:, :]
# Coefficient columns, one per ROI:
#   - first three ROIs: coef_fn(0.2) scaled by cs (all zero here),
#   - ROIs 4 .. d-1: exactly zero (the length 50 must match len(coef_fn(0.2)) for the stack to be rectangular),
#   - last ROI: unscaled coef_fn(0.2), the only non-null signal.
paras.fourier_basis_coefs = np.array(
    [scale * coef_fn(0.2) for scale in cs]
    + [np.zeros(50)] * (paras.d - 3 - 1)
    + [coef_fn(0.2)]
).T
paras.beta_GT = 2 * (paras.fourier_basis @ paras.fourier_basis_coefs)

# Print the norms of the non-zero true coefficient columns as a quick check.
beta_GT_norm = np.linalg.norm(paras.beta_GT, axis=0)
print(beta_GT_norm[beta_GT_norm != 0])

# Least-squares projection of the true beta(s) onto each candidate B-spline basis,
# one coefficient matrix per entry of paras.basis_mats.
# lstsq solves the same least-squares problem as inv(B.T @ B) @ B.T @ beta_GT
# without forming an explicit inverse, which is numerically more stable.
paras.Gam_GT_ests = [
    np.linalg.lstsq(basis_mat.numpy(), paras.beta_GT, rcond=None)[0]
    for basis_mat in paras.basis_mats
]

# optimization
# Ground-truth-based bounds on the radius, one per candidate N. They are kept for
# inspection only: computing them requires the true parameters, so they cannot be
# used in a real analysis.
Rmins = [
    2 * (
        np.linalg.norm(paras.Gam_GT_ests[ix] / np.sqrt(paras.can_Ns[ix]), axis=0).sum()
        + np.abs(paras.alp_GT).sum()
    )
    for ix in range(len(paras.can_Ns))
]
# Fixed large radius actually used. The former assignment paras.Rmin = np.max(Rmins)/5
# was overwritten immediately by this line and has been removed as a dead store.
paras.Rmin = 100000
paras.Rfct = 2
paras.stop_cv = 5e-4
paras.max_iter = 2000
paras.beta = 1.2 # default is 1, but will make a lot of iteration non-conv

paras.N_eps = 1e-4 # the stop criteria for Newton-Raphson method, only for logistic model
paras.N_maxit = 100
paras.is_BFGS = "adaptive"

# CV
# Cross-validation settings. The SIS-related entries are copied from the main-fit
# settings as they are at this point: the scalar values will not follow later
# changes to paras.SIS_ratio / paras.SIS_pen / paras.init_noise_sd, while
# cv_SIS_basis_mat refers to the same tensor object as paras.SIS_basis_mat.
paras.cv_is_center = True
paras.cv_SIS_ratio = paras.SIS_ratio
paras.cv_SIS_pen = paras.SIS_pen
paras.cv_SIS_basis_mat = paras.SIS_basis_mat
# NOTE(review): later cells pass their own fold counts (5 and 10) explicitly.
paras.num_cv_fold = 5
paras.cv_init_noise_sd = paras.init_noise_sd


# hypothesis test
# Without loss of generality, we assume the idxs in M are the first m betas.
paras.sel_idx = np.arange(3, paras.d) # M^c set: indices 3 .. d-1; also passed as `sel_idx` to HDFOpt below
paras.M_idxs = np.delete(np.arange(paras.d), paras.sel_idx) # the M set: the remaining indices 0, 1, 2
# Contrast matrices, one list entry per hypothesis; the active one has len(M_idxs) = 3 columns.
# NOTE(review): the active matrix presumably encodes H0: beta_1 = 0 and beta_2 = beta_3 -- confirm against hdf_utils.hypo_test.
paras.Cmats = [
    np.array([[1, 0, 0], [0, 1, -1]])
    #np.array([1, -1]).reshape(1, 2), # alternative for a two-element M: a single 1 x 2 contrast
    #np.eye(len(paras.M_idxs)), # alternative: m x m identity matrix, i.e. all betas in M equal 0
]
# Thresholds for the SVD-based inverses, named after the matrices Q and Psi.
paras.svdinv_eps_Q = 1e-7 # now 0 means inverse, small value like 0.01 means remove small eig vals.
paras.svdinv_eps_Psi = 1e-7 


# saving path
paras.save_dir = RES_ROOT/"test"
# parents=True creates RES_ROOT itself on a fresh checkout; exist_ok=True makes the
# call idempotent and avoids the race in a separate exists()-then-mkdir() check.
paras.save_dir.mkdir(parents=True, exist_ok=True)

[29.74717132]


# Logistic model

In [8]:
# Simulate one dataset with a logistic outcome from the ground-truth parameters.
# The error variance is read from paras.sigma2 rather than a second hard-coded
# literal, so the configuration cell stays the single source of truth.
cur_data = gen_simu_psd_dataset(
    n=paras.n,
    d=paras.d,
    q=paras.q,
    types_=paras.types_,
    alp_GT=paras.alp_GT,
    beta_GT=paras.beta_GT,
    freqs=paras.freqs,
    data_type="logi",
    data_params={"sigma2": paras.sigma2, "err_dist": "t"},
    seed=0,  # fixed seed so the dataset is reproducible
    is_std=paras.is_psd_std,
);



In [9]:
# Tuning parameters for this single fit: penalty `lam`, number of basis functions `N`, radius `R`.
lam, N = 0.5, 8
paras.SIS_ratio = 0.2
paras.lam = lam
paras.N = N
paras.R = 10000
# Pick the pre-computed basis matrix that corresponds to the chosen N.
paras.basis_mat = paras.basis_mats[paras.can_Ns.index(N)]

# Fit the logistic model with the class-based optimizer ...
hdf_fit = HDFOpt(
    is_std=True,
    model_type="logi",
    verbose=2,
    is_orth_basis=False,
    SIS_ratio=paras.SIS_ratio,
    lam=paras.lam,
    N=paras.N,
    sel_idx=paras.sel_idx,
    opt_params={"beta": paras.beta, "R": paras.R},
)
hdf_fit.add_data(X=cur_data.X, Y=cur_data.Y, Z=cur_data.Z)
hdf_fit.fit()
# ... then compute its cross-validated estimates (argument 5; the progress bar shows 5 steps).
hdf_fit.get_cv_est(5);

2023-12-29 19:33:52,323 - optimization_ch.opt - INFO - opt parmas is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.2, 'R': 10000, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'is_full': False}.
2023-12-29 19:33:52,324 - optimization_ch.opt - INFO - SIS parmas is {'SIS_pen': 0.02, 'SIS_basis_N': 8, 'SIS_basis_ord': 4}.
2023-12-29 19:33:52,324 - optimization_ch.opt - INFO - model parmas is {'norminal_sigma2': 1}.
2023-12-29 19:33:52,325 - optimization_ch.opt - INFO - As cov_types is not provided, inferring the continuous covariates.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.66s/it]


In [10]:
# Reference result from the function-based CV routine, run on the same data and
# the same `paras`, to compare against hdf_fit.cv_Y_est.
cv_probs = CV_err_logi_fn(
    data=cur_data,
    input_paras=paras,
    num_cv_fold=5,
    is_prg=1,
    save_paras=False,
);

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:08<00:00,  1.61s/it]


In [11]:
# Sanity check: element-wise gap between the HDFOpt cross-validated estimates and the
# reference CV_err_logi_fn output. Values near zero mean the two implementations agree.
hdf_fit.cv_Y_est - cv_probs

array([ 2.15288423e-06,  1.29239403e-07,  4.35164732e-06,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        7.69124499e-06,  0.00000000e+00,  2.26551896e-06,  0.00000000e+00,
        1.00259516e-05,  2.40196593e-06,  3.10513359e-07,  0.00000000e+00,
        1.49461365e-04,  1.53385871e-05,  0.00000000e+00,  0.00000000e+00,
        5.82569206e-08,  9.08694035e-07, -1.11022302e-16,  0.00000000e+00,
        4.13906838e-06,  0.00000000e+00,  1.08175779e-04,  3.07763978e-06,
        2.02546858e-07,  2.30564560e-06,  2.15935477e-07,  0.00000000e+00,
        9.20919362e-07,  0.00000000e+00,  0.00000000e+00,  1.42439143e-07,
        3.34702082e-05,  3.51702657e-05,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.21356741e-06,  2.72718697e-04, -4.55374807e-05,
       -6.67951091e-04,  3.63880417e-06,  0.00000000e+00,  0.00000000e+00,
       -4.14048632e-05, -3.10691398e-05,  9.67903312e-06,  0.00000000e+00,
        0.00000000e+00,  

# Linear

In [12]:
# Simulate one dataset with a continuous (linear-model) outcome from the ground-truth parameters.
# The error variance is read from paras.sigma2 rather than a second hard-coded
# literal, so the configuration cell stays the single source of truth.
cur_data = gen_simu_psd_dataset(
    n=paras.n,
    d=paras.d,
    q=paras.q,
    types_=paras.types_,
    alp_GT=paras.alp_GT,
    beta_GT=paras.beta_GT,
    freqs=paras.freqs,
    data_type="linear",
    data_params={"sigma2": paras.sigma2, "err_dist": "t"},
    seed=0,  # fixed seed so the dataset is reproducible
    is_std=paras.is_psd_std,
);

In [19]:
# NOTE(review): the stored execution counts (19 here, 17 and 18 on the cells below) show
# this cell was re-run after the comparison; restart the kernel and run all cells before
# relying on the stored outputs.
# Tuning parameters for this single fit: penalty `lam`, number of basis functions `N`, radius `R`.
lam, N = 0.5, 8
paras.SIS_ratio = 0.2
paras.lam = lam
paras.N = N
paras.R = 10000
# Pick the pre-computed basis matrix that corresponds to the chosen N.
paras.basis_mat = paras.basis_mats[paras.can_Ns.index(N)]

# Fit the linear model with the class-based optimizer ...
hdf_fit = HDFOpt(
    is_std=True,
    model_type="linear",
    verbose=2,
    is_orth_basis=False,
    SIS_ratio=paras.SIS_ratio,
    lam=paras.lam,
    N=paras.N,
    sel_idx=paras.sel_idx,
    opt_params={"beta": paras.beta, "R": paras.R, "is_full": True},
)
hdf_fit.add_data(X=cur_data.X, Y=cur_data.Y, Z=cur_data.Z)
hdf_fit.fit()
# ... then compute its cross-validated estimates (argument 10; the progress bar shows 10 steps).
hdf_fit.get_cv_est(10);

2023-12-29 19:36:55,005 - optimization_ch.opt - INFO - opt parmas is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.2, 'R': 10000, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'is_full': True}.
2023-12-29 19:36:55,006 - optimization_ch.opt - INFO - SIS parmas is {'SIS_pen': 0.02, 'SIS_basis_N': 8, 'SIS_basis_ord': 4}.
2023-12-29 19:36:55,007 - optimization_ch.opt - INFO - model parmas is {'norminal_sigma2': 1}.
2023-12-29 19:36:55,009 - optimization_ch.opt - INFO - As cov_types is not provided, inferring the continuous covariates.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  8.28it/s]


In [17]:
# Reference result from the function-based CV routine, run on the same data and
# the same `paras`, to compare against hdf_fit.cv_Y_est.
cv_errs = CV_err_linear_fn(
    data=cur_data,
    input_paras=paras,
    num_cv_fold=10,
    is_prg=1,
    save_paras=False,
);

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  7.69it/s]


In [18]:
# Sanity check: element-wise gap between the reference CV_err_linear_fn output and the
# HDFOpt cross-validated estimates. Values near zero mean the two implementations agree.
cv_errs - hdf_fit.cv_Y_est

array([-5.39568390e-14,  4.28546088e-14, -1.46549439e-14, -8.88178420e-15,
        1.42108547e-14,  1.95399252e-14,  3.99680289e-14, -5.32907052e-15,
       -2.66453526e-15, -7.54951657e-15,  1.72639680e-14,  1.21014310e-14,
       -4.14113188e-14, -7.10542736e-15,  1.15463195e-14, -1.77635684e-14,
        5.68434189e-14, -2.59792188e-14,  4.44089210e-16, -2.39808173e-14,
       -2.08721929e-14,  8.88178420e-16,  3.55271368e-14,  1.77635684e-14,
       -1.06581410e-14,  6.21724894e-15,  1.50990331e-14, -9.76996262e-15,
        9.76996262e-15,  2.22044605e-14,  1.77635684e-14,  7.99360578e-15,
       -2.17603713e-14, -5.77315973e-15, -1.11022302e-14, -8.43769499e-15,
        0.00000000e+00, -1.42108547e-14, -1.15463195e-14,  2.57571742e-14,
       -1.44328993e-14,  1.33226763e-14,  3.10862447e-15, -4.44089210e-16,
        3.55271368e-15, -7.54951657e-15,  7.10542736e-15,  1.77635684e-15,
       -2.66453526e-15, -1.33226763e-14, -1.33226763e-14, -5.77315973e-15,
        1.46549439e-14,  