This notebook tests my code.

I am improving my HDF code; this notebook checks that the improved version (`HDF_opt`) reproduces the results of the step-by-step pipeline.

In [1]:
import sys
sys.path.append("../../mypkg")

In [2]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from numbers import Number

from easydict import EasyDict as edict
from tqdm import trange, tqdm
from scipy.io import loadmat
from pprint import pprint
import itertools
from scipy.stats import chi2


In [3]:
# This will reload all imports as soon as the code changes
%load_ext autoreload
%autoreload 2

In [4]:
from constants import DATA_ROOT, RES_ROOT, FIG_ROOT, MIDRES_ROOT
from default_paras import def_paras

from hdf_utils.data_gen import gen_covs, gen_simu_psd
from hdf_utils.fns_sinica import coef_fn, fourier_basis_fn, gen_sini_Xthetas
from hdf_utils.likelihood import obt_lin_tm
from hdf_utils.SIS_ch import SIS_GLIM
from hdf_utils.utils import gen_lam_seq
from hdf_utils.hypo_test import  MS2idxs, obt_test_stat_simple2, obt_test_stat_simple3
from utils.matrix import col_vec_fn, col_vec2mat_fn, conju_grad, svd_inverse, cholesky_inv
from utils.functions import logit_fn
from utils.misc import save_pkl, load_pkl
from splines import obt_bsp_obasis_Rfn, obt_bsp_basis_Rfn_wrapper
from projection import euclidean_proj_l1ball
from optimization.cross_validation import CV_err_linear_fn
from optimization_ch.opt import optimization, HDF_opt
from optimization.opt import optimization as optimization1
from optimization.variable_selection import GIC_fn, GCV_fn
from penalties.scad_pen import SCAD
from models.linear_model import LinearModel
from models.logistic_model import LogisticModel


from joblib import Parallel, delayed

In [5]:
# Project-wide matplotlib style sheet.
plt.style.use(FIG_ROOT/"base.mplstyle")
# Use float64 everywhere; `torch.set_default_tensor_type` is deprecated in
# recent PyTorch releases in favour of `torch.set_default_dtype`.
torch.set_default_dtype(torch.float64)

# Linear model

## Params

In [35]:
# B-spline basis constructor used in this section
# (alternative: obt_bsp_basis_Rfn_wrapper).
obt_bsp = obt_bsp_obasis_Rfn
np.random.seed(0)
paras = edict(def_paras.copy())


# Others
paras.num_rep = 200
paras.init_noise_sd = -1 # sd of the noise added to the true value for initial values; if -1, init at 0
paras.SIS_ratio = 0.2 # the ratio to keep with the SIS procedure
paras.SIS_pen = 0.02
paras.linear_theta_update="cholesky_inv"

# candidate sets of tuning parameters, only two
# lambda: penalty term
# N: num of basis
paras.can_lams = [0.01, 0.1, 0.2, 0.3, 0.4, 0.6, 1, 2, 8]
paras.can_Ns = [4, 6, 8, 10, 12]


# generating dataset
paras.n = 200 # num of data obs to be generated
paras.npts = 100 # num of pts to evaluate X(s)
paras.freqs = np.linspace(2, 45, paras.npts) # freqs
paras.d = 68 # num of ROIs
paras.q = 3 # num of other covariates
paras.sigma2 = 1 # variance of the error
# variance used for estimation, note that the value does not affect any results
# as long as the tuning parameter is chosen properly
paras.norminal_sigma2 = 1
paras.types_ = ["int", "c", 2]
paras.is_std = False

# b-spline: one basis matrix per candidate N, plus a fixed N=8 basis for SIS
paras.x = np.linspace(0, 1, paras.npts)
paras.basis_mats = []
for N in paras.can_Ns:
    paras.basis_mats.append(
        torch.tensor(obt_bsp(paras.x, N, paras.ord)).to(torch.get_default_dtype())
    )
paras.SIS_basis_mat = torch.tensor(obt_bsp(paras.x, 8, paras.ord)).to(torch.get_default_dtype())

# True parameters
paras.alp_GT = np.array([5, -1, 2])
# fourier basis
cs = [0.0, 0.0, 0.0] # for sinica paper
paras.fourier_basis = fourier_basis_fn(paras.x)[:, :]
# With cs all zero, only the last of the d coefficient vectors is nonzero.
paras.fourier_basis_coefs = ([cs[0]*coef_fn(0.2), cs[1]*coef_fn(0.2), cs[2]*coef_fn(0.2)] +
                             [np.zeros(50)] * (paras.d-3-1) +
                             [coef_fn(0.2)]
                             )
paras.fourier_basis_coefs = np.array(paras.fourier_basis_coefs).T
paras.beta_GT = paras.fourier_basis @ paras.fourier_basis_coefs * 2
beta_GT_norm = np.linalg.norm(paras.beta_GT, axis=0)
print(beta_GT_norm[beta_GT_norm!=0])

# Least-squares projection of beta_GT onto each candidate basis.
paras.Gam_GT_ests = [(np.linalg.inv(basis_mat.numpy().T
                                  @ basis_mat.numpy())
                                  @ basis_mat.numpy().T
                                  @ paras.beta_GT)
                     for basis_mat in paras.basis_mats]

# optimization
# Rmins is not used below; to use it, you have to know GT
# (the alternative setting was paras.Rmin = np.max(Rmins)/5).
Rmins = [(2*(np.linalg.norm(paras.Gam_GT_ests[ix]
                            /np.sqrt(paras.can_Ns[ix]), axis=0).sum()
           + np.abs(paras.alp_GT).sum()))
        for ix in range(len(paras.can_Ns))]
paras.Rmin = 100000
paras.Rfct = 2
paras.stop_cv = 5e-4
paras.max_iter = 2000
paras.beta = 1.2 # default is 1, but that leaves a lot of iterations non-converged

# CV
paras.cv_is_center = True
paras.cv_SIS_ratio = paras.SIS_ratio
paras.cv_SIS_pen = paras.SIS_pen
paras.cv_SIS_basis_mat = paras.SIS_basis_mat
paras.num_cv_fold = 5
paras.cv_init_noise_sd = paras.init_noise_sd


# hypothesis test
# without loss of generality, we assume the idxs in M are the first m betas
paras.sel_idx = np.arange(3, paras.d) # M^c set,
paras.M_idxs = np.delete(np.arange(paras.d), paras.sel_idx) # the M set
paras.Cmats = [
    np.array([[1, 0, 0], [0, 1, -1]])
    #np.array([1, -1]).reshape(1, 2), # m x m I matrix, [beta1, beta2] = [0, 0]
    #np.eye(len(paras.M_idxs)), # m x m I matrix, [beta1, beta2] = [0, 0]
]
paras.svdinv_eps_Q = 1e-7 # now 0 means inverse, small value like 0.01 means remove small eig vals.
paras.svdinv_eps_Psi = 1e-7


# saving path (created if missing; safe to re-run)
paras.save_dir = RES_ROOT/"test"
paras.save_dir.mkdir(parents=True, exist_ok=True)

[29.74717132]


## Fns

In [36]:
def _is_exists(tmp_paras):
    """
    Look for a cached PSD file stored under a folder with a larger `n`.

    Folders under MIDRES_ROOT matching the same d, npts and is_std are scanned.
    The folder whose own `n` equals `tmp_paras.n` is skipped (the caller has
    already checked it). Among the rest, the one with the smallest `n` that is
    at least `tmp_paras.n` is picked, and the seed file inside it is tested.

    Args:
    tmp_paras:
        d (int): The value of d in the folder name.
        n (int): The required number of observations.
        npts: The value of npts in the folder name.
        is_std: The value of is_std in the folder name.
        seed (int): The seed value in the file name.

    Returns:
    bool or Path: The file path if the file exists, otherwise False.
    """
    def _folder_n(folder):
        # Folder names look like PSD_d-.._n-.._npts-.._is_std-..; field 2 carries n.
        return int(folder.stem.split("_")[2].split("-")[-1])

    pattern = f"PSD_d-{tmp_paras.d}_n-*npts-{tmp_paras.npts}_is_std-{tmp_paras.is_std}"
    # The folder with exactly n is excluded: its seed file is known to be missing.
    candidates = sorted(
        (folder for folder in MIDRES_ROOT.glob(pattern) if _folder_n(folder) != tmp_paras.n),
        key=_folder_n,
    )
    for folder in candidates:
        if tmp_paras.n <= _folder_n(folder):
            # Only the first (smallest sufficient n) folder is ever inspected.
            path = MIDRES_ROOT/folder/f"seed_{tmp_paras.seed}.pkl"
            if path.exists():
                return path
            return False
    return False
def _get_filename(params):
    keys = ["d", "n", "npts", "is_std"]
    folder_name = 'PSD_'+'_'.join(f"{k}-{params[k]}" for k in keys)
    return folder_name + f'/seed_{params.seed}.pkl'
def _gen_simu_data_all(seed, paras, verbose=False, is_gen=False):
    """
    Simulate one linear-model dataset (X, Y, Z) for a given seed.

    PSD curves are loaded from the cache under MIDRES_ROOT when a usable file
    exists (the exact-n folder first, then whatever `_is_exists` finds);
    otherwise they are generated with `gen_simu_psd` and saved. Gaussian noise
    (sd 10) is then added to the curves, covariates are drawn with `gen_covs`,
    and Y is the linear term plus standardized t(3) errors scaled by
    sqrt(sigma2).

    Args:
        seed (int): Seed for the numpy random number generator.
        paras (dict): Dictionary containing the following parameters:
            - n (int): Number of samples.
            - d (int): Number of dimensions.
            - q (int): Number of covariates.
            - npts (int): Number of evaluation points (used in the cache name).
            - is_std (bool): Passed to `gen_simu_psd`; if False the generated
              curves are centered along the last axis before saving.
            - types_ (list): List of types for generating covariates.
            - alp_GT (list): List of ground truth alpha values.
            - beta_GT (list): List of ground truth beta values.
            - freqs (list): List of frequencies for generating simulated PSD.
            - sigma2 (float): Variance of the noise.
        verbose(bool): Verbose or not
        is_gen(bool): Only for generating or not. If True, only checking or generating X, not return anything.

    Returns:
        None if `is_gen` is True, otherwise
        all_data (dict): Dictionary containing the following simulated data:
            - X (torch.Tensor): Tensor of shape (n, d, npts) containing the simulated PSD, centered over samples.
            - Y (torch.Tensor): Tensor of shape (n,) containing the centered response variable.
            - Z (torch.Tensor): Tensor of shape (n, q) containing the covariates (continuous ones standardized).
            - lin_term (np.ndarray): The noiseless linear term before centering.
    """
    np.random.seed(seed)
    _paras = edict(paras.copy())
    # simulated PSD
    assert len(_paras.types_) == _paras.q
    assert len(_paras.alp_GT) == _paras.q
    con_idxs = [typ =="c" for typ in _paras.types_]
    tmp_paras = edict({
        "seed": seed,
        "n": _paras.n,
        "d": _paras.d,
        "npts": _paras.npts,
        "is_std": _paras.is_std,
    })

    file_path = MIDRES_ROOT/_get_filename(tmp_paras)
    # Prefer the exact-n cache file; fall back to a folder with larger n.
    cached_path = file_path if file_path.exists() else _is_exists(tmp_paras)
    if cached_path:
        if is_gen:
            return None
        simu_curvs = load_pkl(cached_path, verbose=verbose)
    else:
        simu_curvs = gen_simu_psd(_paras.n, _paras.d, _paras.freqs, prior_sd=10, n_jobs=28,
                                  is_prog=False, is_std=_paras.is_std)
        if not _paras.is_std:
            # not std, but center it
            simu_curvs = simu_curvs - simu_curvs.mean(axis=-1, keepdims=True)
        save_pkl(file_path, simu_curvs, verbose=verbose)
        if is_gen:
            return None

    # Restart the random stream so that every draw below is the same whether the
    # curves were loaded from cache or generated in this call (generation may
    # advance the global RNG; that is not visible from here).
    np.random.seed(seed)
    simu_curvs = simu_curvs[:_paras.n]  # a cached file may hold more than n samples
    simu_curvs = simu_curvs + np.random.randn(*simu_curvs.shape)*10 # larger
    #simu_curvs = np.random.randn(_paras.n, _paras.d, _paras.npts)* 10
    simu_covs = gen_covs(_paras.n, _paras.types_)

    # linear term and Y
    # sum over the d curves of beta_j(s) * X_j(s), then average over s
    int_part = np.sum(_paras.beta_GT.T* simu_curvs[:, :, :], axis=1).mean(axis=1)
    cov_part = simu_covs @ _paras.alp_GT

    # linear term
    lin_term = cov_part + int_part

    # Y: t(3) errors, standardized to mean 0 / sd 1, then scaled to sd sqrt(sigma2)
    errs_raw = np.random.standard_t(df=3, size=_paras.n)
    errs = np.sqrt(_paras.sigma2)*(errs_raw - errs_raw.mean())/errs_raw.std()
    Y = lin_term + errs
    #Y = lin_term + np.random.randn(_paras.n)*np.sqrt(_paras.sigma2)

    # center
    X_centered = simu_curvs - simu_curvs.mean(axis=0, keepdims=True)
    Y_centered = Y - Y.mean(axis=0, keepdims=True)
    # this step is not necessary for simulation as I did so in generating data step
    # but for real data, plz do this
    Z_std = simu_covs.copy()
    Z_std[:, con_idxs] = ((simu_covs[:, con_idxs] - simu_covs[:, con_idxs].mean(axis=0, keepdims=True))
                          /simu_covs[:, con_idxs].std(axis=0, keepdims=True))

    # To torch
    all_data = edict()
    all_data.X = torch.Tensor(X_centered) # n x d x npts
    all_data.Y = torch.Tensor(Y_centered)
    all_data.Z = torch.Tensor(Z_std) # n x q
    all_data.lin_term = lin_term
    return all_data


## Simu

In [39]:
# Settings of this single run
seed = 0
N = 8
lam = 0.5
# `torch.set_default_tensor_type` is deprecated in recent PyTorch releases.
torch.set_default_dtype(torch.float64)
np.random.seed(seed)
torch.manual_seed(seed)

_paras = edict(paras.copy())
_paras.Rv = 100000
_paras.R = 100000
_paras.seed = seed
_paras.lam = lam
_paras.N = N
basis_ix = _paras.can_Ns.index(N)
_paras.basis_mat = _paras.basis_mats[basis_ix]
_paras.Gam_GT_est = paras.Gam_GT_ests[basis_ix]
cur_data = _gen_simu_data_all(_paras.seed, _paras)


# do sure independent screening for dim reduction
if _paras.SIS_ratio < 1:
    keep_idxs, _ = SIS_GLIM(Y=cur_data.Y, X=cur_data.X, Z=cur_data.Z,
                             basis_mat=_paras.SIS_basis_mat,
                             keep_ratio=_paras.SIS_ratio,
                             model_type="linear",
                             SIS_pen=_paras.SIS_pen,
                             sel_idx=_paras.sel_idx)
else:
    keep_idxs = _paras.sel_idx
# The M set (indices not in sel_idx) is always kept, together with the screened ones.
M_idxs = np.delete(np.arange(_paras.d), _paras.sel_idx)
_paras.keep_idxs = np.sort(np.concatenate([M_idxs, keep_idxs]))

# Positions, within the kept indices, of those that belong to sel_idx.
_paras.sel_idx_SIS = np.where(np.isin(_paras.keep_idxs, _paras.sel_idx))[0]
_paras.d_SIS = len(_paras.keep_idxs)

cur_data_SIS = edict(cur_data.copy())
cur_data_SIS.X = cur_data.X[:, _paras.keep_idxs, :]


# Initial values: zeros when init_noise_sd < 0, otherwise GT plus Gaussian noise.
if _paras.init_noise_sd < 0:
    alp_init = torch.zeros(_paras.q)
    Gam_init = torch.zeros(_paras.N, _paras.d_SIS)
    rhok_init = torch.zeros(_paras.d_SIS*_paras.N)
else:
    alp_init = torch.Tensor(_paras.alp_GT) + torch.randn(_paras.q)*_paras.init_noise_sd
    Gam_init = torch.Tensor(_paras.Gam_GT_est[:, _paras.keep_idxs]) + torch.randn(_paras.N, _paras.d_SIS)*_paras.init_noise_sd
    rhok_init = torch.randn(_paras.d_SIS*_paras.N)
theta_init = torch.cat([alp_init, col_vec_fn(Gam_init)/np.sqrt(_paras.N)])

model = LinearModel(Y=cur_data_SIS.Y,
                    X=cur_data_SIS.X,
                    Z=cur_data_SIS.Z,
                    basis_mat=_paras.basis_mat,
                    sigma2=_paras.norminal_sigma2)
# 3e0
pen = SCAD(lams=_paras.lam, a=_paras.a,  sel_idx=_paras.sel_idx_SIS)


# Legacy entry point, kept for reference:
#   optimization1(model=model, penalty=pen,
#                 inits=[alp_init, Gam_init, theta_init, rhok_init],
#                 is_prg=True, save_paras=False, input_paras=_paras)
main_res = optimization(model=model,
                        penalty=pen,
                        inits=[Gam_init, theta_init, rhok_init],
                        verbose=2,
                        alpha=_paras.alpha,
                        beta=_paras.beta,
                        R = _paras.R,
                        stop_cv=_paras.stop_cv,
                        max_iter=_paras.max_iter,
                        linear_mat=None,
                        linear_theta_update=_paras.linear_theta_update,
                        inner_loop_verbose=0,
                       )




2023-12-28 17:09:10,771 - optimization_ch.opt - INFO - The paras is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.2, 'R': 100000, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'q': 3, 'N': 8}.
  2%|██▏                                                                                                | 43/2000 [00:00<00:03, 611.13it/s, error=0.00076, GamL0=14, CV=0.0005]


(<optimization_ch.one_step_opt.OneStepOpt at 0x7f0037efb280>, (44, 2000))

In [42]:
# Same fit through the HDF_opt wrapper. Settings are read from _paras so they
# cannot drift from the manual run above (beta was a hard-coded 1.2 before).
main_res2 = HDF_opt(
    X=cur_data.X,
    Y=cur_data.Y,
    Z=cur_data.Z,
    is_std=True,
    SIS_ratio=_paras.SIS_ratio,
    lam=_paras.lam,
    N=_paras.N,
    sel_idx=_paras.sel_idx,
    opt_params={"beta": _paras.beta},
)

2023-12-28 17:36:46,128 - optimization_ch.opt - INFO - opt parmas is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.2, 'R': 100000.0, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'is_full': False}.
2023-12-28 17:36:46,130 - optimization_ch.opt - INFO - SIS parmas is {'SIS_ratio': 0.2, 'SIS_pen': 0.02, 'SIS_basis_N': 8, 'SIS_basis_ord': 4}.
2023-12-28 17:36:46,131 - optimization_ch.opt - INFO - model parmas is {'norminal_sigma2': 1}.
2023-12-28 17:36:46,131 - optimization_ch.opt - INFO - As cov_types is not provided, inferring the continuous covariates.
2023-12-28 17:36:46,202 - optimization_ch.opt - INFO - The paras is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.2, 'R': 100000.0, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'q': 3, 'N': 8}.
  2%|██▏                  

(<optimization_ch.one_step_opt.OneStepOpt at 0x7f00b6842490>, (44, 2000))

In [43]:
# Element-wise gap between the manual pipeline and the HDF_opt wrapper.
Gamk_manual, Gamk_wrapper = main_res[0].Gamk, main_res2[0].Gamk
Gamk_manual - Gamk_wrapper

tensor([[ 1.4211e-14,  4.4270e-15, -4.1217e-15,  0.0000e+00,  1.5266e-15,
          2.5206e-15, -2.4078e-15,  3.5705e-15,  1.7686e-15, -1.9880e-15,
         -1.6202e-15,  3.8997e-15,  3.2301e-15,  0.0000e+00, -6.8695e-16,
         -1.0436e-14],
        [ 3.7748e-15, -1.0408e-16, -5.4123e-16,  0.0000e+00,  3.8580e-15,
          1.4728e-15,  2.6021e-15,  1.2334e-15,  1.0619e-15,  7.0430e-16,
         -6.1409e-16, -3.6984e-15,  1.0825e-15,  0.0000e+00, -2.8172e-15,
         -1.1324e-14],
        [ 6.5547e-15, -4.8984e-15,  1.2768e-15,  0.0000e+00, -1.4710e-15,
         -1.2262e-16,  9.7838e-16, -1.8319e-15,  1.1419e-15,  6.9389e-17,
          7.6675e-16,  1.5613e-15, -1.2351e-15,  0.0000e+00,  2.1684e-16,
         -2.2760e-15],
        [-4.8850e-15, -5.8287e-16, -6.9389e-16,  0.0000e+00,  5.7593e-16,
         -3.4851e-15, -1.7070e-15,  2.8623e-16, -3.3307e-15, -4.7184e-16,
         -1.4710e-15,  1.5717e-15, -7.2858e-17,  0.0000e+00,  1.9221e-15,
          2.2204e-15],
        [-2.9837e-15

# Logi model

In [7]:
np.random.seed(0)
# B-spline basis constructor used in this section
# (alternative: obt_bsp_basis_Rfn_wrapper).
obt_bsp = obt_bsp_obasis_Rfn
paras = edict(def_paras.copy())


# Others
paras.num_rep = 200
paras.init_noise_sd = -1 # sd of the noise added to the true value for initial values; if -1, init at 0
paras.SIS_ratio = 0.20 # the ratio to keep with the SIS procedure
paras.SIS_pen = 0.02
paras.linear_theta_update="cholesky_inv"

# candidate sets of tuning parameters, only two
# lambda: penalty term
# N: num of basis
# Alternative grid, originally tagged "for non" (presumably the non-orthogonal basis):
#   [0.01, 0.1, 0.2, 0.3, 0.4, 0.6, 1, 2, 8]
paras.can_lams = [0.001, 0.3, 0.6, 0.8, 1, 1.2, 1.4, 2, 16] # for orthogonal basis
paras.can_Ns = [4, 6, 8, 10, 12]


# generating dataset
# NOTE(review): paras.freqs is not set here, unlike the linear-model section.
# _gen_simu_data_all1 reads it when PSD curves must be generated, so this cell
# relies on def_paras providing it or on cached curves existing -- confirm.
paras.n = 200 # num of data obs to be generated
paras.npts = 100 # num of pts to evaluate X(s)
paras.d = 68 # num of ROIs
paras.q = 3 # num of other covariates
paras.types_ = ["int", "c", 2]
paras.is_std = False # std PSD or not


# b-spline: one basis matrix per candidate N, plus a fixed N=8 basis for SIS
paras.x = np.linspace(0, 1, paras.npts)
paras.basis_mats = []
for N in paras.can_Ns:
    paras.basis_mats.append(
        torch.tensor(obt_bsp(paras.x, N, paras.ord)).to(torch.get_default_dtype())
    )
paras.SIS_basis_mat = torch.tensor(obt_bsp(paras.x, 8, paras.ord)).to(torch.get_default_dtype())

# True parameters
# The intercept needs to be chosen adaptively for each setting
# so that 1 and 0 are balanced
paras.alp_GT0 = np.array([-1.0, 2.0]) # not include intercept
#paras.alp_GT0 = np.array([-1, 2]) # not include intercept
paras.intercept_cans = np.linspace(-30, 1, 20)
# fourier basis
cs = [0.0, 0.0, 0.0] # for sinica paper
paras.fourier_basis = fourier_basis_fn(paras.x)[:, :]
# With cs all zero, only the last of the d coefficient vectors is nonzero.
paras.fourier_basis_coefs = ([cs[0]*coef_fn(0.2), cs[1]*coef_fn(0.2), cs[2]*coef_fn(0.2)] +
                             [np.zeros(50)] * (paras.d-3-1) +
                             [coef_fn(0.2)]
                             )
paras.fourier_basis_coefs = np.array(paras.fourier_basis_coefs).T
paras.beta_GT = paras.fourier_basis @ paras.fourier_basis_coefs * 2
print(np.linalg.norm(paras.beta_GT, axis=0))

# Least-squares projection of beta_GT onto each candidate basis.
paras.Gam_GT_ests = [(np.linalg.inv(basis_mat.numpy().T
                                  @ basis_mat.numpy())
                                  @ basis_mat.numpy().T
                                  @ paras.beta_GT)
                     for basis_mat in paras.basis_mats]

# optimization
# not used, to use it, you have to know GT
#Rmins = [(2*(np.linalg.norm(paras.Gam_GT_ests[ix]
#                            /np.sqrt(paras.can_Ns[ix]), axis=0).sum()
#           + np.abs(paras.alp_GT0).sum()+10))
#        for ix in range(len(paras.can_Ns))]
#paras.Rmin = np.max(Rmins)
paras.Rmin = 100000
paras.Rfct = 2
paras.stop_cv = 5e-4
paras.max_iter = 2000

paras.N_eps = 1e-4 # the stop criterion for the Newton-Raphson method, only for logistic model
paras.N_maxit = 100
paras.is_BFGS = "adaptive"

# CV
paras.cv_is_center = True
paras.cv_SIS_ratio = paras.SIS_ratio
paras.cv_SIS_pen = paras.SIS_pen
paras.cv_SIS_basis_mat = paras.SIS_basis_mat
paras.num_cv_fold = 5
paras.cv_init_noise_sd = paras.init_noise_sd


# hypothesis test
# without loss of generality, we assume the idxs in M are the first m betas
paras.sel_idx = np.arange(1, paras.d) # M^c set,
paras.M_idxs = np.delete(np.arange(paras.d), paras.sel_idx) # the M set
paras.Cmats = [
    np.eye(len(paras.M_idxs)), # m x m I matrix, [beta1, beta2] = [0, 0]
    #np.array([1, -1]).reshape(1, -1), # beta1-beta2=0
]
paras.svdinv_eps_Q = 1e-7 # now 0 means inverse, small value like 0.01 means remove small eig vals.
paras.svdinv_eps_Psi = 1e-7

# saving path (created if missing; safe to re-run)
paras.save_dir = RES_ROOT/"simu_linear_sinica_samebetaX_tmp"
paras.save_dir.mkdir(parents=True, exist_ok=True)

[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.         29.74717132]


In [8]:
def _is_exists(tmp_paras):
    """
    Look for a cached PSD file stored under a folder with a larger `n`.

    Folders under MIDRES_ROOT matching the same d, npts and is_std are scanned.
    The folder whose own `n` equals `tmp_paras.n` is skipped (the caller has
    already checked it). Among the rest, the one with the smallest `n` that is
    at least `tmp_paras.n` is picked, and the seed file inside it is tested.

    Args:
    tmp_paras:
        d (int): The value of d in the folder name.
        n (int): The required number of observations.
        npts: The value of npts in the folder name.
        is_std: The value of is_std in the folder name.
        seed (int): The seed value in the file name.

    Returns:
    bool or Path: The file path if the file exists, otherwise False.
    """
    def _n_of(folder):
        # Folder names look like PSD_d-.._n-.._npts-.._is_std-..; field 2 carries n.
        return int(folder.stem.split("_")[2].split("-")[-1])

    pattern = f"PSD_d-{tmp_paras.d}_n-*npts-{tmp_paras.npts}_is_std-{tmp_paras.is_std}"
    # The folder with exactly n is excluded: its seed file is known to be missing.
    others = [folder for folder in MIDRES_ROOT.glob(pattern) if _n_of(folder) != tmp_paras.n]
    others.sort(key=_n_of)
    large_enough = [folder for folder in others if tmp_paras.n <= _n_of(folder)]
    if not large_enough:
        return False
    # Only the smallest sufficient folder is inspected.
    path = MIDRES_ROOT/large_enough[0]/f"seed_{tmp_paras.seed}.pkl"
    return path if path.exists() else False
def _get_filename(params):
    keys = ["d", "n", "npts", "is_std"]
    folder_name = 'PSD_'+'_'.join(f"{k}-{params[k]}" for k in keys)
    return folder_name + f'/seed_{params.seed}.pkl'
def _gen_simu_data_all1(seed, paras, verbose=False, is_gen=False):
    """
    Simulate one logistic-model dataset (X, Y, Z) for a given seed.

    PSD curves are loaded from the cache under MIDRES_ROOT when a usable file
    exists (the exact-n folder first, then whatever `_is_exists` finds);
    otherwise they are generated with `gen_simu_psd` and saved. Gaussian noise
    (sd 10) is then added to the curves, covariates are drawn with `gen_covs`,
    and Y is drawn as Bernoulli with probability `logit_fn(lin_term)`.

    Args:
        seed (int): Seed for the numpy random number generator.
        paras (dict): Dictionary containing the following parameters:
            - n (int): Number of samples.
            - d (int): Number of dimensions.
            - q (int): Number of covariates.
            - npts (int): Number of evaluation points (used in the cache name).
            - is_std (bool): Passed to `gen_simu_psd`; if False the generated
              curves are centered along the last axis before saving.
            - types_ (list): List of types for generating covariates.
            - alp_GT (list): List of ground truth alpha values.
            - beta_GT (list): List of ground truth beta values.
            - freqs (list): List of frequencies for generating simulated PSD.
        verbose(bool): Verbose or not
        is_gen(bool): Only for generating or not. If True, only checking or generating X, not return anything.

    Returns:
        None if `is_gen` is True, otherwise
        all_data (dict): Dictionary containing the following simulated data:
            - X (torch.Tensor): Tensor of shape (n, d, npts) containing the simulated PSD, centered over samples.
            - Y (torch.Tensor): Tensor of shape (n,) containing the 0/1 response variable.
            - Z (torch.Tensor): Tensor of shape (n, q) containing the covariates (continuous ones standardized).
            - lin_term, cov_part, int_part (np.ndarray): The linear term and its two parts.
            - simu_curvs (np.ndarray): The noisy curves before centering.
    """
    np.random.seed(seed)
    _paras = edict(paras.copy())
    # simulated PSD
    assert len(_paras.types_) == _paras.q
    assert len(_paras.alp_GT) == _paras.q
    con_idxs = [typ =="c" for typ in _paras.types_]
    tmp_paras = edict({
        "seed": seed,
        "n": _paras.n,
        "d": _paras.d,
        "npts": _paras.npts,
        "is_std": _paras.is_std,
    })

    file_path = MIDRES_ROOT/_get_filename(tmp_paras)
    # Prefer the exact-n cache file; fall back to a folder with larger n.
    cached_path = file_path if file_path.exists() else _is_exists(tmp_paras)
    if cached_path:
        if is_gen:
            return None
        simu_curvs = load_pkl(cached_path, verbose=verbose)
    else:
        simu_curvs = gen_simu_psd(_paras.n, _paras.d, _paras.freqs, prior_sd=10, n_jobs=28,
                                  is_prog=False, is_std=_paras.is_std)
        if not _paras.is_std:
            # not std, but center it
            simu_curvs = simu_curvs - simu_curvs.mean(axis=-1, keepdims=True)
        save_pkl(file_path, simu_curvs, verbose=verbose)
        if is_gen:
            return None

    # Restart the random stream so that every draw below is the same whether the
    # curves were loaded from cache or generated in this call (generation may
    # advance the global RNG; that is not visible from here).
    np.random.seed(seed)
    simu_curvs = simu_curvs[:_paras.n]  # a cached file may hold more than n samples
    simu_curvs = simu_curvs + np.random.randn(*simu_curvs.shape)*10 # larger
    #simu_curvs = np.random.randn(_paras.n, _paras.d, _paras.npts)* 10
    simu_covs = gen_covs(_paras.n, _paras.types_)

    # linear term and Y
    # sum over the d curves of beta_j(s) * X_j(s), then average over s
    int_part = np.sum(_paras.beta_GT.T* simu_curvs[:, :, :], axis=1).mean(axis=1)
    cov_part = simu_covs @ _paras.alp_GT

    # linear term
    lin_term = cov_part + int_part
    probs = logit_fn(lin_term)

    # Y
    Y = np.random.binomial(1, probs, size=len(probs))

    # center
    X_centered = simu_curvs - simu_curvs.mean(axis=0, keepdims=True)
    # this step is not necessary for simulation as I did so in generating data step
    # but for real data, plz do this
    Z_std = simu_covs.copy()
    Z_std[:, con_idxs] = ((simu_covs[:, con_idxs] - simu_covs[:, con_idxs].mean(axis=0, keepdims=True))
                          /simu_covs[:, con_idxs].std(axis=0, keepdims=True))

    # To torch
    all_data = edict()
    all_data.X = torch.Tensor(X_centered) # n x d x npts
    all_data.Y = torch.Tensor(Y)
    all_data.Z = torch.Tensor(Z_std) # n x q
    all_data.lin_term = lin_term
    all_data.cov_part = cov_part
    all_data.int_part = int_part
    all_data.simu_curvs = simu_curvs
    return all_data


In [10]:
# For every candidate intercept, simulate Y for seeds 0..99 and keep all draws.
ress = []
for inte in tqdm(paras.intercept_cans):
    tmp_paras = edict(paras.copy())
    tmp_paras.alp_GT = np.concatenate([[inte], paras.alp_GT0])

    def _run_fn(seed):
        return _gen_simu_data_all1(seed, tmp_paras).Y.numpy()

    with Parallel(n_jobs=20) as parallel:
        res = parallel(delayed(_run_fn)(seed) for seed in range(100))
    ress.append(np.array(res))

# get the intercept: the candidate whose mean of Y is closest to 0.5
Yms = np.array([res.mean() for res in ress])
best_ix = np.argmin(np.abs(Yms-0.5))
intercept = paras.intercept_cans[best_ix]
paras.alp_GT = np.concatenate([[intercept], paras.alp_GT0])
print(f"The mean of Y is {Yms[best_ix]:.3f} under intercept {intercept:.3f}.")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.48it/s]

The mean of Y is 0.435 under intercept -7.158.





In [33]:
# Settings of this single run
seed = 0
N = 8
lam = 0.5
# `torch.set_default_tensor_type` is deprecated in recent PyTorch releases.
torch.set_default_dtype(torch.float64)
np.random.seed(seed)
torch.manual_seed(seed)

_paras = edict(paras.copy())
_paras.Rv = _paras.Rfct * _paras.Rmin
_paras.R = _paras.Rfct * _paras.Rmin
_paras.seed = seed
_paras.lam = lam
_paras.N = N
basis_ix = _paras.can_Ns.index(N)
_paras.basis_mat = _paras.basis_mats[basis_ix]
_paras.Gam_GT_est = paras.Gam_GT_ests[basis_ix]
cur_data = _gen_simu_data_all1(_paras.seed, _paras)


# do sure independent screening for dim reduction
if _paras.SIS_ratio < 1:
    keep_idxs, _ = SIS_GLIM(Y=cur_data.Y, X=cur_data.X, Z=cur_data.Z,
                             basis_mat=_paras.SIS_basis_mat,
                             keep_ratio=_paras.SIS_ratio,
                             model_type="logi",
                             SIS_pen=_paras.SIS_pen,
                             sel_idx=_paras.sel_idx)
else:
    keep_idxs = _paras.sel_idx
# The M set (indices not in sel_idx) is always kept, together with the screened ones.
M_idxs = np.delete(np.arange(_paras.d), _paras.sel_idx)
_paras.keep_idxs = np.sort(np.concatenate([M_idxs, keep_idxs]))

# Positions, within the kept indices, of those that belong to sel_idx.
_paras.sel_idx_SIS = np.where(np.isin(_paras.keep_idxs, _paras.sel_idx))[0]
_paras.d_SIS = len(_paras.keep_idxs)

cur_data_SIS = edict(cur_data.copy())
cur_data_SIS.X = cur_data.X[:, _paras.keep_idxs, :]


# Initial values: zeros when init_noise_sd < 0, otherwise GT plus Gaussian noise.
if _paras.init_noise_sd < 0:
    alp_init = torch.zeros(_paras.q)
    Gam_init = torch.zeros(_paras.N, _paras.d_SIS)
    rhok_init = torch.zeros(_paras.d_SIS*_paras.N)
else:
    alp_init = torch.Tensor(_paras.alp_GT) + torch.randn(_paras.q)*_paras.init_noise_sd
    Gam_init = torch.Tensor(_paras.Gam_GT_est[:, _paras.keep_idxs]) + torch.randn(_paras.N, _paras.d_SIS)*_paras.init_noise_sd
    rhok_init = torch.randn(_paras.d_SIS*_paras.N)
theta_init = torch.cat([alp_init, col_vec_fn(Gam_init)/np.sqrt(_paras.N)])


model = LogisticModel(Y=cur_data_SIS.Y,
                      X=cur_data_SIS.X,
                      Z=cur_data_SIS.Z,
                      basis_mat=_paras.basis_mat)
# 3e0
pen = SCAD(lams=_paras.lam, a=_paras.a,  sel_idx=_paras.sel_idx_SIS)


main_res = optimization(model=model,
                        penalty=pen,
                        inits=[ Gam_init, theta_init, rhok_init],
                        verbose=2,
                        alpha=_paras.alpha,
                        beta=_paras.beta,
                        R = _paras.R,
                        stop_cv=_paras.stop_cv,
                        max_iter=_paras.max_iter,
                        is_BFGS= _paras.is_BFGS,
                        N_eps=_paras.N_eps,
                        N_maxit=_paras.N_maxit,
                        inner_loop_verbose=0,
                       )



2023-12-28 17:44:36,220 - optimization_ch.opt - INFO - The paras is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1, 'R': 200000, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'q': 3, 'N': 8}.
  2%|██▍                                                                                                  | 49/2000 [00:01<00:59, 32.89it/s, error=0.00118, GamL0=2, CV=0.0005]


(<optimization_ch.one_step_opt.OneStepOpt at 0x7f4f2da0bdf0>, (50, 2000))

In [34]:
# Same fit through the HDF_opt wrapper. Settings are read from _paras so they
# cannot drift from the manual run above (beta was a hard-coded 1.0 before).
main_res2 = HDF_opt(
    X=cur_data.X,
    Y=cur_data.Y,
    Z=cur_data.Z,
    is_std=False,
    model_type="logi",
    SIS_ratio=_paras.SIS_ratio,
    lam=_paras.lam,
    N=_paras.N,
    sel_idx=_paras.sel_idx,
    opt_params={"beta": _paras.beta, "R": _paras.R},
)

2023-12-28 17:44:37,739 - optimization_ch.opt - INFO - opt parmas is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.0, 'R': 200000, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'is_full': False}.
2023-12-28 17:44:37,739 - optimization_ch.opt - INFO - SIS parmas is {'SIS_ratio': 0.2, 'SIS_pen': 0.02, 'SIS_basis_N': 8, 'SIS_basis_ord': 4}.
2023-12-28 17:44:37,740 - optimization_ch.opt - INFO - model parmas is {'norminal_sigma2': 1}.
2023-12-28 17:44:37,940 - optimization_ch.opt - INFO - The paras is {'stop_cv': 0.0005, 'max_iter': 2000, 'inner_loop_verbose': 0, 'alpha': 0.9, 'beta': 1.0, 'R': 200000, 'N_eps': 0.0001, 'N_maxit': 100, 'is_BFGS': 'adaptive', 'linear_theta_update': 'cholesky_inv', 'linear_mat': None, 'q': 3, 'N': 8}.


[ 5  7 26 31 35 36 37 42 47 50 58 61 67]


  2%|██▍                                                                                                  | 49/2000 [00:01<00:57, 34.05it/s, error=0.00118, GamL0=2, CV=0.0005]


(<optimization_ch.one_step_opt.OneStepOpt at 0x7f4f2d461fd0>, (50, 2000))

In [36]:
# Element-wise gap between the manual pipeline and the HDF_opt wrapper
# (the former bare `main_res, main_res2` line displayed nothing and was dropped).
main_res[0].Gamk - main_res2[0].Gamk

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])