Test the cross-validation (CV) procedure for the logistic model

In [1]:
import sys
sys.path.append("../../mypkg")

In [2]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from numbers import Number

from easydict import EasyDict as edict
from tqdm import trange, tqdm
from scipy.io import loadmat
from pprint import pprint
import itertools
from scipy.stats import chi2


In [3]:
# This will reload all imports as soon as the code changes
%load_ext autoreload
%autoreload 2

In [52]:
from constants import DATA_ROOT, RES_ROOT, FIG_ROOT, MIDRES_ROOT
from default_paras import def_paras

from hdf_utils.data_gen import gen_covs, gen_simu_psd, gen_simu_ts
from hdf_utils.fns import fn1, fn2, fn3, fn4, fn5, zero_fn
from hdf_utils.fns_sinica import coef_fn, fourier_basis_fn, gen_sini_Xthetas
from hdf_utils.likelihood import obt_lin_tm
from hdf_utils.SIS import SIS_linear, SIS_ballcor
from hdf_utils.utils import gen_lam_seq
from hdf_utils.hypo_test import  MS2idxs, obt_test_stat_simple2
from utils.matrix import col_vec_fn, col_vec2mat_fn, conju_grad, svd_inverse, cholesky_inv
from utils.functions import logit_fn
from utils.misc import save_pkl, load_pkl, bcross_entropy_loss
from splines import obt_bsp_obasis_Rfn, obt_bsp_basis_Rfn_wrapper
from projection import euclidean_proj_l1ball
from optimization.one_step_opt import OneStepOpt
from optimization.cross_validation import CV_err_linear_fn
from optimization.opt import optimization
from penalties.scad_pen import SCAD
from models.linear_model import LinearModel
from models.logistic_model import LogisticModel
from optimization.variable_selection import GIC_fn, GCV_fn

from joblib import Parallel, delayed

In [5]:
# Apply the project-wide matplotlib style sheet.
plt.style.use(FIG_ROOT/"base.mplstyle")
# Make float64 the default floating dtype for newly created tensors.
# `torch.set_default_tensor_type` is deprecated in recent PyTorch releases;
# `torch.set_default_dtype` is the supported way to get the same default.
torch.set_default_dtype(torch.float64)

# Param and fns

## Params

In [10]:
np.random.seed(0)
# B-spline basis constructor used for every candidate N below
# (the commented line is the alternative constructor).
#obt_bsp = obt_bsp_obasis_Rfn
obt_bsp = obt_bsp_basis_Rfn_wrapper
# Start from a copy of the project defaults so `def_paras` itself is not rebound.
paras = edict(def_paras.copy())



# Others
paras.num_rep = 200 
paras.init_noise_sd = -1 # the sd of the noise added to the true value for initial values, if -1, make init 0
paras.SIS_ratio = 0.20 # the ratio to keep with SIS procedure
paras.linear_theta_update="cholesky_inv"

# candidate sets of tuning parameters, only two 
# lambda: penalty term
# N: num of basis
# Alternative lambda grid. It used to be assigned here and then immediately
# overwritten by the next line, so it never took effect; kept for reference only.
#paras.can_lams = [0.01, 0.1, 0.2, 0.3, 0.4, 0.6, 1, 2, 8] # for non
# NOTE(review): the active grid is labelled "for orthogonal basis" while `obt_bsp`
# above is `obt_bsp_basis_Rfn_wrapper` -- confirm this is the intended grid.
paras.can_lams = [0.001, 0.3, 0.6, 0.8, 1, 1.2, 1.4, 2, 16] # for orthogonal basis
paras.can_Ns = [4, 6, 8, 10, 12]


# generating dataset
paras.n = 200 # num of data obs to be generated
paras.npts = 100 # num of pts to evaluate X(s)
paras.d = 120 # num of ROIs
paras.q = 3 # num of other covariates
paras.types_ = ["int", "c", 2]
paras.srho = 0.3 # corr from sinica

# b-spline
paras.x = np.linspace(0, 1, paras.npts)
# One basis matrix per candidate N, evaluated on the grid `paras.x`.
# `paras.ord` is not set in this cell; presumably it comes from `def_paras`.
paras.basis_mats = []
for N in paras.can_Ns:
    paras.basis_mats.append(
        torch.tensor(obt_bsp(paras.x, N, paras.ord)).to(torch.get_default_dtype())
    )

# True parameters
paras.alp_GT = np.array([0, -1, 1])
# fourier basis
cs = [0.0, 0.0, 0.0] # for sinica paper
paras.fourier_basis = fourier_basis_fn(paras.x)[:, :]
# One coefficient vector per functional covariate: the first three are scaled by `cs`
# (all zero here), the middle d-4 are zero vectors, and only the last one is non-zero.
# assumes coef_fn(...) returns a length-50 vector matching np.zeros(50) -- TODO confirm
paras.fourier_basis_coefs = ([cs[0]*coef_fn(0.2), cs[1]*coef_fn(0.2), cs[2]*coef_fn(0.2)] + 
                             [np.zeros(50)] * (paras.d-3-1) +
                             [coef_fn(0.2)]
                             )
paras.fourier_basis_coefs = np.array(paras.fourier_basis_coefs).T 
paras.beta_GT = paras.fourier_basis @ paras.fourier_basis_coefs
# Column-wise norm of each true coefficient function (one value per covariate).
print(np.linalg.norm(paras.beta_GT, axis=0))

# Least-squares projection of beta_GT onto each candidate basis via the
# normal equations (B^T B)^{-1} B^T beta.
paras.Gam_GT_ests = [(np.linalg.inv(basis_mat.numpy().T 
                                  @ basis_mat.numpy()) 
                                  @ basis_mat.numpy().T 
                                  @ paras.beta_GT) 
                     for basis_mat in paras.basis_mats]

# optimization
# not used, to use it, you have to know GT
#Rmins = [(2*(np.linalg.norm(paras.Gam_GT_ests[ix]
#                            /np.sqrt(paras.can_Ns[ix]), axis=0).sum() 
#           + np.abs(paras.alp_GT).sum())) 
#        for ix in range(len(paras.can_Ns))]
#paras.Rmin = np.max(Rmins)
paras.Rmin = 100000
paras.Rfct = 2
paras.stop_cv = 5e-4
paras.max_iter = 10000

paras.NR_eps = 1e-4 # the stop criterion for the Newton-Raphson method, only for logistic model
paras.NR_maxit = 100

# cross validation (num_cv_fold is set once, here; it used to be assigned twice)
paras.cv_is_center = True
paras.cv_SIS_ratio = 0.2
paras.num_cv_fold = 5
paras.cv_init_noise_sd = -1


# hypothesis test
# sel_idx is the M^c set (indices 1..d-1); M is its complement, i.e. only index 0.
paras.sel_idx = np.arange(1, paras.d) # M^c set, 
paras.M_idxs = np.delete(np.arange(paras.d), paras.sel_idx) # the M set
paras.Cmats = [
    np.eye(len(paras.M_idxs)), # m x m identity with m = len(M_idxs) (here m = 1)
]
paras.svdinv_eps_Q = 1e-7 # now 0 means inverse, small value like 0.01 means remove small eig vals.
paras.svdinv_eps_Psi = 1e-7

paras.save_dir = RES_ROOT/"simu_linear_sinica_samebetaX_tmp"
# Create the output folder (and any missing parents); no error if it already exists.
paras.save_dir.mkdir(parents=True, exist_ok=True)

[ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         

## Fns

In [16]:
def _gen_simu_data_all(seed, paras, verbose=False):
    """
    Generate one simulated dataset (functional covariates, scalar covariates,
    binary response) for the logistic model.

    Args:
        seed (int): Seed passed to `np.random.seed` before any data is drawn.
        paras (dict): Dictionary containing the following parameters:
            - srho: corr from sinica, passed to `gen_sini_Xthetas`
            - fourier_basis: The fourier basis for generating X, npts x nbasis
            - n (int): Number of samples.
            - d (int): Number of functional covariates.
            - q (int): Number of scalar covariates.
            - types_ (list): Types for generating covariates, length q;
              entries equal to "c" are treated as continuous and standardized.
            - alp_GT (array-like): Ground truth alpha values, length q.
            - beta_GT (array): Ground truth coefficient functions; its transpose
              is broadcast against the n x d x npts curves.
        verbose(bool): Currently unused.

    Returns:
        all_data (dict): Dictionary containing the following simulated data:
            - X (torch.Tensor): Centered curves, n x d x npts (per the inline comments).
            - Y (torch.Tensor): Binary response drawn from a binomial with 1 trial.
            - Z (torch.Tensor): Covariates, n x q, continuous columns standardized.
            - lin_term (np.ndarray): The linear predictor before the link is applied.
    """
    # Set the float64 default inside the function as well as at notebook level.
    # `torch.set_default_tensor_type` is deprecated; `set_default_dtype` replaces it.
    torch.set_default_dtype(torch.float64)
    np.random.seed(seed)
    _paras = edict(paras.copy())
    # simulated PSD
    assert len(_paras.types_) == _paras.q
    assert len(_paras.alp_GT) == _paras.q
    # Boolean mask over the covariate columns marking the continuous ones.
    con_idxs = [typ == "c" for typ in _paras.types_]

    thetas = gen_sini_Xthetas(_paras.srho, _paras.n, _paras.d)
    simu_curvs = thetas @ _paras.fourier_basis.T  # n x d x npts
    #simu_curvs = np.random.randn(_paras.n, _paras.d, _paras.npts)*10
    simu_covs = gen_covs(_paras.n, _paras.types_)

    # linear term and Y
    # Sum beta_j(s) * X_j(s) over the d covariates, then average over the grid points.
    int_part = np.sum(_paras.beta_GT.T * simu_curvs[:, :, :], axis=1).mean(axis=1)
    cov_part = simu_covs @ _paras.alp_GT

    # linear term
    lin_term = cov_part + int_part

    # `logit_fn` maps the linear term to the success probabilities used below.
    probs = logit_fn(lin_term)

    # Y
    Y = np.random.binomial(1, probs, size=len(probs))

    X_centered = simu_curvs - simu_curvs.mean(axis=0, keepdims=True)
    # this step is not necessary for simulation as I did so in generating data step
    # but for real data, plz do this
    Z_std = simu_covs.copy()
    con_covs = simu_covs[:, con_idxs]
    Z_std[:, con_idxs] = (con_covs - con_covs.mean(axis=0, keepdims=True))/con_covs.std(axis=0, keepdims=True)

    # To torch
    X = torch.Tensor(X_centered) # n x d x npts
    Z = torch.Tensor(Z_std) # n x q
    Y = torch.Tensor(Y)

    all_data = edict()
    all_data.X = X
    all_data.Y = Y
    all_data.Z = Z
    all_data.lin_term = lin_term
    return all_data

In [33]:
def CV_err_logi_fn(data, num_cv_fold, is_prg=False, save_paras=False, input_paras={}):
    """Cross-validated predicted probabilities for the penalized logistic model.

    For each fold the model is fitted on the training part (optionally after
    centering/standardizing with training statistics and a SIS screening step)
    and the fitted model is used to predict the held-out part.

        args:
            data: The dataset including, Y, Z, X
            num_cv_fold: Num of cross validation folds.
            is_prg: If True, show a progress bar over the folds.
            save_paras: If True, also return the merged parameter dict.
            input_paras: other parameters; they override the defaults below.
                Keys read here that have no usable default and so must be supplied:
                basis_mat, types_, sel_idx, d, q, lam, a.
                When cv_init_noise_sd >= 0, alp_GT and Gam_GT_est are also read.
        returns:
            test_Y_prob_all: np.ndarray of held-out predicted probabilities,
                concatenated in fold order. Folds are contiguous slices of
                0..n-1, so the result is aligned with the rows of `data`.
            (test_Y_prob_all, _paras) when save_paras is True.
    """
    
    # default parameter
    _paras = {
        "is_small": True, 
        "Rv": None, 
        "sigma2": 1,
        "basis_mat": None,
        'alpha': 0.9,
        'beta': 1,
        'NR_eps': 1e-05,
        'NR_maxit': 100,
        'stop_cv': 0.0005,
        'max_iter': 2000, 
        'cv_is_center': False,
        'cv_SIS_ratio': 0.2, 
        'cv_init_noise_sd': -1
    }
    _paras = edict(_paras)
    _paras.update(input_paras)
    
    _paras.n = data.Y.shape[0]
    # Read from the merged `_paras`, not from the notebook-global `paras`, so the
    # function uses what the caller passed in and also works outside the notebook.
    _, N = _paras.basis_mat.shape
    
    #We std continuous variable
    con_idxs = [typ == 'c' for typ in _paras.types_]
    
    # The M set (complement of sel_idx) does not depend on the fold.
    M_idxs = np.delete(np.arange(_paras.d), _paras.sel_idx)
    
    num_test = int(_paras.n/num_cv_fold)
    full_idx = np.arange(_paras.n)
    test_Y_prob_all = []
    if is_prg:
        prg_bar = trange(num_cv_fold)
    else:
        prg_bar = range(num_cv_fold)
    for ix in prg_bar:
        test_idx = full_idx[(ix*num_test):(ix*num_test+num_test)]
        if ix == num_cv_fold-1:
            test_idx = full_idx[(ix*num_test):] # including all remaining data
        train_idx = np.delete(full_idx, test_idx)
        
        test_set_X = data.X[test_idx]
        test_set_Y = data.Y[test_idx]
        test_set_Z = data.Z[test_idx]
        
        train_set_X = data.X[train_idx]
        train_set_Y = data.Y[train_idx]
        train_set_Z = data.Z[train_idx]
        
        if _paras.cv_is_center:
            # Training-set statistics are used for both parts so that no
            # information from the held-out fold leaks into the fit.
            train_X_mean = train_set_X.mean(axis=0, keepdims=True)
            train_Z_mean = train_set_Z[:, con_idxs].mean(axis=0, keepdims=True)
            train_Z_std = train_set_Z[:, con_idxs].std(axis=0, keepdims=True)
            
            test_set_X = test_set_X - train_X_mean
            test_set_Z[:, con_idxs] = (test_set_Z[:, con_idxs] - train_Z_mean)/train_Z_std
            
            train_set_X = train_set_X - train_X_mean
            train_set_Z[:, con_idxs] = (train_set_Z[:, con_idxs] - train_Z_mean)/train_Z_std
            
        # SIS step
        if _paras.cv_SIS_ratio < 1:
            keep_idxs, _ = SIS_ballcor(train_set_Y, train_set_X, _paras.sel_idx, _paras.cv_SIS_ratio)
        else:
            keep_idxs = _paras.sel_idx
        # Always keep the M set in addition to what SIS retains.
        keep_idxs = np.sort(np.concatenate([M_idxs, keep_idxs]))
            
        # Positions, within the reduced covariate set, of the penalized covariates.
        sel_idx_SIS = np.where(np.array([keep_idx in _paras.sel_idx for keep_idx in keep_idxs]))[0]
        d_SIS = len(keep_idxs)
        pen = SCAD(lams=_paras.lam, a=_paras.a,  sel_idx=sel_idx_SIS)
        
        train_set_X = train_set_X[:, keep_idxs]
        test_set_X = test_set_X[:, keep_idxs]
        
        # initial value
        if _paras.cv_init_noise_sd < 0:
            alp_init = torch.zeros(_paras.q)
            Gam_init = torch.zeros(N, d_SIS)
            theta_init = torch.cat([alp_init, col_vec_fn(Gam_init)/np.sqrt(N)])
            rhok_init = torch.zeros(d_SIS*N) 
        else:
            # Use the same `cv_init_noise_sd` that gates this branch; the code used to
            # scale by `init_noise_sd`, which has no default here and may be -1.
            # NOTE(review): `Gam_GT_est` must be supplied via input_paras for this branch.
            alp_init = torch.Tensor(_paras.alp_GT) + torch.randn(_paras.q)*_paras.cv_init_noise_sd
            Gam_init = torch.Tensor(_paras.Gam_GT_est[:, keep_idxs]) + torch.randn(N, d_SIS)*_paras.cv_init_noise_sd
            theta_init = torch.cat([alp_init, col_vec_fn(Gam_init)/np.sqrt(N)])
            rhok_init = torch.randn(d_SIS*N)
        
    
        cur_model = LogisticModel(Y=train_set_Y, 
                                  X=train_set_X, 
                                  Z=train_set_Z, 
                                  basis_mat=_paras.basis_mat)
        res = optimization(model=cur_model, 
                           penalty=pen, 
                           inits=[alp_init, Gam_init, theta_init, rhok_init], 
                           is_prg=False,
                           save_paras=False,
                           input_paras=_paras)
        alp_est = res[0].alpk
        gam_est = res[0].Gamk
        # Linear predictor on the held-out fold, then mapped to probabilities.
        test_Y_est = obt_lin_tm(test_set_Z, test_set_X, alp_est, gam_est, _paras.basis_mat)
        test_Y_prob_all.append(logit_fn(test_Y_est.numpy()))
    test_Y_prob_all = np.concatenate(test_Y_prob_all)
    if save_paras:
        return test_Y_prob_all, _paras
    else:
        return test_Y_prob_all


In [34]:
# Draw one simulated dataset (X, Y, Z, lin_term) with a fixed seed.
data = _gen_simu_data_all(seed=0, paras=paras)

In [None]:
# Run the CV once for a single (lambda, N) setting.
paras.lam = 0.1
paras.basis_mat = paras.basis_mats[1]  # basis for the second candidate N in paras.can_Ns
paras.Rv = paras.Rfct * paras.Rmin
# Take the fold count from the configuration instead of repeating the literal 5.
# With save_paras=True, `res` is the tuple (held-out probabilities, merged parameters).
res = CV_err_logi_fn(
    data,
    num_cv_fold=paras.num_cv_fold,
    is_prg=True,
    save_paras=True,
    input_paras=paras,
)

In [59]:
# Binary cross-entropy between the cross-validated probabilities (res[0]) and the labels.
# presumably `bcross_entropy_loss` returns the average over samples -- verify in utils.misc
bcross_entropy_loss(res[0], data.Y.numpy())

0.5551750496718584

In [62]:
# Per-sample binary cross-entropy of the cross-validated probabilities, sorted ascending.
probs = res[0]
y = data.Y.numpy()
# Clip away exact 0/1 so log() stays finite: without it a probability of exactly
# 0 or 1 gives 0*log(0) = nan (or inf) instead of a usable loss value.
probs_clipped = np.clip(probs, 1e-12, 1 - 1e-12)
loss = -(y*np.log(probs_clipped) + (1-y)*np.log(1-probs_clipped))
np.sort(loss)

array([0.00472153, 0.00799697, 0.00816439, 0.01334131, 0.02961459,
       0.03047471, 0.03258976, 0.04519336, 0.0480683 , 0.05107867,
       0.05217791, 0.05267721, 0.05762995, 0.05892226, 0.05908698,
       0.05976103, 0.0615461 , 0.06290772, 0.06449015, 0.0667923 ,
       0.06962228, 0.07071257, 0.07273082, 0.08131482, 0.0863873 ,
       0.08738386, 0.09360772, 0.09438288, 0.09772966, 0.10058314,
       0.10181476, 0.10417931, 0.1069586 , 0.10780388, 0.11326041,
       0.11473099, 0.11635528, 0.1182939 , 0.11859332, 0.12696471,
       0.12844354, 0.1291101 , 0.13034337, 0.1311781 , 0.14417465,
       0.14417658, 0.14466161, 0.14496064, 0.14738013, 0.14868348,
       0.15146195, 0.15415498, 0.15550805, 0.15621734, 0.1568475 ,
       0.16116373, 0.16764059, 0.17229558, 0.1774315 , 0.1775114 ,
       0.18482441, 0.18639062, 0.18979778, 0.19778805, 0.20546656,
       0.21358749, 0.21782472, 0.23426547, 0.23494848, 0.23683035,
       0.23762993, 0.2385621 , 0.24093869, 0.24167679, 0.24511

In [55]:
import scipy
# Pearson correlation between the cross-validated probabilities and the 0/1 labels.
scipy.stats.pearsonr(x=res[0], y=data.Y.numpy())

PearsonRResult(statistic=0.5035674853931045, pvalue=2.952285052997972e-14)

In [12]:
# Shape of the covariate tensor Z; `_gen_simu_data_all` annotates it as n x q.
# NOTE(review): the recorded output shows (200, 1) although paras.q = 3 is configured
# above -- this looks like a stale output from an earlier run; re-run to confirm.
data.Z.shape

torch.Size([200, 1])

In [28]:
# NOTE(review): `ress` is not defined in the visible part of this notebook; it is
# presumably a list of per-repetition result dicts from an earlier run -- confirm.
# Collect the p-value of every repetition.
pvals = np.array([rep_res["pval"] for rep_res in ress])
# Number of covariates whose estimated coefficient column has a non-zero norm.
L0s = np.array([torch.sum(rep_res["est_Gam"].norm(dim=0)!=0) for rep_res in ress])
# Fraction of repetitions with p-value below 0.05, and the average number of non-zero columns.
print(np.mean(pvals<0.05), L0s.mean())


0.02 3.8
