# Set up the class fundamentals

In [1]:
# import os, sys
# import collections
import numpy as _np
# import matplotlib.markers as markers
# import matplotlib.pyplot as plt
# import timeit
# import collections
# from scipy.linalg import toeplitz, block_diag
# from scipy.stats import median_abs_deviation as mad
# import multiprocessing
# import cProfile
# import itertools
from numba import jit as _jit
from numba import njit as _njit
# import warnings
# warnings.filterwarnings('ignore') # this is just to hide all the warnings
# import rpy2.robjects as robjects
# import matplotlib.pyplot as plt # change font globally to Times
# plt.style.use('ggplot')
# plt.rcParams.update({
#     "text.usetex": True,
#     "font.family": "Times New Roman",
#     "font.sans-serif": ["Times New Roman"],
#     "font.size": 12})

# os.chdir(sys.path[0]) # ensure the working directory is set to the directory of this file

In [2]:
######################################  some SCAD and MCP things  #######################################
@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def soft_thresholding(x, lambda_):
    '''
    Soft-threshold every entry of a ONE-DIMENSIONAL array except the first.

    The first entry (the intercept, beta_0) is passed through untouched so it
    is never penalized; every other entry is shrunk towards zero by lambda_
    and set to zero when its magnitude does not exceed lambda_.
    This is the proximal mapping used for the L1 penalty term later.

    x: 1-D array of coefficients, intercept first.
    lambda_: scalar threshold.
    Returns a new 1-D array of the same length as x.
    '''
    # keep the intercept as a length-1 array so it can be stacked back on
    intercept = _np.array([x[0]])
    penalized = x[1:]
    shrunk = _np.where(
        _np.abs(penalized) > lambda_,
        penalized - _np.sign(penalized) * lambda_, 0)
    return _np.hstack((intercept, shrunk))


# Call soft_thresholding once on 20 uniform random numbers; the result is discarded.
# NOTE(review): presumably a warm-up so numba compiles the function before its
# first real use -- confirm. It also draws from NumPy's global random state.
soft_thresholding(_np.random.rand(20), 3.1)


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def SCAD(x, lambda_, a=3.7):
    '''
    Element-wise SCAD penalty value, with entry 0 forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, a: scalars; Fan and Li suggest a = 3.7.
    Returns an array shaped like x.
    '''
    # the penalty only depends on |x|, so work with magnitudes throughout
    magnitude = _np.abs(x)
    # constant value reached once |x| >= a * lambda_
    flat_level = lambda_**2 * (a + 1) / 2
    # quadratic transition for lambda_ < |x| < a * lambda_
    transition = (2 * a * lambda_ * magnitude - magnitude**2 -
                  lambda_**2) / (2 * (a - 1))
    beyond_linear = _np.where(magnitude < a * lambda_, transition, flat_level)
    penalty = _np.where(magnitude <= lambda_, lambda_ * magnitude,
                        beyond_linear)
    penalty[0] = 0.  # the intercept is NOT penalized
    return penalty


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def SCAD_grad(x, lambda_, a=3.7):
    '''
    Element-wise gradient of the SCAD penalty wrt. x, with entry 0 forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, a: scalars.
    Returns an array shaped like x.
    '''
    # split x into direction and magnitude; the pieces are defined on |x|
    direction = _np.sign(x)
    magnitude = _np.abs(x)
    # linearly decaying slope for lambda_ < |x| < a * lambda_, zero afterwards
    decaying = (a * lambda_ * direction - direction * magnitude) / (a - 1)
    beyond_linear = _np.where(magnitude < a * lambda_, decaying, 0)
    grad = _np.where(magnitude <= lambda_, lambda_ * direction, beyond_linear)
    grad[0] = 0.  # the intercept is NOT penalized
    return grad


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def MCP(x, lambda_, gamma):
    '''
    Element-wise MCP penalty value, with entry 0 forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, gamma: scalars.
    Returns an array shaped like x.
    '''
    # the penalty only depends on |x|
    magnitude = _np.abs(x)
    # value on the curved part, |x| <= gamma * lambda_
    curved = lambda_ * magnitude - magnitude**2 / (2 * gamma)
    # constant value once |x| exceeds gamma * lambda_
    plateau = .5 * gamma * lambda_**2
    penalty = _np.where(magnitude <= gamma * lambda_, curved, plateau)
    penalty[0] = 0.  # the intercept is NOT penalized
    return penalty


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def MCP_grad(x, lambda_, gamma):
    '''
    Element-wise gradient of the MCP penalty wrt. x, with entry 0 forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, gamma: scalars.
    Returns an array shaped like x.
    '''
    inside_cutoff = _np.abs(x) < gamma * lambda_
    # slope on the curved part; the penalty is flat (zero slope) elsewhere
    curved_slope = lambda_ * _np.sign(x) - x / gamma
    grad = _np.where(inside_cutoff, curved_slope, _np.zeros_like(x))
    grad[0] = 0.  # the intercept is NOT penalized
    return grad


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def SCAD_concave(x, lambda_, a=3.7):
    '''
    Element-wise value of the concave part of the SCAD penalty, with entry 0
    forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, a: scalars.
    Returns an array shaped like x.
    '''
    magnitude = _np.abs(x)
    # piece used for lambda_ < |x| < a * lambda_
    transition = (lambda_ * magnitude -
                  (magnitude**2 + lambda_**2) / 2) / (a - 1)
    # piece used for |x| >= a * lambda_
    tail = (a + 1) / 2 * lambda_**2 - lambda_ * magnitude
    beyond_linear = _np.where(magnitude < a * lambda_, transition, tail)
    # the concave part vanishes for |x| <= lambda_
    concave = _np.where(magnitude <= lambda_, 0., beyond_linear)
    concave[0] = 0.  # the intercept is NOT penalized
    return concave


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def SCAD_concave_grad(x, lambda_, a=3.7):
    '''
    Element-wise gradient of the concave part of the SCAD penalty wrt. x,
    with entry 0 forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, a: scalars.
    Returns an array shaped like x.
    '''
    direction = _np.sign(x)
    magnitude = _np.abs(x)
    # slope for lambda_ < |x| < a * lambda_
    transition = (lambda_ * direction - direction * magnitude) / (a - 1)
    # slope for |x| >= a * lambda_
    tail = -lambda_ * direction
    beyond_linear = _np.where(magnitude < a * lambda_, transition, tail)
    # zero slope for |x| <= lambda_
    grad = _np.where(magnitude <= lambda_, 0., beyond_linear)
    grad[0] = 0.  # the intercept is NOT penalized
    return grad


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def MCP_concave(x, lambda_, gamma):
    '''
    Element-wise value of the concave part of the MCP penalty, with entry 0
    forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, gamma: scalars.
    Returns an array shaped like x.
    '''
    # as in MCP, everything is expressed through |x|
    magnitude = _np.abs(x)
    # piece used for |x| <= gamma * lambda_
    curved = -(magnitude**2) / (2 * gamma)
    # piece used for |x| > gamma * lambda_
    tail = (gamma * lambda_**2) / 2 - lambda_ * magnitude
    concave = _np.where(magnitude <= gamma * lambda_, curved, tail)
    concave[0] = 0.  # the intercept is NOT penalized
    return concave


@_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def MCP_concave_grad(x, lambda_, gamma):
    '''
    Element-wise gradient of the concave part of the MCP penalty wrt. x,
    with entry 0 forced to zero.

    x: array of coefficients; entry 0 is treated as the intercept.
    lambda_, gamma: scalars.
    Returns an array shaped like x.
    '''
    inside_cutoff = _np.abs(x) < gamma * lambda_
    curved_slope = -x / gamma  # used where |x| < gamma * lambda_
    tail_slope = -lambda_ * _np.sign(x)  # used everywhere else
    grad = _np.where(inside_cutoff, curved_slope, tail_slope)
    grad[0] = 0.  # the intercept is NOT penalized
    return grad

# Implementation

In [3]:
# @_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def _memmap_update_smooth_grad_convex_logistic(N, p, X, beta_md, y, _dtype,
                                               _order):
    '''
    Update the gradient of the smooth convex objective component.

    Computes X^T (sigmoid(X @ beta_md) - y) / (2 N), reading the design
    matrix from disk through a read-only memory map so it is never loaded
    into memory as a whole.

    N, p: number of rows and columns of the design matrix stored on disk.
    X: file name of the raw design matrix (passed to numpy.memmap).
    beta_md: length-p coefficient vector at which the gradient is evaluated.
    y: length-N outcome vector.
    _dtype: dtype of the values stored in the file.
    _order: "F" if the file is column-major, "C" if it is row-major.

    Returns a length-p array.
    Raises ValueError if _order is neither "F" nor "C" (previously such a
    value silently skipped every read and produced a meaningless gradient).
    '''
    if _order not in ("F", "C"):
        raise ValueError("_order must be 'F' or 'C', got %r" % (_order, ))
    # Map the file ONCE and slice contiguous columns/rows out of it, instead
    # of re-opening the file for every column/row. This also avoids computing
    # byte offsets by hand.
    design = _np.memmap(X,
                        dtype=_dtype,
                        mode='r',
                        shape=(N, p),
                        order=_order)
    # first calculate the linear predictor X @ beta_md, one contiguous slice
    # at a time
    linear_predictor = _np.zeros(N)
    if _order == "F":
        for j in range(p):
            linear_predictor += design[:, j] * beta_md[j]
    else:
        for i in range(N):
            linear_predictor[i] = design[i, :] @ beta_md
    # tanh(z / 2) / 2 + .5 is the logistic sigmoid of z, so this is
    # sigmoid(X @ beta_md) - y
    residual = _np.tanh(linear_predictor / 2.) / 2. - y + .5
    # then calculate X^T @ residual with the same access pattern
    gradient = _np.zeros(p)
    if _order == "F":
        for j in range(p):
            gradient[j] = design[:, j] @ residual
    else:
        for i in range(N):
            gradient += design[i, :] * residual[i]
    return gradient / (2. * N)


# @_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def _memmap_update_smooth_grad_SCAD_logistic(N, p, X, beta_md, y, _lambda, a,
                                             _dtype, _order):
    '''
    Update the gradient of the smooth objective component for SCAD penalty.

    This is the gradient of the convex logistic part (read from the
    memory-mapped design matrix X) plus the gradient of the concave part of
    the SCAD penalty, both evaluated at beta_md.
    '''
    convex_part = _memmap_update_smooth_grad_convex_logistic(N=N,
                                                             p=p,
                                                             X=X,
                                                             beta_md=beta_md,
                                                             y=y,
                                                             _dtype=_dtype,
                                                             _order=_order)
    concave_part = SCAD_concave_grad(x=beta_md, lambda_=_lambda, a=a)
    return convex_part + concave_part


# @_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def _memmap_update_smooth_grad_MCP_logistic(N, p, X, beta_md, y, _lambda,
                                            gamma, _dtype, _order):
    '''
    Update the gradient of the smooth objective component for MCP penalty.

    This is the gradient of the convex logistic part (read from the
    memory-mapped design matrix X) plus the gradient of the concave part of
    the MCP penalty, both evaluated at beta_md.
    '''
    convex_part = _memmap_update_smooth_grad_convex_logistic(N=N,
                                                             p=p,
                                                             X=X,
                                                             beta_md=beta_md,
                                                             y=y,
                                                             _dtype=_dtype,
                                                             _order=_order)
    concave_part = MCP_concave_grad(x=beta_md, lambda_=_lambda, gamma=gamma)
    return convex_part + concave_part


# @_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def memmap_lambda_max_logistic(X, y, N, p, _dtype, _order):
    """
    Calculate the lambda_max, i.e., the minimum lambda to nullify all penalized betas.

    X: file name of the memory-mapped design matrix.
    y: length-N outcome vector.
    N, p: number of rows and columns of the design matrix.
    _dtype, _order: dtype and memory layout ("F" or "C") of the file.

    Returns the largest absolute entry of the convex gradient at beta = 0,
    ignoring entry 0 because the intercept is not penalized.
    """
    grad_at_0 = _memmap_update_smooth_grad_convex_logistic(
        N=N, p=p, X=X, beta_md=_np.zeros(p), y=y, _dtype=_dtype, _order=_order)
    # `_np.inf` rather than the `_np.infty` alias: the alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    lambda_max = _np.linalg.norm(grad_at_0[1:], ord=_np.inf)
    return lambda_max


# @_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def _memmap_marginal_start_logistic(X, y, N, p, _dtype, _order):
    '''
    Default starting point: one marginal slope X_j^T y / (var(X_j) * len(y))
    per column of the memory-mapped design matrix.

    Both layouts give the same values (up to rounding). Columns whose sample
    variance is zero or not finite (e.g. an all-ones intercept column) start
    at 0 instead of producing inf/nan.
    '''
    design = _np.memmap(X,
                        dtype=_dtype,
                        mode='r',
                        shape=(N, p),
                        order=_order)
    cross = _np.zeros(p)  # accumulates X^T y
    if _order == "F":
        # columns are contiguous on disk: handle one column at a time
        variance = _np.zeros(p)
        for j in range(p):
            column = design[:, j]
            cross[j] = column @ y
            variance[j] = _np.var(column, dtype=_np.float64)
    else:
        # rows are contiguous on disk: stream the rows once and update the
        # column variances with Welford's running-moment recurrence
        running_mean = _np.zeros(p)
        squared_dev = _np.zeros(p)
        for i in range(N):
            row = _np.asarray(design[i, :], dtype=_np.float64)
            cross += row * y[i]
            delta = row - running_mean
            running_mean += delta / (i + 1.)
            squared_dev += delta * (row - running_mean)
        variance = squared_dev / N
    start = _np.zeros(p)
    usable = _np.isfinite(variance) & (variance > 0.)
    start[usable] = cross[usable] / variance[usable] / len(y)
    return start


def memmap_UAG_logistic_SCAD_MCP(design_matrix,
                                 outcome,
                                 N,
                                 p,
                                 L_convex,
                                 _dtype,
                                 _order,
                                 beta_0=_np.ones(1),
                                 tol=1e-2,
                                 maxit=500,
                                 _lambda=.5,
                                 penalty="SCAD",
                                 a=3.7,
                                 gamma=2.):
    '''
    Carry out the optimization for penalized logistic for a fixed lambda.

    design_matrix: file name of the memory-mapped design matrix.
    outcome: length-N outcome vector.
    N, p: number of rows and columns of the design matrix.
    L_convex: smoothness constant used for the convex logistic part.
    _dtype, _order: dtype and memory layout ("F" or "C") of the file.
    beta_0: starting coefficients. The default, a length-1 array holding 1,
        is a sentinel meaning "not given"; a marginal-regression start is
        then computed from the data.
    tol: stop once the sup-norm change of beta_md between two consecutive
        iterations is below tol (at least 3 iterations are always run).
    maxit: iteration cap.
    _lambda: penalty level.
    penalty: "SCAD" selects SCAD; any other value selects MCP.
    a, gamma: SCAD and MCP shape parameters.

    Returns (k, beta_md): the number of iterations run and the final iterate.
    Raises ValueError if _order is neither "F" nor "C".
    '''
    X = design_matrix
    y = outcome
    if _order not in ("F", "C"):
        raise ValueError("_order must be 'F' or 'C', got %r" % (_order, ))
    # Only a single-element array equal to 1 counts as "no start given".
    # Without the size check the comparison broadcasts, and a genuine
    # all-ones warm start of length p would be thrown away.
    if _np.size(beta_0) == 1 and _np.all(beta_0 == _np.ones(1)):
        beta = _memmap_marginal_start_logistic(X=X,
                                               y=y,
                                               N=N,
                                               p=p,
                                               _dtype=_dtype,
                                               _order=_order)
    else:
        beta = beta_0

    # The two penalties differ only in the smooth gradient and in the
    # curvature bound of their concave part; pick both once up front.
    if penalty == "SCAD":
        concave_curvature = 1. / (a - 1)

        def penalized_smooth_grad(point):
            return _memmap_update_smooth_grad_SCAD_logistic(N=N,
                                                            p=p,
                                                            X=X,
                                                            beta_md=point,
                                                            y=y,
                                                            _lambda=_lambda,
                                                            a=a,
                                                            _dtype=_dtype,
                                                            _order=_order)
    else:
        concave_curvature = 1. / (gamma)

        def penalized_smooth_grad(point):
            return _memmap_update_smooth_grad_MCP_logistic(N=N,
                                                           p=p,
                                                           X=X,
                                                           beta_md=point,
                                                           y=y,
                                                           _lambda=_lambda,
                                                           gamma=gamma,
                                                           _dtype=_dtype,
                                                           _order=_order)

    # `_np.inf` rather than `_np.infty`: the alias was removed in NumPy 2.0.
    L = _np.linalg.norm(_np.array([L_convex, concave_curvature]),
                        ord=_np.inf)
    opt_beta = .99 / L

    beta_ag = beta.copy()
    beta_md = beta.copy()
    k = 0
    converged = False
    opt_alpha = 1.
    old_speed_norm = 1.
    speed_norm = 1.
    restart_k = 0
    # NOTE: `k <= maxit` lets up to maxit + 1 iterations run; kept as in the
    # original so existing results do not change.
    while ((not converged) or (k < 3)) and k <= maxit:
        k += 1
        if old_speed_norm > speed_norm and k - restart_k >= 3:
            # the iterates slowed down: restart the momentum
            opt_alpha = 1.
            restart_k = k
        else:
            # parameter settings based on minimizing Ghadimi and Lan's rate
            # of convergence error upper bound
            opt_alpha = 2 / (1 + (1 + 4. / opt_alpha**2)**.5)
        opt_lambda = opt_beta / opt_alpha
        beta_md_old = beta_md.copy()
        beta_md = (1 - opt_alpha) * beta_ag + opt_alpha * beta
        step = beta_md - beta_md_old
        old_speed_norm = speed_norm
        speed_norm = _np.linalg.norm(step, ord=2)  # drives the restart rule
        converged = (_np.linalg.norm(step, ord=_np.inf) < tol)
        smooth_grad = penalized_smooth_grad(beta_md)
        beta = soft_thresholding(x=beta - opt_lambda * smooth_grad,
                                 lambda_=opt_lambda * _lambda)
        beta_ag = soft_thresholding(x=beta_md - opt_beta * smooth_grad,
                                    lambda_=opt_beta * _lambda)
    return k, beta_md


# @_jit(nopython=True, cache=True, parallel=True, fastmath=True, nogil=True)
def memmap_solution_path_logistic(design_matrix,
                                  outcome,
                                  lambda_,
                                  L_convex,
                                  N,
                                  p,
                                  beta_0=_np.ones(1),
                                  tol=1e-2,
                                  maxit=500,
                                  penalty="SCAD",
                                  a=3.7,
                                  gamma=2.,
                                  _dtype='float32',
                                  _order="F"):
    '''
    Carry out the optimization for the solution path without the strong rule.

    The penalty levels in lambda_ are fitted in the order given, each fit
    being warm-started from the previous solution; the first fit starts from
    all zeros. The beta_0 argument is accepted but not used by this function.

    Returns an array with one row of p coefficients per entry of lambda_.
    '''
    n_lambda = len(lambda_)
    # row 0 is the all-zero warm start of the first fit; row i + 1 receives
    # the solution for lambda_[i]
    path = _np.zeros((n_lambda + 1, p))
    for step in range(n_lambda):
        fit = memmap_UAG_logistic_SCAD_MCP(design_matrix=design_matrix,
                                           outcome=outcome,
                                           N=N,
                                           p=p,
                                           L_convex=L_convex,
                                           beta_0=path[step, :],
                                           tol=tol,
                                           maxit=maxit,
                                           _lambda=lambda_[step],
                                           penalty=penalty,
                                           a=a,
                                           gamma=gamma,
                                           _dtype=_dtype,
                                           _order=_order)
        # the solver returns (iterations, coefficients); keep the coefficients
        path[step + 1, :] = fit[1]
    # drop the artificial all-zero starting row
    return path[1:, :]

# Testing

In [4]:
import numpy as np
import matplotlib.markers as markers
import matplotlib.pyplot as plt
import timeit
from scipy.linalg import toeplitz, block_diag
from tempfile import mkdtemp
import os.path as path

# This cell simulates a sparse logistic-regression data set, writes the design
# matrix to a memory-mapped file and fits a SCAD solution path on it.
# The statement order matters: every draw below consumes the seeded global RNG.
np.random.seed(0)
N = 1000
SNR = 5.
# 2004 true slopes: the first four are non-zero, the remaining 2000 are zero
true_beta = np.array([.5, -.5, .8, -.8] + [0] * 2000)
# Toeplitz covariance whose (i, j) entry is .5**|i - j|
X_cov = toeplitz(.5**np.arange(2004))
mean = np.zeros(true_beta.shape[0])
X = np.random.multivariate_normal(mean, X_cov, N)
# standardize every column to mean 0 and standard deviation 1
X -= np.mean(X, 0).reshape(1, -1)
X /= np.std(X, 0)
# prepend a column of ones for the intercept
intercept_design_column = np.ones(N).reshape(N, 1)
X_sim = np.concatenate((intercept_design_column, X), 1)
# noise standard deviation implied by the requested signal-to-noise ratio
true_sigma_sim = np.sqrt(true_beta.T @ X_cov @ true_beta / SNR)
true_beta_intercept = np.concatenate((np.array([0.5]), true_beta))
signal = X_sim @ true_beta_intercept + np.random.normal(0, true_sigma_sim, N)
# tanh(s / 2) / 2 + .5 is the logistic sigmoid of s: draw binary outcomes
y_sim = np.random.binomial(1, np.tanh(signal / 2) / 2 + .5)

# largest eigenvalue of X_sim @ X_sim.T (eigvalsh sorts ascending), divided by N
L_convex = 1 / N * (np.linalg.eigvalsh(X_sim @ X_sim.T)[-1])

# write the design matrix to a fresh temporary file as column-major float32
filename = path.join(mkdtemp(), 'newfile.dat')
fp = np.memmap(filename,
               dtype='float32',
               mode='w+',
               shape=(N, 2005),
               order="F")
fp[:] = X_sim[:]
fp.flush()

# fit the path on 60 equally spaced lambdas, from .08 down to .005;
# p=2005 is the intercept column plus the 2004 simulated predictors
fit2 = memmap_solution_path_logistic(design_matrix=filename,
                                     outcome=y_sim,
                                     tol=1e-2,
                                     maxit=500,
                                     lambda_=np.linspace(.005, .08, 60)[::-1],
                                     penalty="SCAD",
                                     a=3.7,
                                     gamma=2.,
                                     N=N,
                                     p=2005,
                                     L_convex=L_convex,
                                     _dtype="float32",
                                     _order="F")
print(fit2)

[[4.18288650e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.19652111e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.20959441e-01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [4.50533892e-01 4.79884580e-02 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.51245653e-01 5.41205710e-02 0.00000000e+00 ... 3.57689290e-04
  0.00000000e+00 0.00000000e+00]
 [4.52171424e-01 5.98629358e-02 0.00000000e+00 ... 1.09629559e-03
  0.00000000e+00 0.00000000e+00]]
