This is based on Fabian Pedregosas [blog](https://fa.bianp.net/blog/2013/logistic-ordinal-regression/) and deprecated [github](https://github.com/fabianp/minirank/blob/master/minirank/logistic.py)

In [3]:
import numpy as np
from numpy.random import default_rng
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from scipy import optimize, linalg, sparse

In [4]:
rng = default_rng(1234)

# Simulation Data

Use a subsample of the simulation data for testing


In [5]:
from masterthesis.data import load_h5ad, load_acinar
data = load_h5ad("/home/julian/Uni/MasterThesis/data/simdata.h5ad")

In [6]:
idx = data.var['Setting'] == "TS"  # extract only the time series samples
sim_X = data.X[:, idx]
sim_y = data.obs["Ordinal_Time_Labels"]

In [7]:
# random subsample genes
y_idx = rng.choice(np.arange(sim_y.size), size=sim_y.size // 2, replace=False)
x_idx = rng.choice(np.arange(sim_X.shape[1]), size=sim_X.shape[1] // 4, replace=False)
sim_X = sim_X[y_idx, :]
sim_X = sim_X[:, x_idx]
sim_y = sim_y[y_idx]

In [8]:
scaler = StandardScaler()
sim_X = scaler.fit_transform(sim_X)

In [9]:
print(sim_X.shape)
print(sim_y.shape)

(286, 1031)
(286,)


# Acinar Data

In [10]:
acinar_ann = load_acinar()

In [11]:
# selected Genes after preprocessing in R
sel_genes = ["REG3A", "AMY2A", "MT2A", "OLFM4",
             "SYCN", "CELA2B", "FGL1", "AMY2B",
             "MT1G", "TM4SF1", "CELA2A", "PDK4", 
             "TACSTD2", "CD44", "PNLIPRP2", "ALB", 
             "ERP27", "LDHA", "REG3G", "CTRL", "CLPS",
             "FOS", "HSPA8", "SERPINA3", "CELA3B", "CRP"]

In [12]:
from sklearn.model_selection import train_test_split

ac_y = np.array([int(x) for x in acinar_ann.obs.donor_age])
ac_label_conv = dict(zip(np.unique(ac_y), range(len(ac_y))))
ac_y = np.array([ac_label_conv[l] for l in ac_y])
k = len(np.unique(ac_y))

ac_X_train, ac_X_test, ac_y_train, ac_y_test = train_test_split(acinar_ann[:,sel_genes].X, ac_y, 
                                                                test_size=0.1, 
                                                                stratify=ac_y,
                                                                random_state=1234)

In [13]:
scaler = StandardScaler()
ac_X_train = scaler.fit_transform(ac_X_train)
ac_X_test = scaler.fit_transform(ac_X_test)

## Gradient DIY

In [85]:
BIG = 1e10
SMALL = 1e-6

def phi(beta, thresholds, X):
    phi = 1 / (1 + np.exp(beta.T @ X - thresholds))
    return phi

def l1_regularizer(beta, l):
    return l * np.sum(np.abs(beta))

# PROBLEM: Is the objective minimized or maximized?? -> Scipy MINIMIZES
# PROBLEM: What if diff is negative? -> Log has issues, but value may be meaningful
# PROBLEM 2: What if diff is too small? -> Log will explode
# PROBLEM 3: Loss of precision?
# log likelihood with regularization as objective function
def objective(params, X, y, k, lamb=0.1, scale_by_y=True):
    
    beta = params[:-k]
    theta = params[-k:]
    thresholds = np.array([theta[i] for i in y])
    thresholds_m1 = np.array([theta[max(0, i-1)] for i in y]) 

    # fit term
    idx = (thresholds > 0)
    diff = np.zeros_like(thresholds)
    phi_i = phi(beta, thresholds, X)
    phi_im1 = phi(beta, thresholds_m1, X)
    
    diff[idx] = (phi_i - phi_im1)[idx]
    diff[~idx] = phi_i[~idx]

    # cut off the difference at a minimum to avoid PROBLEM 2
    #if diff.min() < SMALL:
    #    diff = np.maximum(diff, SMALL)
    
    loss = np.log(diff, out=np.zeros_like(diff), where=diff>0)
    
    loss = np.sum(loss)

    # regularization term
    loss += l1_regularizer(beta, lamb)
    
    # scale the loss with the inverse number of samples to handle PROBLEM 3
    if scale_by_y:
        loss *= (1 / y.size)

    return loss

def objective_grad(params, X, y, k, lamb=0.1, scale_by_y=True):
    beta = params[:-k]
    theta = params[-k:]
    thresholds = np.array([theta[i] for i in y])
    thresholds_m1 = np.array([theta[max(0, i-1)] for i in y])

    # PROBLEM: thresholds_m1 - thresholds can be 0 -> then the denominator becomes 0!
    idx = (thresholds > 0)
    diff = np.zeros_like(thresholds)
    phi_i = phi(beta, thresholds, X)
    phi_im1 = phi(beta, thresholds_m1, X)
    
    diff[idx] = (phi_i - phi_im1)[idx]
    diff[~idx] = phi_i[~idx]
    
    # BETA UPDATE
    beta_grad = np.sum(X * (1 - diff), axis=1)
    
    # derivative of regularizer over beta scaled
    reg_sc = 1 / beta.size
    beta_grad += np.sum(np.abs(beta)) * lamb * reg_sc
        
    # THETA UPDATE
    # first half of the gradient
    e = np.identity(k)
    e_expanded = np.concatenate([e[i] for i in y]).reshape(y.size, k).T
    
    temp = (1 
            - phi(beta, thresholds, X) 
            - np.exp(np.log(1 - (1 - np.exp(diff)))))
    t1 = e_expanded @ temp
    
    # second half of the gradient
    e_m1 = np.identity(k+1)[1:,:-1]  # identity with diagonal shifty up by one
    e_m1_expanded = np.concatenate([e_m1[i] for i in y]).reshape(y.size, k).T
    temp = (1 
            - phi(beta, thresholds_m1, X) 
            - np.exp(np.log(1 - (1 - np.exp(-1 * (diff))))))
    
    t2 = e_m1_expanded @ temp
    
    theta_grad = t1 + t2
    
    # scaling
    if scale_by_y:
        theta_grad = theta_grad * (1/y.size)
        beta_grad = beta_grad * (1/y.size)

    return np.concatenate([beta_grad, theta_grad])

In [71]:
from scipy.optimize import minimize

def train(obj, grad, X, y, lamb=0.9, scaling=False, method="BFGS"):
    # flip such that X -> (genes, cells)
    if X.shape[0] == y.size:
        X = X.T
    
    n_classes = np.unique(y).size
    params = np.zeros(X.shape[0] + n_classes)
    
    m = minimize(obj, params_nm, args=(X, y, n_classes, lamb, scaling), jac=grad, method=method)
    print(m)
    print(m.x)

In [91]:
train(objective, objective_grad, ac_X_train, ac_y_train, lamb=0.1, scaling=True, method="bfgs")

  message: Desired error not necessarily achieved due to precision loss.
  success: False
   status: 2
      fun: -2.845559588887395
        x: [ 1.180e-01  2.398e-02 ...  6.945e-02  1.042e-01]
      nit: 1
      jac: [ 7.387e-05 -5.406e-03 ... -1.175e-01 -4.962e-02]
 hess_inv: [[ 1.007e+00  4.210e-03 ...  6.452e-02  3.172e-02]
            [ 4.210e-03  1.001e+00 ...  1.475e-02  8.905e-03]
            ...
            [ 6.452e-02  1.475e-02 ...  1.073e+00  7.200e-02]
            [ 3.172e-02  8.905e-03 ...  7.200e-02  1.051e+00]]
     nfev: 53
     njev: 49
[ 0.11802656  0.02397907 -0.06843315  0.08290751 -0.13818964 -0.12356077
  0.09176323  0.03192918 -0.06979415  0.06221448 -0.10482473  0.03492728
  0.04466238  0.12867758 -0.1010722   0.03643623 -0.07947574  0.06614485
 -0.01999933 -0.12323364 -0.11978792  0.01439815  0.0726311   0.12125875
 -0.12774478  0.0258153   0.06652065  0.1978226   0.17345736  0.25058858
  0.04736112  0.21428774  0.06944616  0.10423809]


In [44]:
from scipy.optimize import minimize

scale_by_y = False
lamb = 0.9
n_classes = np.unique(ac_y_train).size
params = np.zeros(ac_X_train.shape[1] + n_classes)

# random params
params_rand = rng.normal(0, 0.1, ac_X_train.shape[1] + n_classes)

# params after one run of Nelder-Mead 
params_nm = [2.99552789e-17, -9.12238797e-17,  8.43373503e-17,  6.28923016e-17,
        1.29517064e-16,  2.35470728e-16,  4.64152560e-18, -2.37379072e-17,
        9.33068062e-17, -1.31213310e-16,  8.47381150e-17, -9.14277033e-18,
       -1.10647317e-16, -5.55542955e-17,  4.15674610e-17, -7.28795221e-17,
        1.25000000e-04, -3.91915815e-17,  1.01399459e-16,  2.16536932e-16,
        6.13107712e-15,  1.23078388e-16,  1.16260966e-16,  1.76600273e-17,
        1.35598036e-16,  7.26746491e-17,  1.25000000e-04,  1.30296841e-16,
        1.45441669e-16, -1.72678216e-17, -1.87164751e-16, -4.54465550e-17,
       -1.00882237e-16, -3.29592381e-17]

params_warm =  [0.47077975,  0.        ,  0.37275445,  0.        ,  0.        ,
        -0.2718176 ,  0.01834662,  0.27956828,  0.        ,  0.00906079,
         0.        ,  0.22041753, -0.1378634 ,  0.18726233, -0.08640451,
         0.08268969, -0.26427331,  0.        , -0.28873376, -0.24587408,
         0.        , -0.07656282, -0.02134246,  0.59662556,  0.05288458,
         0.        ,  1.71402649,  0.10353958,  0.        , 
        -0.48783945, -0.62012817, -1.75287372, -2.09708938, 4]

m = minimize(objective, params_nm, args=(ac_X_train.T, ac_y_train, n_classes, lamb, scale_by_y),
         method="BFGS")
print(m)
print(m.x)

  message: Desired error not necessarily achieved due to precision loss.
  success: False
   status: 2
      fun: -2286.831124850377
        x: [ 4.881e-14 -2.820e-06 ...  2.298e-05  4.597e-05]
      nit: 1
      jac: [ 3.971e+01 -1.695e+01 ... -1.393e+06  1.392e+06]
 hess_inv: [[ 1.000e+00  1.123e-11 ... -9.156e-11 -1.831e-10]
            [ 1.123e-11  9.997e-01 ...  2.741e-03  5.482e-03]
            ...
            [-9.156e-11  2.741e-03 ...  9.777e-01 -4.468e-02]
            [-1.831e-10  5.482e-03 ... -4.468e-02  9.106e-01]]
     nfev: 2142
     njev: 61
[ 4.88092707e-14 -2.81968583e-06 -2.98211487e-14  3.40207020e-14
 -2.81968590e-06 -2.81968589e-06 -2.81968580e-06 -5.63937166e-06
 -5.63937170e-06 -2.81968581e-06 -5.63937172e-06  1.37062171e-14
 -2.81968582e-06 -2.81968578e-06 -2.81968588e-06 -5.63937166e-06
  1.19360628e-04  2.68464853e-14 -2.81968585e-06 -2.81968589e-06
 -2.81968588e-06 -5.63937167e-06 -5.63937164e-06 -5.63937162e-06
 -2.81968589e-06 -2.81968583e-06  1.25000000e-0

### Observations:

- Numerical instabilities (nans, infs): Especially BFGS struggles here and usually only executes one iteration
- BFTS struggles to converge, starting from all-0 parameters.
- Nelder-Mead works without derivative

- None of the attempts introduce sparsity!
- Using pretrained parameters (params_warm), all attempted solvers converge successfully

In [95]:
# SGD and our object function

# training hyperparams
eps = 0.0001
dloss = 1
cur_iter = 0
max_iter = 100
learning_rate = 0.01

# objective / grad hyperparams
scaling = False
lamb = 0.1

# data 
X = ac_X_train.T
y = ac_y_train

# weights and data params
n_classes = np.unique(y).size
w = np.zeros(X.shape[0] + n_classes)

losses = [12345]
while (dloss > eps) and (cur_iter <= max_iter):
    cur_iter += 1

    if cur_iter % 50 == 0:
        print("Iter", cur_iter)
        print("Loss:",losses[-1])
        print("Weights:", w)
        
    losses.append(objective(w, X, y, n_classes, lamb, scale_by_y))
    grad = objective_grad(w, X, y, n_classes, lamb, scale_by_y)
    
    w -= learning_rate * grad
    
    dloss = np.abs(losses[-2] - losses[-1])

Iter 0
Loss: 12345
Weights: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Iter 50
Loss: -2093.3756390108347
Weights: [-12.0354645    2.0764663    4.14784512  -5.36996877   9.10163742
  11.27836069  -6.31736415  -2.25134098   5.06562685   3.87631968
   6.5435978   -3.40019983   8.04123862  -2.23485703  10.42959417
  -0.44177098   1.61206272  -1.54336137  -8.43191276   8.46552525
   5.12071742  -0.57297819  -4.89048261  -2.91766763   8.16922764
  -1.55641335  19.00705742  76.45716448  -3.20447185  50.96407881
  36.05508629  12.12291515  30.05583119  21.12252796]


## Use Jax autograd

In [14]:
from jax import grad, jit, vmap
from jax import numpy as jnp

In [15]:
LMBD = 0.5
BIG = 1e10
X = ac_X_train
y = ac_y_train
k = jnp.unique(ac_y_train).size

def jphi(beta, thresholds):
    phi = 1 / (1 + jnp.exp(jnp.dot(beta.T, X) - thresholds))
    return phi

def jl1_regularizer(beta):
    return LMBD * jnp.sum(jnp.abs(beta))

# PROBLEM: Is the objective minimized or maximized?? -> Scipy MINIMIZES
# log likelihood with regularization as objective function
def jobjective(params):
    beta = params[:-k]
    theta = params[-k:]
    thresholds = jnp.array([theta[i] for i in y])
    thresholds_m1 = jnp.array([theta[max(0, i-1)] for i in y]) 

    # fit term
    # PROBLEM: What if diff is negative? -> Log has issues, but value may be meaningful
    idx = (thresholds != 0)
    diff = jnp.zeros_like(thresholds)
    phi_i = jphi(beta, thresholds)
    phi_im1 = jphi(beta, thresholds_m1)
    diff.at[idx].set((phi_i - phi_im1)[idx])
    diff.at[~idx].set(phi_i[~idx])

    loss = jnp.log(diff)
    loss = jnp.nan_to_num(jnp.log(diff), nan=0)
    loss = jnp.sum(loss)

    # regularization term
    loss -= jl1_regularizer(beta)
    
    # scipy MINIMIZES the loss
    loss *= -1

    #print("avg loss:", loss.mean())
    return loss


No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [19]:
from scipy.optimize import minimize

LMBD = 0.01
BIG = 1e10
X = ac_X_train.T
y = ac_y_train
k = jnp.unique(ac_y_train).size
params = np.zeros(ac_X_train.shape[1] + k)

params_ones = np.ones(ac_X_train.shape[1] + k)

params_warm_start = [0.47077975,  0.        ,  0.37275445,  0.        ,  0.        ,
        -0.2718176 ,  0.01834662,  0.27956828,  0.        ,  0.00906079,
         0.        ,  0.22041753, -0.1378634 ,  0.18726233, -0.08640451,
         0.08268969, -0.26427331,  0.        , -0.28873376, -0.24587408,
         0.        , -0.07656282, -0.02134246,  0.59662556,  0.05288458,
         0.        ,  1.71402649,  0.10353958,  0.        , 0,  -0.48783945,
        -0.62012817, -1.75287372, -2.09708938]



objective_jaxder = grad(jobjective)
minimize(jobjective, params, jac=objective_jaxder, method="BFGS")

  alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)
  alpha1 = min(1.0, 1.01*2*(phi0 - old_phi0)/derphi0)


  message: Desired error not necessarily achieved due to precision loss.
  success: False
   status: 2
      fun: inf
        x: [ 0.000e+00  0.000e+00 ...  0.000e+00  0.000e+00]
      nit: 0
      jac: [ 1.000e-02  1.000e-02 ...  0.000e+00  0.000e+00]
 hess_inv: [[1 0 ... 0 0]
            [0 1 ... 0 0]
            ...
            [0 0 ... 1 0]
            [0 0 ... 0 1]]
     nfev: 18
     njev: 7

## Gradient Model v3 

Based on [mord by fabian](https://github.com/fabianp/mord/blob/master/mord/threshold_based.py)

In [None]:
http://localhost:8888/?token=3783460b6309e1f75cfe99b263c1bf4fe50b5d7d8fd5bc66
from sklearn.utils.validation import check_X_y

def sigmoid(t):
    # sigmoid function, 1 / (1 + exp(-t))
    # stable computation
    idx = t > 0
    out = np.zeros_like(t)
    out[idx] = 1. / (1 + np.exp(-t[idx]))
    exp_t = np.exp(t[~idx])
    out[~idx] = exp_t / (1. + exp_t)
    return out


def log_loss(Z):
    # stable computation of the logistic loss
    idx = Z > 0
    out = np.zeros_like(Z)
    out[idx] = np.log(1 + np.exp(-Z[idx]))
    out[~idx] = (-Z[~idx] + np.log(1 + np.exp(Z[~idx])))
    return out


def obj_margin(x0, X, y, alpha, n_class, weights, L, sample_weight):
    """
    Objective function for the general margin-based formulation
    """

    w = x0[:X.shape[1]]
    c = x0[X.shape[1]:]
    theta = L.dot(c)
    loss_fd = weights[y]

    Xw = X.dot(w)
    Alpha = theta[:, None] - Xw  # (n_class - 1, n_samples)
    S = np.sign(np.arange(n_class - 1)[:, None] - y + 0.5)

    err = loss_fd.T * log_loss(S * Alpha)
    if sample_weight is not None:
        err *= sample_weight
    obj = np.sum(err)
    obj += alpha * (np.sum(np.abs(beta)))  # l1 term
    return obj


def grad_margin(x0, X, y, alpha, n_class, weights, L, sample_weight):
    """
    Gradient for the general margin-based formulation
    """

    w = x0[:X.shape[1]]
    c = x0[X.shape[1]:]
    theta = L.dot(c)
    loss_fd = weights[y]

    Xw = X.dot(w)
    Alpha = theta[:, None] - Xw  # (n_class - 1, n_samples)
    S = np.sign(np.arange(n_class - 1)[:, None] - y + 0.5)
    # Alpha[idx] *= -1
    # W[idx.T] *= -1

    Sigma = S * loss_fd.T * sigmoid(-S * Alpha)
    if sample_weight is not None:
        Sigma *= sample_weight

    grad_w = X.T.dot(Sigma.sum(0)) + alpha * w

    grad_theta = -Sigma.sum(1)
    grad_c = L.T.dot(grad_theta)
    return np.concatenate((grad_w, grad_c), axis=0)


def threshold_fit(X, y, alpha, n_class, mode='AE',
                  max_iter=1000, verbose=False, tol=1e-12,
                  sample_weight=None):
    """
    Solve the general threshold-based ordinal regression model
    using the logistic loss as surrogate of the 0-1 loss
    Parameters
    ----------
    mode : string, one of {'AE', '0-1', 'SE'}
    """

    X, y = check_X_y(X, y, accept_sparse='csr')
    unique_y = np.sort(np.unique(y))
    if not np.all(unique_y == np.arange(unique_y.size)):
        raise ValueError(
            'Values in y must be %s, instead got %s'
            % (np.arange(unique_y.size), unique_y))

    n_samples, n_features = X.shape

    # convert from c to theta
    L = np.zeros((n_class - 1, n_class - 1))
    L[np.tril_indices(n_class-1)] = 1.

    if mode == 'AE':
        # loss forward difference
        loss_fd = np.ones((n_class, n_class - 1))
    elif mode == '0-1':
        loss_fd = np.diag(np.ones(n_class - 1)) + \
            np.diag(np.ones(n_class - 2), k=-1)
        loss_fd = np.vstack((loss_fd, np.zeros(n_class - 1)))
        loss_fd[-1, -1] = 1  # border case
    elif mode == 'SE':
        a = np.arange(n_class-1)
        b = np.arange(n_class)
        loss_fd = np.abs((a - b[:, None])**2 - (a - b[:, None]+1)**2)
    else:
        raise NotImplementedError

    x0 = np.zeros(n_features + n_class - 1)
    x0[X.shape[1]:] = np.arange(n_class - 1)
    options = {'maxiter' : max_iter, 'disp': verbose}
    if n_class > 2:
        bounds = [(None, None)] * (n_features + 1) + \
                 [(0, None)] * (n_class - 2)
    else:
        bounds = None

    sol = optimize.minimize(obj_margin, x0, method='L-BFGS-B',
        jac=grad_margin, bounds=bounds, options=options,
        args=(X, y, alpha, n_class, loss_fd, L, sample_weight),
        tol=tol)
    if verbose and not sol.success:
        print(sol.message)

    w, c = sol.x[:X.shape[1]], sol.x[X.shape[1]:]
    theta = L.dot(c)
    return w, theta

threshold_fit(ac_X_train, ac_y_train, 1, np.unique(ac_y_train).size, mode="AE")