This is based on Fabian Pedregosas [blog](https://fa.bianp.net/blog/2013/logistic-ordinal-regression/) and deprecated [github](https://github.com/fabianp/minirank/blob/master/minirank/logistic.py)

In [1]:
import numpy as np
from numpy.random import default_rng
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from scipy import optimize, linalg, sparse

In [2]:
rng = default_rng(1234)

# Simulation Data

Use a subsample of the simulation data for testing


In [3]:
from masterthesis.data import load_simdata, load_acinar
data = load_simdata("/home/julian/Uni/MasterThesis/data/simdata.h5ad")

In [4]:
idx = data.var['Setting'] == "TS"  # extract only the time series samples
sim_X = data.X[:, idx]
sim_y = data.obs["Ordinal_Time_Labels"]

In [5]:
# random subsample genes
y_idx = rng.choice(np.arange(sim_y.size), size=sim_y.size // 2, replace=False)
x_idx = rng.choice(np.arange(sim_X.shape[1]), size=sim_X.shape[1] // 4, replace=False)
sim_X = sim_X[y_idx, :]
sim_X = sim_X[:, x_idx]
sim_y = sim_y[y_idx]

In [6]:
scaler = StandardScaler()
sim_X = scaler.fit_transform(sim_X)

In [7]:
print(sim_X.shape)
print(sim_y.shape)

(286, 1031)
(286,)


# Acinar Data

In [8]:
acinar_ann = load_acinar()

In [9]:
# selected Genes after preprocessing in R
sel_genes = ["REG3A", "AMY2A", "MT2A", "OLFM4",
             "SYCN", "CELA2B", "FGL1", "AMY2B",
             "MT1G", "TM4SF1", "CELA2A", "PDK4", 
             "TACSTD2", "CD44", "PNLIPRP2", "ALB", 
             "ERP27", "LDHA", "REG3G", "CTRL", "CLPS",
             "FOS", "HSPA8", "SERPINA3", "CELA3B", "CRP"]

In [10]:
from sklearn.model_selection import train_test_split

ac_y = np.array([int(x) for x in acinar_ann.obs.donor_age])
ac_label_conv = dict(zip(np.unique(ac_y), range(len(ac_y))))
ac_y = np.array([ac_label_conv[l] for l in ac_y])
k = len(np.unique(ac_y))

ac_X_train, ac_X_test, ac_y_train, ac_y_test = train_test_split(acinar_ann[:,sel_genes].X, ac_y, 
                                                                test_size=0.1, 
                                                                stratify=ac_y,
                                                                random_state=1234)

In [11]:
scaler = StandardScaler()
ac_X_train = scaler.fit_transform(ac_X_train)
ac_X_test = scaler.fit_transform(ac_X_test)

In [12]:
np.log(-3.34098243e-1)

  np.log(-3.34098243e-1)


nan

## Gradient DIY

In [14]:
def phi(beta, thresholds, X):
    phi = 1 / (1 + np.exp(beta.T @ X - thresholds))
    return phi

In [15]:
def l1_regularizer(beta, l):
    return l * np.sum(np.abs(beta))

In [16]:
BIG = 1e10

# PROBLEM: Is the objective minimized or maximized?? -> Scipy MINIMIZES
# log likelihood with regularization as objective function
def objective(params, X, y, k, lamb=0.1):
    beta = params[:-k]
    theta = params[-k:]
    thresholds = np.array([theta[i] for i in y])
    thresholds_m1 = np.array([theta[max(0, i-1)] for i in y]) 

    # fit term
    # PROBLEM: What if diff is negative? -> Log has issues, but value may be meaningful
    diff = phi(beta, thresholds, X) - phi(beta, thresholds_m1, X)
    loss = np.sum(np.log(diff, out=np.zeros_like(diff), where=(diff > 0)))

    # regularization term
    loss -= l1_regularizer(beta, lamb)
    
    # scipy MINIMIZES the loss
    loss *= -1

    #print("avg loss:", loss.mean())
    return loss

In [17]:
def objective_grad(params, X, y, k):
    beta = params[:-k]
    theta = params[-k:]
    thresholds = np.array([theta[i] for i in y])
    thresholds_m1 = np.array([theta[max(0, i-1)] for i in y])
    
    beta_grad = np.sum(X * (1 - phi(beta, thresholds, X) - phi(beta, thresholds_m1, X)),
                       axis=1)
    
    
    # first half of the gradient
    e = np.identity(k)  # identity
    e_expanded = np.concatenate([e[i] for i in y]).reshape(y.size, k).T
    
    #print("Thresholds shape", thresholds.shape)
    #print("E exp shape", e_expanded.shape)
    #print("threshold diff", thresholds_m1 - thresholds)
    
    # PROBLEM: thresholds_m1 - thresholds can be 0 -> then the denominator becomes 0!
    temp = (1 
            - phi(beta, thresholds, X) 
            - np.exp(np.log(1 - (1 - np.exp(thresholds_m1 - thresholds)))))
    t1 = e_expanded @ temp
    
    #print("phi:", phi(beta, thresholds, X))
    #print("exp:", np.exp(thresholds_m1 - thresholds))
    #print("t1", t1)
    #print("temp shape", temp.shape)
    #print("t1 shape", t1.shape)
    
    # second half of the gradient
    e_m1 = np.identity(k+1)[1:,:-1]  # identity with diagonal shifty up by one
    e_m1_expanded = np.concatenate([e_m1[i] for i in y]).reshape(y.size, k).T
    temp = (1 
            - phi(beta, thresholds_m1, X) 
            - np.exp(np.log(1 - (1 - np.exp(-1 * (thresholds_m1 - thresholds))))))
    
    t2 = e_m1_expanded @ temp
    
    #print("t2", t2)
    #print("t2 shape", t2.shape)
    theta_grad = t1 + t2

    #print("theta_grad", theta_grad)

    return np.concatenate([beta_grad, theta_grad])

In [18]:
y = [1,2,3,3,0,0,0,1]
e = np.identity(4)
foo = np.concatenate([e[i] for i in y]).reshape(len(y), 4).T
foo

array([[0., 0., 0., 0., 1., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 1., 0., 0., 0., 0.]])

In [19]:
X = np.arange(10).reshape(2, 5)
y = np.array([1,2,0,3,4])
beta = np.arange(2)
theta = np.array([-0.5, -0.1, 0.0123, 0.56, 1])
k = theta.size
params = np.concatenate([beta, theta])
print(X.shape)
print(y.shape)
print(objective_grad(params, X, y, k))
print(objective(params, X, y, theta.size, 0.1))

(2, 5)
(5,)
[ 9.98806316e+00  3.49007917e+01 -5.52778637e-04  3.23067374e-01
 -3.92174067e-01  3.00048424e-01 -3.73982387e-01]
31.892256931594275


In [20]:
from scipy.optimize import minimize
k = np.unique(ac_y_train).size
params = np.zeros(ac_X_train.shape[1] + k)
minimize(objective, params, args=(ac_X_train.T, ac_y_train, k), jac=objective_grad, method="BFGS")

      fun: -0.0
 hess_inv: array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])
      jac: array([  0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,
         0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. ,   0. , -24.5,
       -73. , -64. , -49. , -38. , -38.5, -42. , -24. ])
  message: 'Desired error not necessarily achieved due to precision loss.'
     nfev: 113
      nit: 0
     njev: 101
   status: 2
  success: False
        x: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Use Jax autograd

In [24]:
from jax import grad, jit, vmap
from jax import numpy as jnp

In [61]:
LMBD = 0.1
BIG = 1e10
X = ac_X_train
y = ac_y_train
k = jnp.unique(ac_y_train).size

def jphi(beta, thresholds):
    phi = 1 / (1 + jnp.exp(jnp.dot(beta.T, X) - thresholds))
    return phi

def jl1_regularizer(beta):
    return LMBD * jnp.sum(jnp.abs(beta))

# PROBLEM: Is the objective minimized or maximized?? -> Scipy MINIMIZES
# log likelihood with regularization as objective function
def jobjective(params):
    beta = params[:-k]
    theta = params[-k:]

    thresholds = jnp.array([theta[i] for i in y])
    thresholds_m1 = jnp.array([theta[max(0, i-1)] for i in y]) 

    # fit term
    # PROBLEM: What if diff is negative? -> Log has issues, but value may be meaningful
    diff = jphi(beta, thresholds) - jphi(beta, thresholds_m1)
    loss = jnp.nan_to_num(jnp.log(diff), nan=0)
    loss = jnp.sum(loss)

    # regularization term
    loss -= jl1_regularizer(beta)
    
    # scipy MINIMIZES the loss
    loss *= -1

    #print("avg loss:", loss.mean())
    return loss

In [64]:
LMBD = 0.1
BIG = 1e10
X = ac_X_train.T
y = ac_y_train
k = jnp.unique(ac_y_train).size
params = np.zeros(ac_X_train.shape[1] + k)

objective_jaxder = grad(jobjective)
minimize(jobjective, params, jac=objective_jaxder, method="BFGS")

      fun: inf
 hess_inv: array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1]])
      jac: array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan, nan], dtype=float32)
  message: 'NaN result encountered.'
     nfev: 1
      nit: 0
     njev: 1
   status: 3
  success: False
        x: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## Gradient Model v3 

Based on [mord by fabian](https://github.com/fabianp/mord/blob/master/mord/threshold_based.py)

In [None]:

def sigmoid(t):
    # sigmoid function, 1 / (1 + exp(-t))
    # stable computation
    idx = t > 0
    out = np.zeros_like(t)
    out[idx] = 1. / (1 + np.exp(-t[idx]))
    exp_t = np.exp(t[~idx])
    out[~idx] = exp_t / (1. + exp_t)
    return out


def log_loss(Z):
    # stable computation of the logistic loss
    idx = Z > 0
    out = np.zeros_like(Z)
    out[idx] = np.log(1 + np.exp(-Z[idx]))
    out[~idx] = (-Z[~idx] + np.log(1 + np.exp(Z[~idx])))
    return out


def obj_margin(x0, X, y, alpha, n_class, weights, L, sample_weight):
    """
    Objective function for the general margin-based formulation
    """

    w = x0[:X.shape[1]]
    c = x0[X.shape[1]:]
    theta = L.dot(c)
    loss_fd = weights[y]

    Xw = X.dot(w)
    Alpha = theta[:, None] - Xw  # (n_class - 1, n_samples)
    S = np.sign(np.arange(n_class - 1)[:, None] - y + 0.5)

    err = loss_fd.T * log_loss(S * Alpha)
    if sample_weight is not None:
        err *= sample_weight
    obj = np.sum(err)
    obj += alpha * 0.5 * (np.dot(w, w))
    return obj


def grad_margin(x0, X, y, alpha, n_class, weights, L, sample_weight):
    """
    Gradient for the general margin-based formulation
    """

    w = x0[:X.shape[1]]
    c = x0[X.shape[1]:]
    theta = L.dot(c)
    loss_fd = weights[y]

    Xw = X.dot(w)
    Alpha = theta[:, None] - Xw  # (n_class - 1, n_samples)
    S = np.sign(np.arange(n_class - 1)[:, None] - y + 0.5)
    # Alpha[idx] *= -1
    # W[idx.T] *= -1

    Sigma = S * loss_fd.T * sigmoid(-S * Alpha)
    if sample_weight is not None:
        Sigma *= sample_weight

    grad_w = X.T.dot(Sigma.sum(0)) + alpha * w

    grad_theta = -Sigma.sum(1)
    grad_c = L.T.dot(grad_theta)
    return np.concatenate((grad_w, grad_c), axis=0)


def threshold_fit(X, y, alpha, n_class, mode='AE',
                  max_iter=1000, verbose=False, tol=1e-12,
                  sample_weight=None):
    """
    Solve the general threshold-based ordinal regression model
    using the logistic loss as surrogate of the 0-1 loss
    Parameters
    ----------
    mode : string, one of {'AE', '0-1', 'SE'}
    """

    X, y = check_X_y(X, y, accept_sparse='csr')
    unique_y = np.sort(np.unique(y))
    if not np.all(unique_y == np.arange(unique_y.size)):
        raise ValueError(
            'Values in y must be %s, instead got %s'
            % (np.arange(unique_y.size), unique_y))

    n_samples, n_features = X.shape

    # convert from c to theta
    L = np.zeros((n_class - 1, n_class - 1))
    L[np.tril_indices(n_class-1)] = 1.

    if mode == 'AE':
        # loss forward difference
        loss_fd = np.ones((n_class, n_class - 1))
    elif mode == '0-1':
        loss_fd = np.diag(np.ones(n_class - 1)) + \
            np.diag(np.ones(n_class - 2), k=-1)
        loss_fd = np.vstack((loss_fd, np.zeros(n_class - 1)))
        loss_fd[-1, -1] = 1  # border case
    elif mode == 'SE':
        a = np.arange(n_class-1)
        b = np.arange(n_class)
        loss_fd = np.abs((a - b[:, None])**2 - (a - b[:, None]+1)**2)
    else:
        raise NotImplementedError

    x0 = np.zeros(n_features + n_class - 1)
    x0[X.shape[1]:] = np.arange(n_class - 1)
    options = {'maxiter' : max_iter, 'disp': verbose}
    if n_class > 2:
        bounds = [(None, None)] * (n_features + 1) + \
                 [(0, None)] * (n_class - 2)
    else:
        bounds = None

    sol = optimize.minimize(obj_margin, x0, method='L-BFGS-B',
        jac=grad_margin, bounds=bounds, options=options,
        args=(X, y, alpha, n_class, loss_fd, L, sample_weight),
        tol=tol)
    if verbose and not sol.success:
        print(sol.message)

    w, c = sol.x[:X.shape[1]], sol.x[X.shape[1]:]
    theta = L.dot(c)
    return w, theta