# Baseline Model in python

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.special import logit
from sklearn.metrics import accuracy_score, log_loss as cross_entropy_score

In [2]:
from numpy.random import default_rng
rng = default_rng(seed=1234321)

In [3]:
from masterthesis.data import load_acinar, data_dir
# load the python AnnData object
acinar_ann = load_acinar()

In [4]:
print("Dataset shape", acinar_ann.X.shape)
print("First gene:", acinar_ann.X[:,0].shape)

In [5]:
# Access Gene Names
acinar_ann.var_names

In [6]:
# select one gene
acinar_ann[:, acinar_ann.var_names.str.match("A1CF")]

### R Gene selection and Test Split

In [7]:
# Row indices of the held-out samples, produced by sampling in R with seed 1234.
# NOTE(review): R indexes from 1 while the indexing below is 0-based - confirm
# whether these values need a -1 shift before they are used on the Python side.
test_idx = [284, 336, 406, 101, 111, 393, 133, 400, 388, 98, 103, 214, 90, 326, 79, 372, 270, 382, 184, 62, 4, 403, 149, 40, 212, 195, 93, 122, 66, 175, 379, 304, 108, 131, 343, 41, 115, 228, 328, 298, 299]
# Every remaining row index; built through a set difference, so the order of
# the resulting list is not guaranteed.
train_idx = list(set(range(acinar_ann.X.shape[0])) - set(test_idx))

In [8]:
# selected Genes after preprocessing in R
sel_genes = ["REG3A", "AMY2A", "MT2A", "OLFM4",
             "SYCN", "CELA2B", "FGL1", "AMY2B",
             "MT1G", "TM4SF1", "CELA2A", "PDK4", 
             "TACSTD2", "CD44", "PNLIPRP2", "ALB", 
             "ERP27", "LDHA", "REG3G", "CTRL", "CLPS",
             "FOS", "HSPA8", "SERPINA3", "CELA3B", "CRP" ]

In [9]:
from sklearn.model_selection import train_test_split

# Donor age (cast to int) is the ordinal target; k is the number of distinct ages.
y = np.array(list(map(int, acinar_ann.obs.donor_age)))
k = np.unique(y).size

# Stratified 90/10 split on the pre-selected genes.
X_selected = acinar_ann[:,sel_genes].X
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.1,
    stratify=y,
    random_state=1234,
)

In [10]:
# Old approach based on the indexes from R (kept for reference only, superseded
# by the train_test_split cell above).
# NOTE(review): the two X lines below are crossed - X_train is built from
# test_idx and X_test from train_idx. Swap them before re-enabling this cell.
#y_train = y[train_idx]
#y_test = y[test_idx]

#X_train = acinar_ann[test_idx, sel_genes].X
#X_test = acinar_ann[train_idx, sel_genes].X

In [11]:
# Standardise every gene to zero mean / unit variance.
# The scaler is fitted on the training data only; the test data is transformed
# with the *training* statistics so that no information from the test set
# leaks into preprocessing and both sets live on the same scale.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
print("Test X:", X_test.shape)
print("Test y:", y_test.shape)
print("Train X:", X_train.shape)
print("Train y:", y_train.shape)

## Model 1: mord

**Result: It was not possible to achieve the necessary sparsity with this model, because only L2 regularization is supported. The thresholds do not seem entirely plausible.**

[Reference 1](https://medium.datadriveninvestor.com/logistic-regression-simple-multinomial-and-ordinal-b2bc886bb974) [Reference 2](https://pythonhosted.org/mord/)

In [13]:
from mord import LogisticAT, LogisticIT

In [14]:
# Transform y into a series of subsequent labels [0,1,2 ...]
# Map every distinct label onto consecutive integer codes 0, 1, 2, ...
labels_sorted = np.unique(y)
transf = {label: code
          for label, code in zip(labels_sorted, np.arange(0, len(labels_sorted)))}

y_train_trans = np.array([transf[label] for label in y_train])
y_test_trans = np.array([transf[label] for label in y_test])

# Permutations that sort the samples by label, such that yi <= yi+1
train_reorder = y_train_trans.argsort()
test_reorder = y_test_trans.argsort()

In [15]:
# All-threshold ordinal logistic model with regularisation strength 0.1;
# the cell displays its score on the held-out data.
regressor = LogisticAT(alpha=0.1, verbose=0)
regressor.fit(X_train, y_train_trans)
regressor.score(X_test, y_test_trans)

In [16]:
# Immediate-threshold ordinal logistic model without regularisation, fitted
# and scored on the label-sorted samples.
regressor = LogisticIT(alpha=0, verbose=0)
X_train_sorted, y_train_sorted = X_train[train_reorder], y_train_trans[train_reorder]
X_test_sorted, y_test_sorted = X_test[test_reorder], y_test_trans[test_reorder]
regressor.fit(X_train_sorted, y_train_sorted)
regressor.score(X_test_sorted, y_test_sorted)

In [17]:
regressor.coef_

In [18]:
regressor.theta_

## Model 2: Ordered Multinomial Regression (statsmodels) 

**Result: Introducing sparsity, or even using any regularizer seems to not be supported, or at least I didn't find a way.**

In [None]:
import scipy.stats as stats
from statsmodels.miscmodels.ordinal_model import OrderedModel

In [None]:
# Ordered logit model on the standardised training data, optimised with BFGS.
mod_prob = OrderedModel(endog=y_train, exog=X_train, distr='logit')
res_prob = mod_prob.fit(method='bfgs')

# Show only the header table of the fit summary.
res_prob.summary().tables[0]

In [None]:
res_prob.params

In [None]:
# Class probabilities for the test samples; the most probable column is the
# predicted (re-coded) label.
predictions = res_prob.model.predict(res_prob.params, exog=X_test)
predicted_labels = predictions.argmax(1)

print("Predictions:", predicted_labels)
print("Ground Truth:", y_test_trans)
print("Cross Entropy:", cross_entropy_score(y_test, predictions, labels=np.unique(y)))
# accuracy is symmetric in its arguments; y_true is passed first by convention
print("Accuracy:", accuracy_score(y_test_trans, predicted_labels))

## Model 3: Multinomial Regression (sklearn)

**Results:** 
- Introduction of sparsity worked well with the l1 penalty
- Prediction results were on par with the other methods
- However, since this is solved as a multinomial regression problem, one set of parameters is fit for each prediction class. This introduces a new problem for selection of parameters: The weights have to be aggregated, which has eliminated the sparsity with the attempted approaches.

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised multinomial logistic regression (one weight vector per class).
sk_model_kwargs = dict(
    penalty="l1",
    multi_class="multinomial",  # "auto", "ovr", "multinomial"
    solver="saga",
    random_state=12345,
)
sk_model = LogisticRegression(**sk_model_kwargs)
sk_model.fit(X_train, y_train)

In [None]:
print("Model coefficients shape:", sk_model.coef_.shape)
print("Train score:", sk_model.score(X_train, y_train))
print("Test score:", sk_model.score(X_test, y_test))

#### Aggregation of Weights from multinomial model


In [None]:
# Threshold below which an aggregated gene weight is treated as zero.
# The value is arbitrary; there is no principled way of choosing it.
sparsity_threshold = 0.0001

# coef_ has one row per class and one column per gene: collapsing axis 0
# yields a single aggregated weight per gene.
n_classes_fit = sk_model.coef_.shape[0]

skl_mm_added = sk_model.coef_.sum(axis=0)
print("Added weights from Muli-Class model")
print("sparsity:", np.count_nonzero(np.abs(skl_mm_added) < sparsity_threshold))

# The sum runs over the classes, so the mean has to be divided by the number
# of classes (shape[0]) - not by the number of genes (shape[1]).
skl_mm_mean = skl_mm_added / n_classes_fit
print("Average weights from Muli-Class model")
print("sparsity:", np.count_nonzero(np.abs(skl_mm_mean) < sparsity_threshold))

## Models 4-6: Binary LogisticRegression (sklearn)

### Convert the data

- The labels are converted to binary, such that the step from 0 to 1 corresponds to the change from label $l_i$ to $l_{i+1}$. For $k$ distinct labels, $k-1$ copies of the label vector are concatenated (one per threshold $j = 1, \dots, k-1$) such that in copy $j$ the labels $l_i$ with $i<j$ are converted to 0 and the labels $l_i$ with $i\ge j$ are converted to 1.
- The count matrix is extended with copies of itself, to fit the converted label vector FOR NOW. For big problems, it could suffice to have just one label vector and perform an iterative training.
- To train the thresholds, $k-1$ columns are added to the count matrix and initialized to zero. Each column represents one threshold and is set to 1 exactly in the rows of the copy that belongs to that threshold.

In [None]:
def to_bin_y(y_orig, classes=None):
    """Binarise ordinal labels into stacked "label >= threshold" indicators.

    For the sorted distinct classes c_0 < c_1 < ... < c_{k-1}, one block of
    len(y_orig) indicators is produced per threshold c_1 ... c_{k-1} and the
    blocks are concatenated in threshold order.

    Parameters
    ----------
    y_orig : array-like of shape (n,)
        Original ordinal labels.
    classes : array-like, optional
        The complete set of labels that defines the thresholds. Defaults to
        the distinct values found in ``y_orig``. Pass the labels of the full
        data set when binarising a subset (e.g. a test split) so that the
        number and meaning of the blocks stay identical even if a class is
        missing from the subset.

    Returns
    -------
    numpy.ndarray of int, shape (n * (k - 1),)
        0/1 indicators; empty if fewer than two classes are available.
    """
    y_arr = np.asarray(y_orig)
    y_classes = np.unique(y_arr if classes is None else classes)

    # With a single class there is no threshold to learn.
    if len(y_classes) < 2:
        return np.array([], dtype=int)

    # The lowest class is skipped: "label >= lowest class" is always true.
    blocks = [(y_arr >= thresh).astype(int) for thresh in y_classes[1:]]
    return np.concatenate(blocks)

In [None]:
def to_bin_X(X_orig, k):
    """Expand a feature matrix for the binarised ordinal problem.

    The matrix is stacked (k - 1) times on top of itself and (k - 1) integer
    indicator columns are appended on the right: indicator column j is 1 for
    every row of the j-th stacked copy and 0 everywhere else.

    Parameters
    ----------
    X_orig : numpy.ndarray of shape (n, d)
    k : int
        Number of ordinal classes.

    Returns
    -------
    numpy.ndarray of shape (n * (k - 1), d + k - 1)
    """
    n_copies = k - 1

    # (k - 1) copies of the feature matrix, one below the other
    stacked = np.concatenate([X_orig.copy() for _ in range(n_copies)])

    # Repeating each row of the identity matrix n times yields the block
    # structure: the rows of copy j carry a single 1 in column j.
    n_samples = X_orig.shape[0]
    thresholds = np.repeat(np.eye(n_copies, dtype=int), n_samples, axis=0)

    return np.concatenate([stacked, thresholds], axis=1)

In [None]:
# Binarised label vectors for both splits.
# NOTE(review): the thresholds are derived from the labels of each split
# separately; both splits must contain every class for the result to line up
# with the expanded feature matrices.
y_train_bin, y_test_bin = to_bin_y(y_train), to_bin_y(y_test)

print("y_train:", len(y_train_bin))
print("y_test:", len(y_test_bin))

In [None]:
# Expanded feature matrices; the class count always comes from the full
# label vector y.
X_train_bin = to_bin_X(X_train, k=len(np.unique(y)))
print("X_train_bin:", X_train_bin.shape)

X_test_bin = to_bin_X(X_test, k=len(np.unique(y)))
print("X_test_bin:", X_test_bin.shape)

### Model 4: LogisticRegression (sklearn)

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised binary logistic regression on the expanded problem. No
# intercept is fitted: the appended indicator columns act as thresholds.
binlogreg_kwargs = dict(
    penalty="l1",
    fit_intercept=False,
    max_iter=10000,
    solver="liblinear",
    random_state=1234,
    C=0.01,  # Inverse of regularization strength -> controls sparsity in our case!
)
sk_binlogreg_model = LogisticRegression(**binlogreg_kwargs)

sk_binlogreg_model.fit(X_train_bin, y_train_bin)

In [None]:
print("Train score:",sk_binlogreg_model.score(X_train_bin, y_train_bin))
print("Test score:",sk_binlogreg_model.score(X_test_bin, y_test_bin))

In [None]:
sk_binlogreg_model.coef_

## Binary Logistic Regression with GLMnet

In [None]:
from glmnet import LogitNet

# Note: Alpha is the regularization mixing parameter:
#   alpha=1 -> L1, alpha=0 -> L2, 0<alpha<1 -> elastic net
glmnet_kwargs = dict(
    alpha=1,
    fit_intercept=False,
    standardize=False,  # already standardized
    random_state=1234,
    max_iter=10000,
)
glmnet_model = LogitNet(**glmnet_kwargs)
glmnet_model.fit(X_train_bin, y_train_bin)

In [None]:
print("Train score: ", glmnet_model.score(X_train_bin, y_train_bin))
print("Test score: ", glmnet_model.score(X_test_bin, y_test_bin))

In [None]:
glmnet_model.coef_

## SGD with mini Batches

To reduce the memory load, this introduces a sampling method with an iterative training paradigm

ToDo: 

    - Construct matrix on the fly
    - Check convergence / early stopping
    

In [None]:
from sklearn.linear_model import SGDClassifier

# L1-penalised logistic regression trained incrementally with partial_fit.
sgd_model = SGDClassifier(loss="log_loss",
                          penalty="l1",
                          alpha=0.01,  # = lambda in paper!! Very important to tune for the desired sparsity!
                          fit_intercept=False,
                          n_jobs=1)

cur_iter = 0
max_iter = 10
n_batches = 2

# partial_fit must receive the same, complete label set on every call.
# Deriving it from the current batch breaks as soon as a batch happens to
# contain a single class only, so it is computed once from all labels.
all_classes = np.unique(y_train_bin)
n_rows = X_train_bin.shape[0]

while cur_iter < max_iter:
    # Every second iteration: report the score on the last batch that was
    # used in the previous iteration.
    if (cur_iter > 0 and cur_iter % 2 == 0):
        print("Iter: ", cur_iter, "Train score: ", sgd_model.score(X_batch, y_batch))

    cur_iter += 1

    # fit from samples of the big matrix
    # TODO: Sampling from the big matrix directly is just a proof of
    # principle and eliminates the purpose. Only the binarized y-vector
    # should be created and the indexes taken from the log count matrix.
    # Rows are drawn with replacement from the notebook-wide seeded generator
    # so that the sampling is reproducible.
    sampled_indices = rng.integers(n_rows, size=n_rows)

    start = 0
    for i in range(1, n_batches+1):
        end = (i * n_rows // n_batches)
        idx = sampled_indices[start:end]
        X_batch = X_train_bin[idx,:]
        y_batch = y_train_bin[idx]
        start = end
        sgd_model.partial_fit(X_batch, y_batch, classes=all_classes)

In [None]:
print("Train score:", sgd_model.score(X_train_bin, y_train_bin))
print("Test score:", sgd_model.score(X_test_bin, y_test_bin))

In [None]:
sgd_model.coef_

# Compare Parameters

- To compare parameters we first find the best regularization strength
    * The best regularization has the highest score across 5-fold CV
    * To increase sparsity, we choose the parameter with highest regularization, that lies within 1 standard error of the optimum
- Then we do N fits with different seeds and collect the parameter values. 
- Finally we compare the distributions of the collected parameter values visually and wrt KL-divergence

The models to investigate: sklearn LinRegressor, GLMnet model, SGD LinRegressor, and Psupertime


In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV

n_folds = 5
kf = StratifiedKFold(n_splits=n_folds)

# Elongate the original non-binarized y-train data to enable stratification.
# X_train_bin consists of (k-1) complete copies of X_train stacked one after
# another, so the labels have to be tiled (y_0..y_n, y_0..y_n, ...) to stay
# aligned with its rows. np.repeat would yield y_0, y_0, ..., y_1, y_1, ...
# and attach the wrong label to almost every row.
y_train_elong = np.tile(y_train, k-1)

# NOTE(review): the (k-1) rows that stem from one sample are assigned to
# folds independently of each other, so copies of a sample can end up on both
# sides of a split.
cv_splits = kf.split(X_train_bin, y_train_elong)

In [None]:
from sklearn.metrics import accuracy_score, log_loss, make_scorer

# Metric name -> function(y_true, y_pred), evaluated per CV fold below.
# NOTE(review): the CV loops pass the output of model.predict() to every
# scorer, so the "cross-entropy" entry is computed on hard labels rather than
# on probabilities.
scorers = dict([
    ("accuracy", accuracy_score),
    ("cross-entropy", log_loss),
])

In [None]:
def dof(params):
    """Return the degrees of freedom, i.e. the number of entries of
    ``params`` that are different from zero."""
    is_active = params != 0
    return np.count_nonzero(is_active)

In [None]:
def cv_res_to_df(cv_results, scorers, reg_params=None):
    """Turn the nested CV result dict into a table with one row per parameter.

    Parameters
    ----------
    cv_results : dict
        ``{split name: {metric name: list of values}}`` as filled by the CV
        loops in this notebook; every split holds a "dof" list plus one list
        per scorer, with one entry per regularization parameter.
    scorers : dict
        Only the keys (metric names) are used, to add the mean rows.
    reg_params : sequence, optional
        If given, the parameter axis is labelled "L=<value>" instead of the
        default positional labels 0, 1, 2, ...
        NOTE(review): find_optimal_param uses the row label as a position
        into its reg_params argument, which presumably only works with the
        default labels - confirm before passing reg_params here.

    Returns
    -------
    pandas.DataFrame
        Rows: regularization parameters. Columns: (split name, metric) pairs
        plus ("mean", metric) columns holding the mean over the splits.
    """
    # One row per (split, metric) pair, each cell still holding a whole list
    df = pd.DataFrame.from_dict(cv_results, orient="index").stack().to_frame()
    # Expand every list into columns: one column per regularization parameter
    df = pd.DataFrame(df[0].values.tolist(), index=df.index)
    
    if reg_params is not None:
        df.columns = ["L=%s" % x for x in reg_params]

    # Append the mean over all splits as additional ("mean", metric) rows
    for scorer in scorers.keys():
        df.loc[("mean", scorer), :] = df.xs(scorer, level=1).mean(axis=0)
    df.loc[("mean", "dof"), :] = df.xs("dof",level=1).mean(axis=0)
    
    # Transpose: parameters become rows, (split, metric) pairs become columns
    return df.T
    

In [None]:
def find_optimal_param(res_df, reg_params, lower_increases_reg=True):
    """Pick the most strongly regularized parameter that is still close to
    the best mean CV accuracy.

    Parameters that produced an all-zero model (mean dof == 0) are ignored.
    Among the rest, every parameter whose mean accuracy lies within one
    standard deviation of the best mean accuracy is a candidate, and the
    candidate at the "strong regularization" end of the table is returned.

    NOTE(review): the tolerance is the standard deviation of the mean accuracy
    *across the parameter grid*. The notebook text speaks of one standard
    error of the optimum (i.e. across folds) - confirm which one is intended.

    Parameters
    ----------
    res_df : pandas.DataFrame
        Output of ``cv_res_to_df``: one row per parameter (labelled by its
        position in ``reg_params``), with ("mean", "dof") and
        ("mean", "accuracy") columns.
    reg_params : sequence
        The parameter grid, in the same order as the rows of ``res_df``.
    lower_increases_reg : bool, default True
        If True the last candidate row is chosen, otherwise the first one.

    Returns
    -------
    The selected element of ``reg_params``.

    Raises
    ------
    ValueError
        If every parameter produced an all-zero model.
    """
    accuracy_col = ("mean", "accuracy")

    trimmed = res_df.loc[res_df[("mean", "dof")] != 0]
    if trimmed.empty:
        raise ValueError("All regularization parameters led to a model without non-zero coefficients.")

    trimmed_max = trimmed[accuracy_col].max()
    trimmed_std = trimmed[accuracy_col].std()
    # With a single remaining row the sample standard deviation is NaN, which
    # would turn every comparison below into False.
    if np.isnan(trimmed_std):
        trimmed_std = 0.0
    thresh = trimmed_max - trimmed_std

    # ">=" keeps the optimum itself as a candidate when the spread is zero
    # (a strict ">" left no candidates and failed with an IndexError).
    above = trimmed[trimmed[accuracy_col] >= thresh]

    if lower_increases_reg:
        idx = above.iloc[-1].name
    else:
        idx = above.iloc[0].name

    print("max:", trimmed_max, "std:", trimmed_std, "thresh:", thresh)
    print("Best average fit:", trimmed.loc[idx])
    print("Best parameter:", reg_params[idx])

    return reg_params[idx]

**Important result: Choice of regularization path (lambda path) is critical for selection of best parameter!**

In [None]:
# Regularization path in descending order: a coarse part 10, 9, ..., 1
# followed by 40 log-spaced values from 0.5**1 down to 0.5**15.
coarse_grid = np.linspace(1, 10, 10)[::-1]
fine_grid = np.logspace(1, 15, 40, base=0.5)
reg_params = np.concatenate((coarse_grid, fine_grid))

## SKLearn linregressor:


In [None]:
# GRIDSEARCH:
# ------------

#scorers = {
#    "accuracy": make_scorer(accuracy_score),
#    "cross-entropy-loss": make_scorer(log_loss)
#}
#params = {"C": [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]}
#sk_binlogreg_model = LogisticRegression(penalty="l1", 
#                                  fit_intercept=False,
#                                  max_iter=10000,
#                                  solver="liblinear",
#                                  random_state=1234)
#sk_binlogreg_cv = GridSearchCV(sk_binlogreg_model,
#                               param_grid=params,
#                               refit=False,
#                               cv=kf.split(X_train_bin, y_train_elong),
#                               scoring=scorers)
#sk_binlogreg_cv.cv_results_

# Problem: Does not save parameters of intermediate models: Tracking of sparsity not possible
# Let's do it ourselves ..
# But, still interesting for the final package!

In [None]:
#reg_params = [1, 0.75, 0.5, 0.25, 0.1, 0.075, 0.05, 0.025, 0.01, 0.005, 0.001, 0.0005]
# Grid of C values (inverse regularization strength), in descending order.
sk_reg_params = np.concatenate((np.linspace(1,5,10)[::-1], np.logspace(1, 15, 40, base=0.5)))
cv_results = dict()

for i, (cv_train_idx, cv_test_idx) in enumerate(kf.split(X_train_bin, y_train_elong)):

    s = "split_%s" % i
    print(s)

    # One list per metric and split, filled in the order of sk_reg_params
    cv_results[s] = dict()
    cv_results[s]["dof"] = []
    for scorer in scorers.keys():
        cv_results[s][scorer] = []

    # The fold data does not depend on the parameter: slice it only once
    X_fold_train, y_fold_train = X_train_bin[cv_train_idx,], y_train_bin[cv_train_idx]
    X_fold_test, y_fold_test = X_train_bin[cv_test_idx,], y_train_bin[cv_test_idx]

    for c in sk_reg_params:
        model = LogisticRegression(penalty="l1",
                                   C=c,
                                   fit_intercept=False,
                                   max_iter=10000,
                                   solver="liblinear",
                                   random_state=1357)

        model.fit(X_fold_train, y_fold_train)

        # Predict once per model instead of once per scorer.
        # NOTE(review): every scorer - including the cross-entropy - is fed
        # these hard labels, not probabilities.
        predicted = model.predict(X_fold_test)
        for scorer in scorers.keys():
            score = scorers[scorer](y_fold_test, predicted)
            cv_results[s][scorer].append(score)

        cv_results[s]["dof"].append(dof(model.coef_))


sk_linreg_res = cv_res_to_df(cv_results, scorers)

In [None]:
sk_linreg_res

In [None]:
# Default lower_increases_reg=True: of all C values inside the tolerance band
# the one in the last table row (the smallest C of the descending grid) wins.
sk_best_reg = find_optimal_param(sk_linreg_res, sk_reg_params)

## glmnet model

In [None]:
from glmnet import LogitNet

# Grid of lambda values, in descending order (10 coarse + 20 log-spaced).
glmnet_reg_params = np.concatenate((np.linspace(1,10,10)[::-1], np.logspace(1, 15, 20, base=0.5)))
cv_results = dict()

for i, (cv_train_idx, cv_test_idx) in enumerate(kf.split(X_train_bin, y_train_elong)):

    s = "split_%s" % i
    print(s)

    # One list per metric and split, filled in the order of glmnet_reg_params
    cv_results[s] = dict()
    cv_results[s]["dof"] = []
    for scorer in scorers.keys():
        cv_results[s][scorer] = []

    # The fold data does not depend on the parameter: slice it only once
    X_fold_train, y_fold_train = X_train_bin[cv_train_idx,], y_train_bin[cv_train_idx]
    X_fold_test, y_fold_test = X_train_bin[cv_test_idx,], y_train_bin[cv_test_idx]

    for lam in glmnet_reg_params:
        # A one-element lambda path fits exactly this regularization strength
        model = LogitNet(alpha=1,
                         lambda_path=[lam],
                         fit_intercept=False,
                         standardize=False,
                         random_state=1234,
                         max_iter=10000)

        model.fit(X_fold_train, y_fold_train)

        # Predict once per model instead of once per scorer.
        # NOTE(review): every scorer - including the cross-entropy - is fed
        # these hard labels, not probabilities.
        predicted = model.predict(X_fold_test)
        for scorer in scorers.keys():
            score = scorers[scorer](y_fold_test, predicted)
            cv_results[s][scorer].append(score)

        cv_results[s]["dof"].append(dof(model.coef_))

glmnet_cv_res = cv_res_to_df(cv_results, scorers)

In [None]:
glmnet_cv_res

In [None]:
# lower_increases_reg=False: of all lambdas inside the tolerance band the one
# in the first table row (the largest lambda of the descending grid) wins.
glmnet_best_reg = find_optimal_param(glmnet_cv_res, glmnet_reg_params, lower_increases_reg=False)

## SGD Model

In [None]:
from sklearn.linear_model import SGDClassifier

# SGD shares the common regularization path (alpha values, descending).
sgd_reg_params = reg_params
cv_results = dict()

# fixed model training params
max_iter = 50

# partial_fit must be told the complete label set, identically on every
# call. It is taken from the binarized training labels themselves instead of
# a batch variable left over from an earlier cell.
all_classes = np.unique(y_train_bin)

for i, (cv_train_idx, cv_test_idx) in enumerate(kf.split(X_train_bin, y_train_elong)):

    s = "split_%s" % i
    print(s)

    # One list per metric and split, filled in the order of sgd_reg_params
    cv_results[s] = dict()
    cv_results[s]["dof"] = []
    for scorer in scorers.keys():
        cv_results[s][scorer] = []

    X_fold_test, y_fold_test = X_train_bin[cv_test_idx,], y_train_bin[cv_test_idx]

    for a in sgd_reg_params:

        model = SGDClassifier(loss="log_loss",
                              penalty="l1",
                              alpha=a,  # = lambda in paper!! Very important to tune for the desired sparsity!
                              fit_intercept=False,
                              random_state=121,
                              n_jobs=1)
        cur_iter = 0

        while cur_iter < max_iter:
            cur_iter += 1

            # fit from samples of the big matrix
            # TODO: Sampling from the big matrix directly is just a proof of
            # principle and eliminates the purpose. Only the binarized
            # y-vector should be created and the indexes taken from the log
            # count matrix.
            # Each pass sees the training rows of the fold in a new order
            # (the index array is shuffled in place).
            rng.shuffle(cv_train_idx)
            model.partial_fit(X_train_bin[cv_train_idx,], y_train_bin[cv_train_idx], classes=all_classes)

        # Predict once per model instead of once per scorer.
        # NOTE(review): every scorer - including the cross-entropy - is fed
        # these hard labels, not probabilities.
        predicted = model.predict(X_fold_test)
        for scorer in scorers.keys():
            score = scorers[scorer](y_fold_test, predicted)
            cv_results[s][scorer].append(score)

        cv_results[s]["dof"].append(dof(model.coef_))

sgd_cv_res = cv_res_to_df(cv_results, scorers)


In [None]:
sgd_cv_res

In [None]:
# For SGDClassifier a larger alpha means stronger regularization (as with the
# glmnet lambda, and opposite to sklearn's C). On the descending grid the most
# regularized candidate is therefore the first row inside the tolerance band,
# which requires lower_increases_reg=False.
sgd_best_reg = find_optimal_param(sgd_cv_res, sgd_reg_params, lower_increases_reg=False)

## Compare Fits 

In [None]:
# pyplot is not imported anywhere else in the notebook
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(12,6))

legend = ["glmnet", "sklinreg", "sgd"]
cv_tables = [glmnet_cv_res, sk_linreg_res, sgd_cv_res]
param_grids = [glmnet_reg_params, sk_reg_params, sgd_reg_params]

# The grids differ in length (the glmnet path is shorter than the other two),
# so every curve is plotted against its own step index instead of one shared
# axis derived from reg_params.

# Panel 1: mean number of non-zero coefficients along the path
p0 = fig.add_subplot(131)
for label, table in zip(legend, cv_tables):
    dof_curve = table[("mean", "dof")]
    p0.plot(np.arange(len(dof_curve)), dof_curve, label=label)
p0.set_xlabel("Step on regularization path")
p0.set_ylabel("DoF")
p0.legend()

# Panel 2: mean CV accuracy along the path
p1 = fig.add_subplot(132)
for label, table in zip(legend, cv_tables):
    accuracy_curve = table[("mean", "accuracy")]
    p1.plot(np.arange(len(accuracy_curve)), accuracy_curve, label=label)
p1.set_xlabel("Step on regularization path")
p1.set_ylabel("Accuracy")
p1.legend()

# Panel 3: the regularization values themselves (log scale)
p2 = fig.add_subplot(133)
for label, grid in zip(legend, param_grids):
    p2.plot(np.arange(len(grid)), grid, label=label)
p2.set_yscale("log")
p2.set_xlabel("Step on regularization path")
p2.set_ylabel("Regularization")
p2.legend()

fig.tight_layout()