In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from sklearn.preprocessing import StandardScaler
from scipy.special import logit

In [2]:
from numpy.random import default_rng
rng = default_rng(seed=1234)

In [3]:
from masterthesis.data import load_h5ad

# load the python AnnData object
acinar_ann = load_h5ad("/home/julian/Uni/MasterThesis/data/acinar_sce.h5ad")

# Data

In [4]:
# sampling in R with seed 1234
test_idx = [284, 336, 406, 101, 111, 393, 133, 400, 388, 98, 103, 214, 90, 326, 79, 372, 270, 382, 184, 62, 4, 403, 149, 40, 212, 195, 93, 122, 66, 175, 379, 304, 108, 131, 343, 41, 115, 228, 328, 298, 299]
train_idx = list(set(range(acinar_ann.X.shape[0])) - set(test_idx))

# selected Genes after preprocessing in R
sel_genes = ["REG3A", "AMY2A", "MT2A", "OLFM4",
             "SYCN", "CELA2B", "FGL1", "AMY2B",
             "MT1G", "TM4SF1", "CELA2A", "PDK4", 
             "TACSTD2", "CD44", "PNLIPRP2", "ALB", 
             "ERP27", "LDHA", "REG3G", "CTRL", "CLPS",
             "FOS", "HSPA8", "SERPINA3", "CELA3B", "CRP" ]

sel_genes = sorted(sel_genes)

from sklearn.model_selection import train_test_split

y = np.array([int(x) for x in acinar_ann.obs.donor_age])
k = len(np.unique(y))
X_train, X_test, y_train, y_test = train_test_split(acinar_ann[:,sel_genes].X, y, 
                                                    test_size=0.1, 
                                                    stratify=y,
                                                    random_state=1234)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(369, 26)
(42, 26)
(369,)
(42,)


# Preprocessing

In [5]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [6]:
# Transform y into a series of subsequent labels [0,1,2 ...]
transf = dict(zip(np.unique(y),
                  np.arange(0, len(np.unique(y)))))
                        
y_train_trans = np.array([transf[e] for e in y_train])
y_test_trans = np.array([transf[e] for e in y_test])

# reordering, such that yi < yi+1
train_reorder = np.argsort(y_train_trans)
test_reorder = np.argsort(y_test_trans)

In [7]:
from masterthesis.data import restructure_X_to_bin, restructure_y_to_bin

y_train_bin = restructure_y_to_bin(y_train)
print("y_train:", len(y_train_bin))

y_test_bin = restructure_y_to_bin(y_test)
print("y_test:", len(y_test_bin))

X_train_bin = restructure_X_to_bin(X_train, n_thresholds=np.unique(y).size -1 )
print("X_train_bin:", X_train_bin.shape)

X_test_bin = restructure_X_to_bin(X_test, n_thresholds=np.unique(y).size - 1)
print("X_test_bin:", X_test_bin.shape)

y_train: 2583
y_test: 294
X_train_bin: (2583, 33)
X_test_bin: (294, 33)


# Models

R params (25 degrees of freedom model (including thresholds))
```
ALB        AMY2A        AMY2B         CD44       CELA2A       CELA2B       CELA3B         CLPS 
 0.318453883  0.000000000  0.436426019  0.163187190  0.000000000 -0.703729963  0.654379359  0.000000000 
         CRP         CTRL        ERP27         FGL1          FOS        HSPA8         LDHA         MT1G 
 0.025537654 -0.024909444 -0.541135669  0.000000000 -0.265368767 -0.005586048  0.000000000  0.000000000 
        MT2A        OLFM4         PDK4     PNLIPRP2        REG3A        REG3G     SERPINA3         SYCN 
 0.337725221  0.000000000  0.235017877 -0.167188413  0.636231174 -0.558136212  1.128825473  0.000000000 
     TACSTD2       TM4SF1          cp1          cp2          cp3          cp4          cp5          cp6 
-0.528875018  0.227809234  5.192485102  2.536847033  2.003129014  0.837138551  0.677894210 -1.051345724 
         cp7 
-1.691617612 
```

In [23]:
from masterthesis.model import LinearBinarizedModel
from masterthesis.metrics import abs_delta, class_error

# Fitting
lin_model = LinearBinarizedModel(regularization=1000).fit(X_train, y_train)

print(lin_model.beta)
print(lin_model.theta)

# Predictions
y_pred_train = lin_model.predict(X_train)

print("(Train) Mean Absolute Delta:", abs_delta(y_train_trans, y_pred_train, mean=True))
print("(Train) Class Error:", class_error(y_train_trans, y_pred_train))

y_pred_test = lin_model.predict(X_test)

print("(Test) Predictions:", y_pred_test)

print("(Test) Mean Absolute Delta:", abs_delta(y_test_trans, y_pred_test, mean=True))
print("(Test) Class Error:", class_error(y_test_trans, y_pred_test))

lin_model

[ 2.93900142e-01  1.79770339e-01  5.63835038e-01  6.70140794e-01
 -2.26899530e-02 -8.00216724e-01  9.62427817e-01 -8.03677961e-01
 -7.10537999e-04 -7.47236994e-01 -5.43638653e-01  1.83489646e-01
 -1.94134175e-01 -7.13952822e-02 -8.26001839e-03  3.53619138e-02
  6.24720432e-01  9.32382957e-02  4.89473755e-01 -3.69608435e-01
  7.46705116e-01 -4.12876340e-01  8.85006579e-01  7.56907854e-01
 -5.16777262e-01  1.09066106e-01]
[-4.19418722 -3.47351606 -1.5363196  -1.3317969   0.09230499  0.75199896
  3.46038067]
(Train) Mean Absolute Delta: 1.7100271002710028
(Train) Class Error: 0.16802168021680217
(Test) Predictions: [6 4 6 4 2 7 2 6 4 4 6 2 0 6 6 4 6 6 6 6 7 4 6 4 4 6 4 2 6 6 2 6 0 2 6 4 0
 6 0 6 2 6]
(Test) Mean Absolute Delta: 1.9047619047619047
(Test) Class Error: 0.19047619047619047


In [9]:
from masterthesis.model import SGDBinarizedModel

# Fitting
sgd_model = SGDBinarizedModel(regularization=0.01, max_iter=5).fit(X_train, y_train)

print(sgd_model.beta)
print(sgd_model.theta)

# Predictions
y_pred_train = sgd_model.predict(X_train)

print("(Train) Mean Absolute Delta:", abs_delta(y_train_trans, y_pred_train, mean=True))
print("(Train) Class Error:", class_error(y_train_trans, y_pred_train))

y_pred_test = sgd_model.predict(X_test)

print("(Test) Predictions:", y_pred_test)

print("(Test) Mean Absolute Delta:", abs_delta(y_test_trans, y_pred_test, mean=True))
print("(Test) Class Error:", class_error(y_test_trans, y_pred_test))

Iter:  2 Train score:  0.8691950464396285
Iter:  4 Train score:  0.8738390092879257
[ 0.01664695  0.          0.29847146  0.3564868   0.         -0.51959845
  0.14481885  0.          0.         -0.5153116  -0.3192593   0.01919667
 -0.06470973 -0.01847443  0.          0.          0.62014956  0.11118106
  0.23018893 -0.15575448  0.6101014  -0.46300347  0.63497354  0.
 -0.23404583  0.        ]
[-3.44794172 -2.34751961 -0.79430111 -0.56780821  0.          0.01500561
  2.62129456]
(Train) Mean Absolute Delta: 1.7615176151761518
(Train) Class Error: 0.17615176151761516
(Test) Predictions: [6 6 6 6 2 7 2 6 6 6 6 6 0 6 6 2 6 6 6 6 7 6 6 6 6 6 6 2 6 6 2 6 0 2 6 6 2
 6 0 6 2 6]
(Test) Mean Absolute Delta: 2.238095238095238
(Test) Class Error: 0.19047619047619047
