In [1]:
import datetime
# Print the current wall-clock time; a later cell prints it again so the
# duration of the simulation can be read off from the two outputs.
print(datetime.datetime.now())

2023-08-01 01:44:40.421996


In [2]:
import os
from sklearn import metrics
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from masterthesis.data import load_h5ad
from masterthesis.preprocessing import calculate_weights, transform_labels
from sklearn.model_selection import train_test_split
from masterthesis.model_selection import RegularizationGridSearch
from masterthesis.model import VanillaSGDBinarizedModel
from psupertime.model import BaselineSGDModel
from sklearn import metrics

import warnings

In [3]:
from sklearn.linear_model import SGDClassifier

# elastic net Classifier
class ElasticNetBinSGD(SGDClassifier):
    """Logistic-loss SGD classifier with a fixed elastic-net penalty.

    Thin wrapper around ``SGDClassifier`` that pins ``loss="log_loss"``,
    ``penalty="elasticnet"``, ``l1_ratio=0.8`` and ``fit_intercept=True`` and
    exposes the penalty strength under the name ``regularization``; the value
    is forwarded to the parent class as ``alpha``.

    Parameters
    ----------
    regularization : float, default=0.1
        Penalty strength, passed to ``SGDClassifier`` as ``alpha``.
    n_jobs, max_iter, random_state, n_iter_no_change, early_stopping, tol, learning_rate, eta0
        Forwarded unchanged to ``SGDClassifier``. Note that the defaults
        chosen here (e.g. ``early_stopping=True``, ``max_iter=100``) are the
        ones written in this signature, not the scikit-learn defaults.
    """

    def __init__(self, regularization=0.1,
                 n_jobs=1,
                 max_iter=100,
                 random_state=12345,
                 n_iter_no_change=5,
                 early_stopping=True,
                 tol=1e-3,
                 learning_rate="optimal",
                 eta0=0):
        # Stored under its own name so that get_params()/clone() can see it.
        self.regularization = regularization
        super().__init__(
            alpha=regularization,
            loss="log_loss",
            penalty="elasticnet",
            l1_ratio=0.8,
            fit_intercept=True,
            n_jobs=n_jobs,
            max_iter=max_iter,
            random_state=random_state,
            n_iter_no_change=n_iter_no_change,
            early_stopping=early_stopping,
            tol=tol,
            learning_rate=learning_rate,
            eta0=eta0,
        )

    def set_params(self, **params):
        """Set estimator parameters, keeping ``alpha`` equal to ``regularization``.

        ``alpha`` is what the optimiser actually reads, but it is only derived
        from ``regularization`` inside ``__init__``. Without this override,
        ``set_params(regularization=...)`` would update the public attribute
        while the model kept training with the old penalty strength.

        Returns
        -------
        self
        """
        super().set_params(**params)
        if "regularization" in params:
            self.alpha = params["regularization"]
        return self


In [4]:
# Model class handed to the grid search (the alternative is kept commented out).
#estimator_class = ElasticNetBinSGD
estimator_class = BaselineSGDModel
# When True, the simulation cell trains on the binarized/restructured data.
is_bin_model = False

# Output files: selected genes with their weights, and the per-run score table
genes_outfile = "genes_py_5-elastic08.txt"
results_outfile = "results_py_5-elastic08.txt"

# Fit parameters
n_seeds = 5        # runs (random seeds) per input file
n_folds = 5        # cross-validation folds
n_jobs = 4         # parallel workers for the grid search
n_reg_params = 20  # size of the regularization grid
# Geometric grid from the strongest (0.25) to the weakest (0.005) penalty.
reg_params = np.geomspace(start=0.25, stop=0.005, num=n_reg_params)
scoring = metrics.make_scorer(metrics.accuracy_score)

# Simulation data: one file per (TS, SS) pair on the 0.1, 0.3, ..., 0.9 grid
# with TS + SS <= 1.0, ordered by TS first and SS second.
data_dir = "/home/julian/Uni/MasterThesis/data"
filenames = [
    f"simdata_v2_TS0.{ts_tenths}_SS0.{ss_tenths}.h5ad"
    for ts_tenths in range(1, 10, 2)
    for ss_tenths in range(1, 10, 2)
    if ts_tenths + ss_tenths <= 10
]

In [6]:
from masterthesis.model import BinaryModelMixin, LinearBinarizedModel

# Show each distinct warning only once while the simulation is running.
warnings.filterwarnings("once")

# One Series of non-zero gene weights is appended per (file, seed) run.
genes = []

# Column-wise accumulator for the score table: every key becomes a column of
# the results file and receives exactly one value per (file, seed) run.
results = {
    column: []
    for column in (
        "file",
        "seed",
        "best_reg",
        "dof",
        "all_accuracy",
        "all_bal_acc",
        "all_abs_err",
        "train_accuracy",
        "train_bal_acc",
        "train_abs_err",
        "test_accuracy",
        "test_bal_acc",
        "test_abs_err",
        "spearman_corr",
        "pearson_corr",
        "precision",
        "sensitivity",
    )
}

print("[*] Running Simulation")
print("[*] Regularization Params = ", reg_params)

# Main simulation: for every input file, fit and evaluate `n_seeds` models and
# append one row of scores to `results` and one weight Series to `genes`.
for f in filenames:
    simfile = os.path.join(data_dir, f)
    print("[*] Reading file %s ..." % simfile)
    anndata = load_h5ad(simfile)

    # Training target: the raw time labels cast to int and passed through
    # transform_labels.
    anndata.obs["ordinal_label"] = transform_labels(np.array([int(x) for x in anndata.obs.Ordinal_Time_Labels]))
    # Stratified 90/10 split with a fixed random_state, so every seed below is
    # evaluated on the same train/test partition of this file.
    X_train, X_test, y_train, y_test = train_test_split(anndata.X, anndata.obs["ordinal_label"],
                                                        test_size=0.1,
                                                        stratify=anndata.obs["ordinal_label"],
                                                        random_state=1234)

    if is_bin_model:
        # ---------------------------------------------------------------------------------------
        # Required to train the binary model directly
        X_train_bin = BinaryModelMixin.restructure_X_to_bin(X_train, len(np.unique(y_train)) - 1)
        y_train_bin = BinaryModelMixin.restructure_y_to_bin(y_train)
        # ---------------------------------------------------------------------------------------

    # Sample weights used only for the weighted mean-absolute-error scores.
    weights_all = calculate_weights(anndata.obs.Ordinal_Time_Labels)
    weights_train = calculate_weights(y_train)
    weights_test = calculate_weights(y_test)

    for i in range(n_seeds):

        # Drawn from NumPy's global RNG, which this notebook never seeds: the
        # seeds differ between notebook runs, but each one is stored in
        # results["seed"] so an individual fit can be repeated.
        seed = np.random.randint(9999)
        print("... Iteration %s, Seed=%s" % (i, seed))

        sgd = RegularizationGridSearch(estimator=estimator_class,
                                       n_folds=n_folds,
                                       n_jobs=n_jobs,
                                       lambdas=reg_params,
                                       scoring=scoring)

        estimator_params = {"random_state": seed, "max_iter": 1000, "early_stopping": True}
        fit_params = None #{"sample_weight": weights_train}

        if not is_bin_model:
            sgd.fit(X_train, y_train, fit_params=fit_params, estimator_params=estimator_params)
            # NOTE(review): "1se" presumably selects by the one-standard-error
            # rule -- confirm against RegularizationGridSearch.
            sparse_model = sgd.get_optimal_model("1se")
            sparse_model.fit(X_train, y_train)

        else:
            # ---------------------------------------------------------------------------------------
            # REQUIRED TO TRAIN THE BINARY MODEL DIRECTLY
            sgd.fit(X_train_bin, y_train_bin, fit_params=fit_params, estimator_params=estimator_params)

            print("... Refitting on training data")
            model = sgd.get_optimal_model("1se")
            model.fit(X_train_bin, y_train_bin)

            # use wrapper: the last k coefficients of the binary model are
            # split off as thresholds, the remaining ones are the gene weights
            sparse_model = LinearBinarizedModel(regularization=model.regularization)
            k = len(np.unique(y_train)) - 1
            sparse_model.k = k
            sparse_model.coef_ = model.coef_[0][:-k]
            sparse_model.intercept_ = [thresh + model.intercept_ for thresh in model.coef_[0][-k:]]
            sparse_model.is_fitted_ = True

            # ---------------------------------------------------------------------------------------

        # genes weights
        anndata.var["psupertime_weights"] = sparse_model.coef_
        genes += [anndata.var.psupertime_weights[anndata.var.psupertime_weights != 0]]

        # calculate psupertime -> adds anndata.obs.psupertime
        # (anndata.obs.predicted_label, used below, is presumably added by the
        # same call -- it is not assigned anywhere else in this notebook)
        sparse_model.predict_psuper(anndata)
        pearsonr = anndata.obs.Latent_Time.corr(anndata.obs.psupertime)
        spearmanr = anndata.obs.Latent_Time.corr(anndata.obs.psupertime, method='spearman')

        results["file"] += [f]
        results["seed"] += [seed]
        results["best_reg"] += [sparse_model.regularization]
        dof = len(np.nonzero(sparse_model.coef_)[0])
        results["dof"] += [dof]

        # scores on all data (for comparison, because psupertime only measures this)
        # NOTE(review): these compare against the raw Ordinal_Time_Labels while
        # the model was fitted on the transformed "ordinal_label" column --
        # confirm both use the same encoding.
        results["all_accuracy"] += [metrics.accuracy_score(anndata.obs.Ordinal_Time_Labels, anndata.obs.predicted_label)]
        results["all_bal_acc"] += [metrics.balanced_accuracy_score(anndata.obs.Ordinal_Time_Labels, anndata.obs.predicted_label)]
        results["all_abs_err"] += [metrics.mean_absolute_error(anndata.obs.Ordinal_Time_Labels,
                                                               anndata.obs.predicted_label,
                                                               sample_weight=weights_all)]

        # Predict once per split and reuse the result for all three metrics.
        y_train_pred = sparse_model.predict(X_train)
        y_test_pred = sparse_model.predict(X_test)

        # train scores
        results["train_accuracy"] += [metrics.accuracy_score(y_train, y_train_pred)]
        train_bacc = metrics.balanced_accuracy_score(y_train, y_train_pred)
        results["train_bal_acc"] += [train_bacc]
        results["train_abs_err"] += [metrics.mean_absolute_error(y_train, y_train_pred, sample_weight=weights_train)]

        # test scores
        results["test_accuracy"] += [metrics.accuracy_score(y_test, y_test_pred)]
        test_bacc = metrics.balanced_accuracy_score(y_test, y_test_pred)
        results["test_bal_acc"] += [test_bacc]
        results["test_abs_err"] += [metrics.mean_absolute_error(y_test, y_test_pred, sample_weight=weights_test)]

        # correlation
        results["spearman_corr"] += [spearmanr]
        results["pearson_corr"] += [pearsonr]

        # identification of significant genes: genes with Setting == "TS" are
        # the positives, genes with a non-zero weight are the selected ones.
        # Both indexes are built once and compared in a single vectorised
        # membership test instead of re-filtering anndata.var per gene.
        ts_genes = anndata.var[anndata.var.Setting == "TS"].index
        selected_genes = anndata.var[anndata.var.psupertime_weights.abs() != 0].index
        TP = int(selected_genes.isin(ts_genes).sum())
        FP = len(selected_genes) - TP
        P = len(ts_genes)
        results["sensitivity"] += [TP / P]
        results["precision"] += [TP / (TP + FP) if TP + FP > 0 else 0]

        print("... dof:", dof,  "train_bacc:", train_bacc, "test_bacc", test_bacc, "spear_cor", spearmanr)
        
print("[*] Writing results")
# Write the score table (one row per (file, seed) run) to CSV
pd.DataFrame(results).to_csv(results_outfile)

# Write genes and weights: two lines per run -- the gene names and their
# absolute weights, both ordered by ascending absolute weight. A run that
# selected no genes produces two empty lines, so the n-th pair of lines always
# belongs to the n-th row of the score table.
with open(genes_outfile, "w") as genes_file:
    for g in genes:
        # Test the current run's Series (the list `genes` itself is never
        # empty inside this loop).
        if len(g) == 0:
            genes_file.write("\n\n")
            continue
        ordered = g.abs().sort_values()  # sort once, reuse for both lines
        genes_file.write(", ".join(ordered.index) + "\n")
        genes_file.write(", ".join(str(el) for el in ordered) + "\n")

# Switch the warning filter to "always" now that the simulation has finished.
warnings.filterwarnings("always")


[*] Running Simulation
[*] Regularization Params =  [0.25       0.20347944 0.16561553 0.13479742 0.10971402 0.08929819
 0.07268138 0.05915667 0.04814866 0.03918905 0.03189666 0.02596126
 0.02113033 0.01719835 0.01399804 0.01139326 0.00927317 0.0075476
 0.00614313 0.005     ]
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.1_SS0.1.h5ad ...
... Iteration 0, Seed=4263
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 1, Seed=3019
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=4671
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 3, Seed=8428
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 4, Seed=5908
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.1_SS0.3.h5ad ...
... Iteration 0, Seed=6502
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 1, Seed=39
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=5424
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 3, Seed=8451
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 4, Seed=8865
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.1_SS0.5.h5ad ...
... Iteration 0, Seed=3825
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 1, Seed=6583
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=6345
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 3, Seed=6795
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 4, Seed=9128
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.1_SS0.7.h5ad ...
... Iteration 0, Seed=10
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 1, Seed=9293
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=8943
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 3, Seed=1660
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 4, Seed=4286
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.1_SS0.9.h5ad ...
... Iteration 0, Seed=4668
... dof: 2 train_bacc: 0.15676823975512816 test_bacc 0.1556776556776557 spear_cor 0.12388524513412778
... Iteration 1, Seed=6122
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=5738
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 3, Seed=6453
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 4, Seed=3785
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.3_SS0.1.h5ad ...
... Iteration 0, Seed=8723
... dof: 4534 train_bacc: 0.6576648671670215 test_bacc 0.2185592185592186 spear_cor 0.6309562612078256
... Iteration 1, Seed=5726
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=4241
... dof: 2631 train_bacc: 0.4936034690661148 test_bacc 0.23382173382173382 spear_cor 0.6501463037026822
... Iteration 3, Seed=7606
... dof: 15 train_bacc: 0.16106263419982642 test_bacc 0.13186813186813187 spear_cor 0.18031196233213856
... Iteration 4, Seed=7679
... dof: 3501 train_bacc: 0.5953500425700617 test_bacc 0.173992673992674 spear_cor 0.6415661340319748
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.3_SS0.3.h5ad ...
... Iteration 0, Seed=987
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 1, Seed=4438
... dof: 2 train_bacc: 0.14555256064690028 test_bacc 0.14285714285714285 spear_cor 0.4317280216909894
... Iteration 2, Seed=7005
... dof: 4935 train_bacc: 0.6075398652374019 test_bacc 0.23382173382173382 spear_cor 0.6650401061076582
... Iteration 3, Seed=1208
... dof: 1 train_bacc: 0.15539768833660744 test_bacc 0.19047619047619047 spear_cor 0.43466385485284725
... Iteration 4, Seed=5978
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.3_SS0.5.h5ad ...
... Iteration 0, Seed=831
... dof: 24 train_bacc: 0.1551692631001873 test_bacc 0.1556776556776557 spear_cor 0.008538964547137195
... Iteration 1, Seed=2136
... dof: 4859 train_bacc: 0.6532194891846625 test_bacc 0.163003663003663 spear_cor 0.6380938688434539
... Iteration 2, Seed=8446
... dof: 5356 train_bacc: 0.6583812364512551 test_bacc 0.21794871794871792 spear_cor 0.6657974894470872
... Iteration 3, Seed=6154
... dof: 7 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor 0.26097947512837055
... Iteration 4, Seed=5369
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.3_SS0.7.h5ad ...
... Iteration 0, Seed=1437
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 1, Seed=5582
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 2, Seed=343
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
... Iteration 3, Seed=9553
... dof: 3465 train_bacc: 0.4989282333474048 test_bacc 0.18681318681318682 spear_cor 0.6319761199086956
... Iteration 4, Seed=6631
Regularization: 20/20



... dof: 0 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor nan
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.5_SS0.1.h5ad ...
... Iteration 0, Seed=6950
... dof: 3 train_bacc: 0.261455525606469 test_bacc 0.2619047619047619 spear_cor 0.9107050528324951
... Iteration 1, Seed=7885
... dof: 1 train_bacc: 0.14285714285714285 test_bacc 0.14285714285714285 spear_cor 0.5622709559626939
... Iteration 2, Seed=3136
... dof: 3235 train_bacc: 0.6405113532230915 test_bacc 0.2606837606837607 spear_cor 0.6473437174497065
... Iteration 3, Seed=1506
... dof: 3341 train_bacc: 0.5839540147780448 test_bacc 0.22710622710622713 spear_cor 0.658509859442052
... Iteration 4, Seed=7884
... dof: 1 train_bacc: 0.14973274247338847 test_bacc 0.14285714285714285 spear_cor 0.47746015955695437
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_v2_TS0.5_SS0.3.h5ad ...
... Iteration 0, Seed=6283
... dof: 4 train_bacc: 0.2641509433962264 test_bacc 0.2619047619047619 sp

In [7]:
# Print the current wall-clock time again; comparing it with the timestamp
# printed by the first cell gives the total run time of the simulation.
print(datetime.datetime.now())

2023-08-01 07:38:46.374224


**Roughly 6h runtime for the simulation with data restructuring**

**Roughly 3h Runtime for simulation with vanilla SGD**


In [None]:
# The results file was written by DataFrame.to_csv with its default
# index=True, so the first CSV column holds the row index. Read it back as the
# index instead of letting it appear as an "Unnamed: 0" data column.
df = pd.read_csv(results_outfile, index_col=0)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the result columns
# over all (file, seed) runs.
df.describe()

In [None]:
# Inspect a single run by position. The simulation appends n_seeds (= 5) rows
# per input file, in file order, so position 36 is the second run (iteration 1)
# of the eighth file -- assuming the results file comes from a complete run.
df.iloc[36]