In [1]:
import datetime
print(datetime.datetime.now())

2023-07-08 00:51:42.297144


In [2]:
import os
from sklearn import metrics
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from masterthesis.data import load_h5ad
from masterthesis.preprocessing import calculate_weights, transform_labels
from sklearn.model_selection import train_test_split
from masterthesis.model_selection import RegularizationGridSearch
from masterthesis.model import VanillaSGDBinarizedModel
from sklearn import metrics

import warnings

In [3]:
estimator_class = VanillaSGDBinarizedModel

# report files
genes_outfile = "genes_py_2.txt"
results_outfile = "results_py_2.txt"

# Fit params
n_seeds = 5
n_folds = 5
n_jobs = 4
n_reg_params = 20
reg_params = np.geomspace(0.25, 0.01, n_reg_params)
scoring = metrics.make_scorer(metrics.accuracy_score)

# Simulation data
data_dir = "/home/julian/Uni/MasterThesis/data"
filenames = [
    "simdata_TS0.1_SS0.1.h5ad", 
    "simdata_TS0.1_SS0.3.h5ad",
    "simdata_TS0.1_SS0.5.h5ad",
    "simdata_TS0.1_SS0.7.h5ad",
    "simdata_TS0.1_SS0.9.h5ad",
    "simdata_TS0.3_SS0.1.h5ad",
    "simdata_TS0.3_SS0.3.h5ad",
    "simdata_TS0.3_SS0.5.h5ad",
    "simdata_TS0.3_SS0.7.h5ad",
    "simdata_TS0.5_SS0.1.h5ad",
    "simdata_TS0.5_SS0.3.h5ad",
    "simdata_TS0.5_SS0.5.h5ad",
    "simdata_TS0.7_SS0.1.h5ad",
    "simdata_TS0.7_SS0.3.h5ad",
    "simdata_TS0.9_SS0.1.h5ad"
]

In [7]:
warnings.filterwarnings("once")

genes = []
results = {
    "file": [],
    "seed": [],
    "best_reg": [],
    "dof": [],
    "train_accuracy": [],
    "train_bal_acc": [],
    "train_abs_err": [],
    "test_accuracy": [],
    "test_bal_acc": [],
    "test_abs_err": [],
    "spearman_corr": [],
    "pearson_corr": [],
    "precision": [],
    "sensitivity": [],
}

print("[*] Running Simulation")
print("[*] Regularization Params = ", reg_params)

for f in filenames:
    simfile = os.path.join(data_dir, f)
    print("[*] Reading file %s ..." % simfile)
    anndata = load_h5ad(simfile)

    anndata.obs["ordinal_label"] = transform_labels(np.array([int(x) for x in anndata.obs.Ordinal_Time_Labels]))
    X_train, X_test, y_train, y_test = train_test_split(anndata.X, anndata.obs["ordinal_label"], 
                                                        test_size=0.1, 
                                                        stratify=anndata.obs["ordinal_label"],
                                                        random_state=1234)

    weights_train = calculate_weights(y_train)
    weights_test = calculate_weights(y_test)

    for i in range(n_seeds):

        seed = np.random.randint(9999)
        print("... Iteration %s, Seed=%s" % (i, seed))

        #print("... Cross Validation")
        sgd = RegularizationGridSearch(estimator=estimator_class,
                                       n_folds=n_folds,
                                       n_jobs=n_jobs,
                                       lambdas=reg_params,
                                       scoring=scoring)

        estimator_params = {"random_state": seed, "max_iter": 10, "early_stopping": True}
        fit_params = None #{"sample_weight": weights_train}
        sgd.fit(X_train, y_train, fit_params=fit_params, estimator_params=estimator_params)

        #print("... Refitting on training data")
        sparse_model = sgd.get_optimal_model("1se")
        sparse_model.fit(X_train, y_train)

        # genes weights
        anndata.var["psupertime_weights"] = sparse_model.coef_
        genes += [anndata.var.psupertime_weights[anndata.var.psupertime_weights != 0]]

        # calculate psupertime -> adds anndata.obs.psupertime
        sparse_model.predict_psuper(anndata)
        pearsonr = anndata.obs.Latent_Time.corr(anndata.obs.psupertime)
        spearmanr = anndata.obs.Latent_Time.corr(anndata.obs.psupertime, method='spearman')
        kendalltau = anndata.obs.Latent_Time.corr(anndata.obs.psupertime, method='kendall')

        results["file"] += [f]
        results["seed"] += [seed]
        results["best_reg"] += [sparse_model.regularization]
        results["dof"] += [len(np.nonzero(sparse_model.coef_)[0])]
        results["train_accuracy"] += [metrics.accuracy_score(y_train, sparse_model.predict(X_train))]
        results["train_bal_acc"] += [metrics.balanced_accuracy_score(y_train, sparse_model.predict(X_train))]
        results["train_abs_err"] += [metrics.mean_absolute_error(y_train, sparse_model.predict(X_train), sample_weight=weights_train)]
        results["test_accuracy"] += [metrics.accuracy_score(y_test, sparse_model.predict(X_test))]
        results["test_bal_acc"] += [metrics.balanced_accuracy_score(y_test, sparse_model.predict(X_test))]
        results["test_abs_err"] += [metrics.mean_absolute_error(y_test, sparse_model.predict(X_test), sample_weight=weights_test)]
        results["spearman_corr"] += [spearmanr]
        results["pearson_corr"] += [pearsonr]
        
        TP = sum([g in anndata.var[anndata.var.Setting == "TS"].index for g in anndata.var[anndata.var.psupertime_weights.abs() != 0].index])
        FP = len(anndata.var[anndata.var.psupertime_weights.abs() != 0].index) - TP
        P = anndata.var[anndata.var.Setting == "TS"].shape[0]
        results["sensitivity"] += [TP / P]
        results["precision"] += [TP / (TP + FP) if TP + FP > 0 else 0]

print("[*] Writing results")
# Write results to files
pd.DataFrame(results).to_csv(results_outfile)

# Write Genes and weights
with open(genes_outfile, "w") as f:
    for g in genes:#
        if (len(genes) == 0):
            f.write("\n\n")
        else:
            f.write(", ".join(g.abs().sort_values().index) + "\n")
            f.write(", ".join([str(el) for el ing.abs().sort_values()]) + "\n")

warnings.filterwarnings("always")


[*] Running Simulation
[*] Regularization Params =  [0.25       0.21103973 0.17815107 0.15038781 0.12695121 0.107167
 0.09046598 0.07636766 0.06446644 0.05441992 0.04593906 0.03877987
 0.03273637 0.0276347  0.02332808 0.0196926  0.01662369 0.01403303
 0.01184611 0.01      ]
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.1_SS0.1.h5ad ...
... Iteration 0, Seed=1508
Regularization: 20/20



... Iteration 1, Seed=5424
Regularization: 20/20



... Iteration 2, Seed=1097
Regularization: 20/20



... Iteration 3, Seed=3498
Regularization: 20/20



... Iteration 4, Seed=1166
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.1_SS0.3.h5ad ...
... Iteration 0, Seed=584
Regularization: 20/20



... Iteration 1, Seed=3448
Regularization: 20/20



... Iteration 2, Seed=5887
Regularization: 20/20



... Iteration 3, Seed=8304
Regularization: 20/20



... Iteration 4, Seed=1468
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.1_SS0.5.h5ad ...
... Iteration 0, Seed=8872
Regularization: 20/20



... Iteration 1, Seed=3445
Regularization: 20/20



... Iteration 2, Seed=6291
Regularization: 20/20



... Iteration 3, Seed=9530
Regularization: 20/20



... Iteration 4, Seed=6119
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.1_SS0.7.h5ad ...
... Iteration 0, Seed=2259
Regularization: 20/20



... Iteration 1, Seed=2148
Regularization: 20/20



... Iteration 2, Seed=8756
Regularization: 20/20



... Iteration 3, Seed=2217
... Iteration 4, Seed=6560
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.1_SS0.9.h5ad ...
... Iteration 0, Seed=4752
Regularization: 20/20



... Iteration 1, Seed=5455
Regularization: 20/20



... Iteration 2, Seed=2941
... Iteration 3, Seed=9757
Regularization: 20/20



... Iteration 4, Seed=5896
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.3_SS0.1.h5ad ...
... Iteration 0, Seed=8932
Regularization: 20/20



... Iteration 1, Seed=9498
... Iteration 2, Seed=5176
Regularization: 20/20



... Iteration 3, Seed=5723
Regularization: 20/20



... Iteration 4, Seed=4067
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.3_SS0.3.h5ad ...
... Iteration 0, Seed=329
Regularization: 20/20



... Iteration 1, Seed=5652
Regularization: 20/20



... Iteration 2, Seed=4375
... Iteration 3, Seed=5010
Regularization: 20/20



... Iteration 4, Seed=1176
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.3_SS0.5.h5ad ...
... Iteration 0, Seed=9248
Regularization: 20/20



... Iteration 1, Seed=1220
Regularization: 20/20



... Iteration 2, Seed=9016
Regularization: 20/20



... Iteration 3, Seed=5400
Regularization: 20/20



... Iteration 4, Seed=8737
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.3_SS0.7.h5ad ...
... Iteration 0, Seed=5760
Regularization: 20/20



... Iteration 1, Seed=7244
Regularization: 20/20



... Iteration 2, Seed=8679
Regularization: 20/20



... Iteration 3, Seed=8783
Regularization: 20/20



... Iteration 4, Seed=5149
Regularization: 20/20



[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.5_SS0.1.h5ad ...
... Iteration 0, Seed=4389
... Iteration 1, Seed=9542
... Iteration 2, Seed=4419
... Iteration 3, Seed=1066
... Iteration 4, Seed=1087
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.5_SS0.3.h5ad ...
... Iteration 0, Seed=3491
... Iteration 1, Seed=4090
... Iteration 2, Seed=9400
... Iteration 3, Seed=5160
... Iteration 4, Seed=9622
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.5_SS0.5.h5ad ...
... Iteration 0, Seed=4124
... Iteration 1, Seed=5232
... Iteration 2, Seed=983
... Iteration 3, Seed=9759
... Iteration 4, Seed=2678
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.7_SS0.1.h5ad ...
... Iteration 0, Seed=2530
... Iteration 1, Seed=3137
... Iteration 2, Seed=9880
... Iteration 3, Seed=1839
... Iteration 4, Seed=6912
[*] Reading file /home/julian/Uni/MasterThesis/data/simdata_TS0.7_SS0.3.h5ad ...
... Iteration 0, Seed=7918
... Iteration 1, Seed=2651
..

TypeError: sequence item 0: expected str instance, float found

In [12]:
# Write Genes and weights
with open(genes_outfile, "w") as f:
    for g in genes:#
        if (len(genes) == 0):
            f.write("\n\n")
        else:
            f.write(", ".join(g.abs().sort_values().index) + "\n")
            f.write(", ".join([str(el) for el in g.abs().sort_values()]) + "\n")


In [None]:
import datetime
print(datetime.datetime.now())

**Roughly 6h Runtime for simulation with data restruturing**

**Roughly 3h Runtime for simulation with vanilla SGD**


In [8]:
df = pd.read_csv(results_outfile)

In [9]:
df.describe()

Unnamed: 0.1,Unnamed: 0,seed,best_reg,dof,train_accuracy,train_bal_acc,train_abs_err,test_accuracy,test_bal_acc,test_abs_err,spearman_corr,pearson_corr,precision,sensitivity
count,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,75.0,35.0,35.0,75.0,75.0
mean,37.0,5116.773333,0.17894,46.893333,0.265502,0.192401,2.608012,0.257701,0.190466,2.601556,0.773685,0.788735,0.404534,0.000359
std,21.794495,2878.035627,0.073166,327.839635,0.055298,0.070436,0.528152,0.049655,0.063519,0.537877,0.219991,0.246192,0.485023,0.002219
min,0.0,329.0,0.014033,0.0,0.225243,0.142857,0.668031,0.172414,0.142857,1.079365,0.056048,0.059418,0.0,0.0
25%,18.5,2809.5,0.107167,0.0,0.229126,0.142857,2.151918,0.224138,0.142857,2.142857,0.72264,0.769229,0.0,0.0
50%,37.0,5149.0,0.178151,0.0,0.229126,0.142857,3.0,0.224138,0.142857,3.0,0.838394,0.877488,0.0,0.0
75%,55.5,7581.0,0.25,3.0,0.300971,0.244072,3.0,0.301724,0.25,3.0,0.914889,0.944503,1.0,0.000203
max,74.0,9880.0,0.25,2780.0,0.557282,0.531876,3.0,0.431034,0.367347,3.0,0.998436,0.998192,1.0,0.019284


In [13]:
df.iloc[36]

Unnamed: 0                              36
file              simdata_TS0.3_SS0.5.h5ad
seed                                  1220
best_reg                              0.25
dof                                      0
train_accuracy                    0.229126
train_bal_acc                     0.142857
train_abs_err                          3.0
test_accuracy                     0.224138
test_bal_acc                      0.142857
test_abs_err                           3.0
spearman_corr                          NaN
pearson_corr                           NaN
precision                              0.0
sensitivity                            0.0
Name: 36, dtype: object