In [1]:
%load_ext autoreload
%autoreload 2

from src.base_classes.omic_data_loader import OmicDataLoader
from src.data_managers.concat import CatOmicDataManager

In [3]:
# One OmicDataLoader per omic modality, each pointed at its own sub-directory
# of mds_data/splits_74.
mrna_loader = OmicDataLoader(data_dir="mds_data/splits_74/mrna")
mirna_loader = OmicDataLoader(data_dir="mds_data/splits_74/mirna_genes")
circrna_loader = OmicDataLoader(data_dir="mds_data/splits_74/circrna")
te_loader = OmicDataLoader(data_dir="mds_data/splits_74/te_counts")

In [4]:
# Sanity check: print the class distribution of the train and test frames
# for each of the five mRNA folds.
for fold_idx in range(5):
    train_df, test_df = mrna_loader.get_fold(fold_idx)

    print("fold: ", fold_idx)
    train_counts = train_df["class"].value_counts()
    test_counts = test_df["class"].value_counts()
    print(train_counts, test_counts)

fold:  0
shape: (2, 2)
┌───────┬───────┐
│ class ┆ count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 0     ┆ 10    │
│ 1     ┆ 49    │
└───────┴───────┘ shape: (2, 2)
┌───────┬───────┐
│ class ┆ count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 0     ┆ 3     │
│ 1     ┆ 12    │
└───────┴───────┘
fold:  1
shape: (2, 2)
┌───────┬───────┐
│ class ┆ count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 0     ┆ 10    │
│ 1     ┆ 49    │
└───────┴───────┘ shape: (2, 2)
┌───────┬───────┐
│ class ┆ count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 0     ┆ 3     │
│ 1     ┆ 12    │
└───────┴───────┘
fold:  2
shape: (2, 2)
┌───────┬───────┐
│ class ┆ count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 0     ┆ 10    │
│ 1     ┆ 49    │
└───────┴───────┘ shape: (2, 2)
┌───────┬───────┐
│ class ┆ count │
│ ---   ┆ ---   │
│ i64   ┆ u32   │
╞═══════╪═══════╡
│ 0     ┆ 3     │
│ 1     ┆ 12    │
└───────┴───────┘
fold:  3
shape: (2, 2)
┌─

In [194]:
# Modalities handed to the data manager, keyed by omic name.
omic_data_loaders = dict(
    mrna=mrna_loader,
    mirna=mirna_loader,
    # Currently disabled modalities:
    # circrna=circrna_loader,
    # pirna=pirna_loader,  # NOTE(review): no pirna_loader is created in the cells shown
    # te=te_loader,
)
odm = CatOmicDataManager(omic_data_loaders=omic_data_loaders, n_splits=5)

In [27]:
# Display the feature dimensionality and the number of classes reported by the data manager.
odm.feature_dim, odm.n_classes

(200, 2)

In [None]:
from src.evals.knn import KNNEvaluator

# KNN baseline: 20 search trials on the shared data manager.
# "k_lb"/"k_ub" are presumably the lower/upper bounds for n_neighbors —
# TODO confirm against KNNEvaluator.
knn_params = {
    "k_lb": 1,
    "k_ub": 20,
}
knn_eval = KNNEvaluator(data_manager=odm, n_trials=20, verbose=True, params=knn_params)

# The return value of evaluate() is not needed here; the summary is printed
# through print_best_results() instead.
_ = knn_eval.evaluate()
knn_eval.print_best_results()

[I 2024-11-10 19:15:05,063] A new study created in memory with name: no-name-487dc805-fc43-418d-8a2d-a156c69f7612
[I 2024-11-10 19:15:05,137] Trial 0 finished with value: 0.7155098095140549 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,204] Trial 1 finished with value: 0.6804361055090542 and parameters: {'n_neighbors': 12}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,288] Trial 2 finished with value: 0.7155098095140549 and parameters: {'n_neighbors': 9}. Best is trial 0 with value: 0.7155098095140549.


New best score: 0.716
Best model performance:
Accuracy: 0.919 ± 0.050
F1 Macro: 0.852 ± 0.097
F1 Weighted: 0.913 ± 0.056


[I 2024-11-10 19:15:05,353] Trial 3 finished with value: 0.7155098095140549 and parameters: {'n_neighbors': 17}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,421] Trial 4 finished with value: 0.7155098095140549 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,489] Trial 5 finished with value: 0.7155098095140549 and parameters: {'n_neighbors': 17}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,552] Trial 6 finished with value: 0.58024233114353 and parameters: {'n_neighbors': 6}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,615] Trial 7 finished with value: 0.7155098095140549 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,682] Trial 8 finished with value: 0.6804361055090542 and parameters: {'n_neighbors': 12}. Best is trial 0 with value: 0.7155098095140549.
[I 2024-11-10 19:15:05,747] Trial 9 f

Best hyperparameters:
{'n_neighbors': 16}
Best model performance:
Accuracy: 0.919 ± 0.050
F1 Macro: 0.852 ± 0.097
F1 Weighted: 0.913 ± 0.056


In [None]:
from src.evals.svm import SVMEvaluator

# Search-space settings forwarded to the SVM evaluator. The "*_lb"/"*_ub" and
# "*_range" keys look like bounds for C and the RFE settings — confirm the
# exact semantics in SVMEvaluator.
svm_params = {
    "C_lb": 0.01,
    "C_ub": 10,
    "rfe_step_range": (0.05, 0.2),
    "rfe_n_features_range": (100, 200),
}
svm_eval = SVMEvaluator(
    data_manager=odm, n_trials=30, verbose=True, params=svm_params, mode="linear"
)

# The return value of evaluate() is not needed here; the summary is printed
# through print_best_results() instead.
_ = svm_eval.evaluate()
svm_eval.print_best_results()

[I 2024-11-10 15:04:57,374] A new study created in memory with name: no-name-17b65a7d-695e-4e4c-9283-d9a803e51944


[I 2024-11-10 15:04:57,465] Trial 0 finished with value: 0.6497992012967035 and parameters: {'C': 0.5912776084859577, 'class_weight': None, 'rfe_step': 0.18135798798684793, 'rfe_n_features': 150}. Best is trial 0 with value: 0.6497992012967035.
[I 2024-11-10 15:04:57,574] Trial 1 finished with value: 0.6497992012967035 and parameters: {'C': 1.92761937700467, 'class_weight': 'balanced', 'rfe_step': 0.12740681956658, 'rfe_n_features': 111}. Best is trial 0 with value: 0.6497992012967035.
[I 2024-11-10 15:04:57,656] Trial 2 finished with value: 0.6477728936997709 and parameters: {'C': 0.04431683340011391, 'class_weight': 'balanced', 'rfe_step': 0.0561795144594292, 'rfe_n_features': 170}. Best is trial 0 with value: 0.6497992012967035.


New best score: 0.650
Best model performance:
Accuracy: 0.892 ± 0.053
F1 Macro: 0.814 ± 0.093
F1 Weighted: 0.895 ± 0.046


[I 2024-11-10 15:04:57,742] Trial 3 finished with value: 0.5947745825537253 and parameters: {'C': 6.239037597620005, 'class_weight': None, 'rfe_step': 0.16137130043179504, 'rfe_n_features': 150}. Best is trial 0 with value: 0.6497992012967035.
[I 2024-11-10 15:04:57,804] Trial 4 finished with value: 0.6497992012967035 and parameters: {'C': 1.8024991990621209, 'class_weight': None, 'rfe_step': 0.16636180097742892, 'rfe_n_features': 200}. Best is trial 0 with value: 0.6497992012967035.
[I 2024-11-10 15:04:57,914] Trial 5 finished with value: 0.5947745825537253 and parameters: {'C': 5.247154408538435, 'class_weight': None, 'rfe_step': 0.06264027199095756, 'rfe_n_features': 156}. Best is trial 0 with value: 0.6497992012967035.
[I 2024-11-10 15:04:58,008] Trial 6 finished with value: 0.5947745825537253 and parameters: {'C': 9.321484902226024, 'class_weight': None, 'rfe_step': 0.16378265009504533, 'rfe_n_features': 154}. Best is trial 0 with value: 0.6497992012967035.
[I 2024-11-10 15:04:58,

New best score: 0.679
Best model performance:
Accuracy: 0.906 ± 0.032
F1 Macro: 0.828 ± 0.068
F1 Weighted: 0.905 ± 0.031


[I 2024-11-10 15:04:58,755] Trial 14 finished with value: 0.6787024344579722 and parameters: {'C': 0.15592096415861068, 'class_weight': 'balanced', 'rfe_step': 0.0936740719697733, 'rfe_n_features': 129}. Best is trial 12 with value: 0.6787024344579722.
[I 2024-11-10 15:04:58,870] Trial 15 finished with value: 0.7331794285989254 and parameters: {'C': 0.10243547401908666, 'class_weight': 'balanced', 'rfe_step': 0.09295207471213159, 'rfe_n_features': 131}. Best is trial 15 with value: 0.7331794285989254.
[I 2024-11-10 15:04:58,960] Trial 16 finished with value: 0.6809510924096367 and parameters: {'C': 0.050080573100068776, 'class_weight': 'balanced', 'rfe_step': 0.1057608612630386, 'rfe_n_features': 120}. Best is trial 15 with value: 0.7331794285989254.
[I 2024-11-10 15:04:59,068] Trial 17 finished with value: 0.6422617001076846 and parameters: {'C': 0.07668997260540834, 'class_weight': 'balanced', 'rfe_step': 0.0832177373698388, 'rfe_n_features': 114}. Best is trial 15 with value: 0.7331

New best score: 0.733
Best model performance:
Accuracy: 0.918 ± 0.030
F1 Macro: 0.867 ± 0.041
F1 Weighted: 0.921 ± 0.024


[I 2024-11-10 15:04:59,147] Trial 18 finished with value: 0.6477728936997709 and parameters: {'C': 0.010268450364919926, 'class_weight': 'balanced', 'rfe_step': 0.11357879132217108, 'rfe_n_features': 139}. Best is trial 15 with value: 0.7331794285989254.
[I 2024-11-10 15:04:59,238] Trial 19 finished with value: 0.6809510924096367 and parameters: {'C': 0.0644825018841236, 'class_weight': 'balanced', 'rfe_step': 0.14146501839591458, 'rfe_n_features': 116}. Best is trial 15 with value: 0.7331794285989254.
[I 2024-11-10 15:04:59,340] Trial 20 finished with value: 0.728115464597245 and parameters: {'C': 0.10675039358903013, 'class_weight': 'balanced', 'rfe_step': 0.07651257359386716, 'rfe_n_features': 139}. Best is trial 15 with value: 0.7331794285989254.
[I 2024-11-10 15:04:59,445] Trial 21 finished with value: 0.728115464597245 and parameters: {'C': 0.1155640352564498, 'class_weight': 'balanced', 'rfe_step': 0.07635708884773021, 'rfe_n_features': 138}. Best is trial 15 with value: 0.73317

Best hyperparameters:
{'C': 0.10243547401908666, 'class_weight': 'balanced', 'rfe_step': 0.09295207471213159, 'rfe_n_features': 131}
Best model performance:
Accuracy: 0.918 ± 0.030
F1 Macro: 0.867 ± 0.041
F1 Weighted: 0.921 ± 0.024


Best model performance:
Accuracy: 0.938 ± 0.058
F1 Macro: 0.684 ± 0.258
F1 Weighted: 0.924 ± 0.064

In [195]:
from src.evals.xgboost import XGBoostEvaluator

# XGBoost baseline: 60 search trials on the shared data manager, using the
# evaluator's default parameters (no `params` override is passed).
xgb_eval = XGBoostEvaluator(data_manager=odm, n_trials=60, verbose=True)

# Return value discarded; the best results are printed in a separate cell.
_ = xgb_eval.evaluate()

[I 2024-11-12 23:57:48,987] A new study created in memory with name: no-name-078ffa6d-2c7a-496a-b139-dae54416a30c
[I 2024-11-12 23:57:49,181] Trial 0 finished with value: 0.5036718571125454 and parameters: {'booster': 'gbtree', 'lambda': 0.2012630144698972, 'alpha': 4.341615091163527e-08, 'max_depth': 3, 'eta': 2.332756805332267e-06, 'gamma': 1.2412516726806618e-06, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.5036718571125454.
[I 2024-11-12 23:57:49,288] Trial 1 finished with value: 0.6219707163863691 and parameters: {'booster': 'gblinear', 'lambda': 8.646955492392989e-07, 'alpha': 1.5506307885311386e-07}. Best is trial 1 with value: 0.6219707163863691.


New best score: 0.504
Best model performance:
Accuracy: 0.825 ± 0.123
F1 Macro: 0.729 ± 0.187
F1 Weighted: 0.838 ± 0.104
New best score: 0.622
Best model performance:
Accuracy: 0.879 ± 0.049
F1 Macro: 0.804 ± 0.088
F1 Weighted: 0.880 ± 0.056


[I 2024-11-12 23:57:49,497] Trial 2 finished with value: 0.5325885489025592 and parameters: {'booster': 'dart', 'lambda': 2.5211337235291564e-07, 'alpha': 0.0007655003241959716, 'max_depth': 3, 'eta': 3.1708961722098076e-07, 'gamma': 0.6434137367112158, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 2.4656097396901352e-08, 'skip_drop': 1.394809407147541e-05}. Best is trial 1 with value: 0.6219707163863691.
[I 2024-11-12 23:57:49,604] Trial 3 finished with value: 0.5652269064579075 and parameters: {'booster': 'gblinear', 'lambda': 9.852933249626684e-05, 'alpha': 0.0011541940286663782}. Best is trial 1 with value: 0.6219707163863691.
[I 2024-11-12 23:57:49,784] Trial 4 finished with value: 0.5844534524426352 and parameters: {'booster': 'gbtree', 'lambda': 6.894009654657362e-05, 'alpha': 0.013098965460206181, 'max_depth': 4, 'eta': 3.221944198197507e-05, 'gamma': 2.1300756208798475e-07, 'grow_policy': 'lossguide'}. Best is trial 1 with value:

New best score: 0.652
Best model performance:
Accuracy: 0.892 ± 0.053
F1 Macro: 0.819 ± 0.098
F1 Weighted: 0.892 ± 0.060


[I 2024-11-12 23:57:51,277] Trial 15 finished with value: 0.5868005745549648 and parameters: {'booster': 'gblinear', 'lambda': 1.9861036182191637e-07, 'alpha': 1.5363778417735748e-08}. Best is trial 13 with value: 0.6522278748969422.
[I 2024-11-12 23:57:51,498] Trial 16 finished with value: 0.5675043225090064 and parameters: {'booster': 'dart', 'lambda': 9.193201467048045e-08, 'alpha': 1.1675521747082847e-08, 'max_depth': 9, 'eta': 0.13753571594079478, 'gamma': 0.0009921462222416356, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.9079710789742155, 'skip_drop': 0.45513656784906026}. Best is trial 13 with value: 0.6522278748969422.
[I 2024-11-12 23:57:51,609] Trial 17 finished with value: 0.6119104221152568 and parameters: {'booster': 'gblinear', 'lambda': 2.706796288224791e-05, 'alpha': 2.5731722360039313e-05}. Best is trial 13 with value: 0.6522278748969422.
[I 2024-11-12 23:57:51,725] Trial 18 finished with value: 0.5575009739708995 a

New best score: 0.676
Best model performance:
Accuracy: 0.906 ± 0.032
F1 Macro: 0.828 ± 0.068
F1 Weighted: 0.901 ± 0.041


[I 2024-11-12 23:57:52,063] Trial 20 finished with value: 0.48162878238196427 and parameters: {'booster': 'dart', 'lambda': 0.0013594253713011379, 'alpha': 1.0427442190553258e-06, 'max_depth': 7, 'eta': 0.008224588783981643, 'gamma': 0.002546554304199964, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 8.656537424825594e-07, 'skip_drop': 1.5164876825299935e-08}. Best is trial 19 with value: 0.6760709742693581.
[I 2024-11-12 23:57:52,175] Trial 21 finished with value: 0.6760709742693581 and parameters: {'booster': 'gblinear', 'lambda': 0.0008033446327792338, 'alpha': 3.3395862525915787e-06}. Best is trial 19 with value: 0.6760709742693581.
[I 2024-11-12 23:57:52,290] Trial 22 finished with value: 0.6760709742693581 and parameters: {'booster': 'gblinear', 'lambda': 0.00096864069113249, 'alpha': 3.8063602619995724e-06}. Best is trial 19 with value: 0.6760709742693581.
[I 2024-11-12 23:57:52,420] Trial 23 finished with value: 0.676070974269358

New best score: 0.717
Best model performance:
Accuracy: 0.920 ± 0.050
F1 Macro: 0.853 ± 0.097
F1 Weighted: 0.914 ± 0.057


[I 2024-11-12 23:57:53,080] Trial 28 finished with value: 0.5591177623513526 and parameters: {'booster': 'dart', 'lambda': 0.8723166811962394, 'alpha': 6.643695005025739e-07, 'max_depth': 7, 'eta': 0.0006642969678843825, 'gamma': 1.141093687755603e-08, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.0003347653951284447, 'skip_drop': 5.2559008104433786e-08}. Best is trial 26 with value: 0.7172503758053912.
[I 2024-11-12 23:57:53,213] Trial 29 finished with value: 0.7172503758053912 and parameters: {'booster': 'gblinear', 'lambda': 0.04065379049808371, 'alpha': 2.641676480443951e-05}. Best is trial 26 with value: 0.7172503758053912.
[I 2024-11-12 23:57:53,349] Trial 30 finished with value: 0.5297617257950828 and parameters: {'booster': 'gbtree', 'lambda': 0.009681052052635123, 'alpha': 2.5948563926624134e-05, 'max_depth': 1, 'eta': 0.609774250373914, 'gamma': 0.004724513014616042, 'grow_policy': 'depthwise'}. Best is trial 26 with value: 0.7

New best score: 0.758
Best model performance:
Accuracy: 0.933 ± 0.060
F1 Macro: 0.877 ± 0.114
F1 Weighted: 0.927 ± 0.067


[I 2024-11-12 23:57:53,807] Trial 34 finished with value: 0.7581635155117249 and parameters: {'booster': 'gblinear', 'lambda': 0.29095253999994614, 'alpha': 5.459898539694326e-08}. Best is trial 32 with value: 0.7581635155117249.
[I 2024-11-12 23:57:53,920] Trial 35 finished with value: 0.7039337563024542 and parameters: {'booster': 'gblinear', 'lambda': 0.5839108883101183, 'alpha': 2.947158047877257e-08}. Best is trial 32 with value: 0.7581635155117249.
[I 2024-11-12 23:57:54,056] Trial 36 finished with value: 0.7581635155117249 and parameters: {'booster': 'gblinear', 'lambda': 0.19041924107375766, 'alpha': 5.650857249421197e-08}. Best is trial 32 with value: 0.7581635155117249.
[I 2024-11-12 23:57:54,251] Trial 37 finished with value: 0.5325885489025592 and parameters: {'booster': 'gbtree', 'lambda': 0.1825115817608083, 'alpha': 5.490275888289266e-08, 'max_depth': 6, 'eta': 2.9138188559070594e-08, 'gamma': 3.665815759431605e-08, 'grow_policy': 'lossguide'}. Best is trial 32 with valu

In [196]:
# Print the summary of the best XGBoost trial found by the search above.
xgb_eval.print_best_results()

Best model performance:
Accuracy: 0.933 ± 0.060
F1 Macro: 0.877 ± 0.114
F1 Weighted: 0.927 ± 0.067


In [40]:
from src.evals.mlp import MLPEvaluator

# Settings forwarded to the MLP evaluator. The best-parameter output of this
# run lists only 'lr' and 'dropout', so the "*_range" entries appear to be the
# searched quantities and the remaining keys fixed values — confirm in
# MLPEvaluator.
mlp_params = {
    "lr_range": [1e-4, 1e-2],
    "l2_lambda": 5e-4,
    "dropout_range": [0.1, 0.5],
    "hidden_channels": 64,
    "proj_dim": 64,
    "batch_size": 32,
    "max_epochs": 50,
}
mlp_eval = MLPEvaluator(data_manager=odm, n_trials=10, verbose=True, params=mlp_params)

# Return value discarded; results are printed in a separate cell.
_ = mlp_eval.evaluate()

[I 2024-11-12 19:38:31,736] A new study created in memory with name: no-name-fb818b66-fdee-40b1-8f69-a558bed153e5
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using:

New best score: 0.581
Best model performance:
Accuracy: 0.892 ± 0.032
F1 Macro: 0.745 ± 0.154
F1 Weighted: 0.874 ± 0.050


`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-pa

New best score: 0.663
Best model performance:
Accuracy: 0.906 ± 0.032
F1 Macro: 0.815 ± 0.062
F1 Weighted: 0.898 ± 0.038


`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-pa

New best score: 0.710
Best model performance:
Accuracy: 0.919 ± 0.026
F1 Macro: 0.846 ± 0.067
F1 Weighted: 0.913 ± 0.038


`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
`Trainer.fit` stopped: `max_epochs=50` reached.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/lubojjan/DiplomaGeneral/.venv/lib/python3.12/site-pa

In [41]:
# Print the metrics of the best MLP trial, followed by the hyperparameters that produced it.
mlp_eval.print_best_results()
mlp_eval.print_best_parameters()

Best model performance:
Accuracy: 0.919 ± 0.026
F1 Macro: 0.846 ± 0.067
F1 Weighted: 0.913 ± 0.038
Best hyperparameters:
{'lr': 0.007039917108469391, 'dropout': 0.44376947156025015}


In [92]:
# All baselines write to the same CSV, one row per model.
# NOTE(review): "disese" in the file name looks like a typo for "disease"; it
# is kept as-is because other code may read this exact path.
# NOTE(review): mlp_eval is not saved here — confirm whether that is intended.
results_file = "logs/mds_disese_eval.csv"
knn_eval.save_results(results_file=results_file, row_name="knn")
svm_eval.save_results(results_file=results_file, row_name="svm")
xgb_eval.save_results(results_file=results_file, row_name="xgb")

In [127]:
# Print the best-result summaries of the KNN, SVM and XGBoost evaluators, in that order.
for evaluator in (knn_eval, svm_eval, xgb_eval):
    evaluator.print_best_results()

Best model performance:
Accuracy: 0.923 ± 0.084
F1 Macro: 0.825 ± 0.208
F1 Weighted: 0.933 ± 0.072
Best model performance:
Accuracy: 0.954 ± 0.038
F1 Macro: 0.688 ± 0.255
F1 Weighted: 0.932 ± 0.056
Best model performance:
Accuracy: 0.954 ± 0.038
F1 Macro: 0.688 ± 0.255
F1 Weighted: 0.932 ± 0.056


In [82]:
# Display the (train, test) frames of fold 0 from the mRNA loader.
mrna_loader.get_fold(0)

(shape: (59, 202)
 ┌────────────┬────────────┬────────────┬───────────┬───┬───────────┬───────────┬───────────┬───────┐
 │ ENSG000001 ┆ ENSG000002 ┆ ENSG000001 ┆ ENSG00000 ┆ … ┆ ENSG00000 ┆ ENSG00000 ┆ sample_id ┆ class │
 │ 81826      ┆ 78588      ┆ 20594      ┆ 121797    ┆   ┆ 130821    ┆ 172159    ┆ s         ┆ ---   │
 │ ---        ┆ ---        ┆ ---        ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ i64   │
 │ f64        ┆ f64        ┆ f64        ┆ f64       ┆   ┆ f64       ┆ f64       ┆ str       ┆       │
 ╞════════════╪════════════╪════════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════╡
 │ 0.190054   ┆ 0.8589     ┆ 0.724728   ┆ 0.796341  ┆ … ┆ 0.235535  ┆ 0.200897  ┆ N54       ┆ 0     │
 │ 0.131677   ┆ 0.788182   ┆ 0.73847    ┆ 0.566229  ┆ … ┆ 0.461213  ┆ 0.061081  ┆ N58       ┆ 0     │
 │ 0.94058    ┆ 0.623225   ┆ 0.735662   ┆ 0.41357   ┆ … ┆ 0.114601  ┆ 0.226098  ┆ N82       ┆ 0     │
 │ 0.805491   ┆ 0.642498   ┆ 0.62087    ┆ 0.43411   ┆ … ┆ 0.0   

In [87]:
# Display split 0 as produced by the data manager `odm`.
odm.get_split(0)

(shape: (59, 800)
 ┌────────────┬────────────┬────────────┬───────────┬───┬──────────┬──────────┬──────────┬──────────┐
 │ ENSG000001 ┆ ENSG000002 ┆ ENSG000001 ┆ ENSG00000 ┆ … ┆ L1M3DE_5 ┆ HERV19I  ┆ LTR40C   ┆ L1MA8    │
 │ 81826      ┆ 78588      ┆ 20594      ┆ 121797    ┆   ┆ ---      ┆ ---      ┆ ---      ┆ ---      │
 │ ---        ┆ ---        ┆ ---        ┆ ---       ┆   ┆ f64      ┆ f64      ┆ f64      ┆ f64      │
 │ f64        ┆ f64        ┆ f64        ┆ f64       ┆   ┆          ┆          ┆          ┆          │
 ╞════════════╪════════════╪════════════╪═══════════╪═══╪══════════╪══════════╪══════════╪══════════╡
 │ 0.190054   ┆ 0.8589     ┆ 0.724728   ┆ 0.796341  ┆ … ┆ 0.770044 ┆ 0.741842 ┆ 0.767158 ┆ 0.87709  │
 │ 0.131677   ┆ 0.788182   ┆ 0.73847    ┆ 0.566229  ┆ … ┆ 0.628628 ┆ 0.630266 ┆ 0.27247  ┆ 0.548661 │
 │ 0.94058    ┆ 0.623225   ┆ 0.735662   ┆ 0.41357   ┆ … ┆ 0.417187 ┆ 0.635949 ┆ 0.342004 ┆ 0.649414 │
 │ 0.805491   ┆ 0.642498   ┆ 0.62087    ┆ 0.43411   ┆ … ┆ 0.7912

In [None]:
# Separate concatenating data manager restricted to the mRNA and miRNA loaders.
catodm = CatOmicDataManager(
    omic_data_loaders=dict(
        mrna=mrna_loader,
        # meth=meth_loader,  # disabled; no meth_loader is created in the cells shown
        mirna=mirna_loader,
    ),
    n_splits=5,
)

# Display split 0 of the mRNA + miRNA manager.
catodm.get_split(0)

(shape: (59, 400)
 ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
 │ ENSG00000 ┆ ENSG00000 ┆ ENSG00000 ┆ ENSG00000 ┆ … ┆ ENSG00000 ┆ ENSG00000 ┆ ENSG00000 ┆ ENSG0000 │
 │ 181826    ┆ 278588    ┆ 120594    ┆ 121797    ┆   ┆ 276404    ┆ 207820    ┆ 252695    ┆ 0263831  │
 │ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
 │ f64       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │
 ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
 │ 0.190054  ┆ 0.8589    ┆ 0.724728  ┆ 0.796341  ┆ … ┆ 0.792052  ┆ 0.724515  ┆ 0.670447  ┆ 0.877857 │
 │ 0.131677  ┆ 0.788182  ┆ 0.73847   ┆ 0.566229  ┆ … ┆ 0.570109  ┆ 0.788338  ┆ 0.25199   ┆ 0.445283 │
 │ 0.94058   ┆ 0.623225  ┆ 0.735662  ┆ 0.41357   ┆ … ┆ 0.39374   ┆ 0.692776  ┆ 0.478255  ┆ 0.630045 │
 │ 0.805491  ┆ 0.642498  ┆ 0.62087   ┆ 0.43411   ┆ … ┆ 0.656948 

In [198]:
from src.evals.mogonet import MOGONETEvaluator
from src.data_managers.sample_graph import SampleGraphDataManager

# Graph data manager built over the mRNA and miRNA loaders; the `params`
# entries configure how the graph is constructed (see SampleGraphDataManager
# for their exact meaning).
sample_graph_dm = SampleGraphDataManager(
    omic_data_loaders={"mrna": mrna_loader, "mirna": mirna_loader},
    n_splits=5,
    params={
        "graph_style": "threshold",
        "self_connections": True,
        "avg_degree": 5,
    },
)

# Fixed model configuration; with n_trials=1 a single trial is run.
mogonet_params = {
    "encoder_hidden_channels": {"mrna": 64, "mirna": 64},
    "encoder_type": "gat",
    "dropout": 0.2,
    "integrator_type": "attention",
    "integration_in_dim": 16,
    "vcdn_hidden_channels": 16,
    "epochs": 100,
    # NOTE(review): log_interval exceeds epochs, presumably to silence
    # per-epoch logging — confirm in MOGONETEvaluator.
    "log_interval": 101,
}

mogonet_eval = MOGONETEvaluator(
    data_manager=sample_graph_dm,
    n_trials=1,
    params=mogonet_params,
)

# Left as the last expression so the returned metrics are displayed.
mogonet_eval.evaluate()

[I 2024-11-12 23:58:27,453] A new study created in memory with name: no-name-88f2a81c-17ae-464a-9309-9250ee9deaed
[I 2024-11-12 23:58:31,864] Trial 0 finished with value: 0.8225991965179064 and parameters: {}. Best is trial 0 with value: 0.8225991965179064.


New best score: 0.823
Best model performance:
Accuracy: 0.947 ± 0.050
F1 Macro: 0.918 ± 0.070
F1 Weighted: 0.947 ± 0.047


{'acc': np.float64(0.9466666666666667),
 'f1_macro': np.float64(0.917909090909091),
 'f1_weighted': np.float64(0.9466545454545454),
 'acc_std': np.float64(0.04988876515698587),
 'f1_macro_std': np.float64(0.06951282657072173),
 'f1_weighted_std': np.float64(0.047331376205998074)}

In [24]:
from src.data_managers.bipartite_graph import BipartiteGraphDataManager
from torch_geometric.transforms import ToUndirected

# Per-omic values passed under "diff_exp_thresholds"; their exact meaning is
# defined by BipartiteGraphDataManager.
diff_exp_thresholds = {"mrna": 1.8, "mirna": 1.8}

bpdm = BipartiteGraphDataManager(
    omic_data_loaders={"mrna": mrna_loader, "mirna": mirna_loader},
    n_splits=5,
    params={"diff_exp_thresholds": diff_exp_thresholds},
)

# get_split returns four values; only the first one is kept and displayed.
data, _, _, _ = bpdm.get_split(0)
data

torch.Size([200]) torch.Size([200])
isolated sample nodes, isolated gene nodes, mean degree: 
tensor(5) tensor(1) tensor(12.9189)
torch.Size([200]) torch.Size([200])
isolated sample nodes, isolated gene nodes, mean degree: 
tensor(4) tensor(0) tensor(12.7027)


HeteroData(
  feature_names=[2],
  omics=[2],
  num_relations=6,
  y=[74],
  train_mask=[74],
  test_mask=[74],
  val_mask=[74],
  mrna={ x=[74, 200] },
  mrna_feature={ x=[200, 200] },
  mirna={ x=[74, 200] },
  mirna_feature={ x=[200, 200] },
  (mrna, diff_exp, mrna_feature)={ edge_index=[2, 956] },
  (mirna, diff_exp, mirna_feature)={ edge_index=[2, 940] },
  (mrna_feature, rev_diff_exp, mrna)={ edge_index=[2, 956] },
  (mirna_feature, rev_diff_exp, mirna)={ edge_index=[2, 940] },
  (mrna_feature, interacts, mrna_feature)={ edge_index=[2, 293] },
  (mirna_feature, regulates, mrna_feature)={ edge_index=[2, 462] }
)

In [39]:
import torch
from src.models.birgat import BiRGAT

# Model configuration for a single forward-pass smoke test of BiRGAT on the
# graph held in `data`.
params = {
    "hidden_channels": [200, 32, 32, 32, 32],
    "heads": 2,
    "dropout": 0.2,
    "attention_dropout": 0.2,
    "use_proj_module": False,
    "integrator_type": "attention",
    "proj_dim": 64,
    "three_layers": False,
}

model = BiRGAT(
    omic_channels=data.omics,
    feature_names=data.feature_names,
    # Relation keys of the graph, in the order edge_index_dict yields them.
    relations=list(data.edge_index_dict),
    # Input width of every node type = second dimension of its feature matrix.
    input_dims={omic: x.shape[1] for omic, x in data.x_dict.items()},
    proj_dim=params["proj_dim"],
    hidden_channels=params["hidden_channels"],
    # Number of distinct labels present in data.y.
    num_classes=len(torch.unique(data.y)),
    heads=params["heads"],
    dropout=params["dropout"],
    attention_dropout=params["attention_dropout"],
    use_proj_module=params["use_proj_module"],
    integrator_type=params["integrator_type"],
    three_layers=params["three_layers"],
)

# Call the module itself instead of `.forward()` so that any registered
# hooks run (assumes BiRGAT is a torch.nn.Module — TODO confirm).
# NOTE(review): the model is left in training mode, so with dropout enabled
# the displayed output is presumably not deterministic between runs.
model(data)

tensor([[-0.1996,  0.0200],
        [ 0.1476, -0.0125],
        [-0.0169, -0.3833],
        [-0.2606, -0.1123],
        [ 0.0988,  0.1072],
        [ 0.2152,  0.1267],
        [ 0.2539,  0.2399],
        [-0.1327,  0.1547],
        [ 0.1204,  0.0057],
        [ 0.0787,  0.0047],
        [-0.1699, -0.0654],
        [-0.1015, -0.0169],
        [-0.1148, -0.1564],
        [-0.2891, -0.1228],
        [-0.1221, -0.1194],
        [-0.2548, -0.1509],
        [-0.1392, -0.1450],
        [-0.0354, -0.2231],
        [-0.2383,  0.0189],
        [-0.0243, -0.2815],
        [-0.1663,  0.0082],
        [ 0.0280, -0.0584],
        [ 0.1171,  0.0122],
        [ 0.0266, -0.0320],
        [-0.1866,  0.0935],
        [-0.1981, -0.1071],
        [-0.1796, -0.3447],
        [-0.0147, -0.2409],
        [ 0.1196, -0.2709],
        [ 0.0811, -0.2482],
        [ 0.1268, -0.0737],
        [-0.0107, -0.0359],
        [-0.1744, -0.2606],
        [-0.2738, -0.0500],
        [-0.0365, -0.1182],
        [ 0.0636, -0

In [302]:
# Display the edge-index tensor of every relation in the graph, keyed by relation tuple.
data.edge_index_dict

{('mrna_feature',
  'interacts',
  'mrna_feature'): tensor([[  0,   1,   2,   2,   3,   4,   4,   4,   5,   6,   7,   8,   9,  10,
           10,  10,  10,  10,  10,  11,  12,  12,  12,  12,  12,  13,  13,  14,
           15,  15,  16,  17,  17,  17,  17,  18,  19,  19,  19,  19,  20,  20,
           21,  21,  21,  21,  21,  21,  21,  22,  23,  23,  23,  24,  25,  26,
           27,  27,  28,  29,  30,  30,  31,  31,  32,  33,  33,  34,  35,  36,
           36,  36,  37,  38,  38,  38,  38,  38,  38,  38,  38,  38,  39,  40,
           40,  40,  40,  40,  40,  40,  40,  41,  41,  42,  43,  43,  43,  43,
           43,  44,  44,  45,  45,  46,  47,  48,  48,  48,  48,  49,  49,  49,
           49,  49,  50,  50,  50,  51,  51,  51,  52,  52,  52,  52,  52,  52,
           53,  53,  53,  53,  53,  53,  53,  53,  54,  55,  56,  57,  58,  59,
           59,  60,  61,  62,  62,  62,  62,  62,  63,  64,  65,  65,  65,  65,
           66,  67,  67,  67,  68,  69,  70,  71,  72,  73,  74,  74,

In [300]:
# Display the edge index of the (mrna_feature, interacts, mrna_feature) relation only.
data["mrna_feature", "interacts", "mrna_feature"].edge_index

tensor([[  0,   1,   2,   2,   3,   4,   4,   4,   5,   6,   7,   8,   9,  10,
          10,  10,  10,  10,  10,  11,  12,  12,  12,  12,  12,  13,  13,  14,
          15,  15,  16,  17,  17,  17,  17,  18,  19,  19,  19,  19,  20,  20,
          21,  21,  21,  21,  21,  21,  21,  22,  23,  23,  23,  24,  25,  26,
          27,  27,  28,  29,  30,  30,  31,  31,  32,  33,  33,  34,  35,  36,
          36,  36,  37,  38,  38,  38,  38,  38,  38,  38,  38,  38,  39,  40,
          40,  40,  40,  40,  40,  40,  40,  41,  41,  42,  43,  43,  43,  43,
          43,  44,  44,  45,  45,  46,  47,  48,  48,  48,  48,  49,  49,  49,
          49,  49,  50,  50,  50,  51,  51,  51,  52,  52,  52,  52,  52,  52,
          53,  53,  53,  53,  53,  53,  53,  53,  54,  55,  56,  57,  58,  59,
          59,  60,  61,  62,  62,  62,  62,  62,  63,  64,  65,  65,  65,  65,
          66,  67,  67,  67,  68,  69,  70,  71,  72,  73,  74,  74,  75,  75,
          75,  75,  75,  76,  76,  76,  76,  77,  77

In [274]:
# Display the node feature tensors of the graph, keyed by node type.
data.x_dict

{'mrna': tensor([[ 0.1901,  0.8589,  0.7247,  ...,  0.4696,  0.2355,  0.2009],
         [ 0.1317,  0.7882,  0.7385,  ...,  0.8404,  0.4612,  0.0611],
         [ 0.9406,  0.6232,  0.7357,  ...,  0.8423,  0.1146,  0.2261],
         ...,
         [ 0.5044,  0.5115,  0.6188,  ...,  0.3666,  0.1762,  0.7017],
         [-0.2887,  0.5810,  0.8522,  ...,  0.3074,  0.1788,  1.1057],
         [-0.0414,  0.5253,  0.5139,  ...,  0.1934,  0.9554,  0.1940]]),
 'mirna': tensor([[ 0.8021,  0.7240,  0.8759,  ...,  0.7245,  0.6704,  0.8779],
         [ 0.6343,  0.6804,  0.6110,  ...,  0.7883,  0.2520,  0.4453],
         [ 0.4353,  0.6257,  0.7742,  ...,  0.6928,  0.4783,  0.6300],
         ...,
         [ 0.6901,  0.5982,  0.8047,  ...,  0.9011,  0.4010,  0.6526],
         [ 0.7930,  0.8460,  0.6499,  ...,  0.8561,  0.7770,  0.6665],
         [ 0.9770, -0.0268,  0.6509,  ...,  0.8369,  0.5275,  0.9227]])}

In [289]:
# polars is imported here because no other visible cell imports it; without
# this the cell fails with a NameError on a fresh kernel.
import polars as pl

# Load the miRNA-mRNA interaction table.
mirna_mrna_interactions_db = pl.read_csv("interaction_data/mirna_mrna_interactions_DB.csv")
mmirnas = mirna_mrna_interactions_db["mirna"].to_list()

# Build a gene-style name from each miRNA identifier: take the 2nd and 3rd
# dash-separated tokens, concatenate them and upper-case the result
# (an identifier shaped like "hsa-miR-21-5p" would become "MIR21").
mirna_gene_names = ["".join(mirna.split("-")[1:3]).upper() for mirna in mmirnas]

# Replace the "mirna" column with the converted names and write only the
# "mirna" and "gene" columns to a new CSV.
mirna_mrna_interactions_db.with_columns(
    pl.Series("mirna", mirna_gene_names)
).select("mirna", "gene").write_csv("interaction_data/mirna_genes_mrna.csv")