# Testing single node datasets

Purpose: until now, each dataset was represented by a combination of tree node types; this notebook tests datasets that consist of a single node type.

## Importing

In [1]:
import sys
import os

# Absolute path two directory levels above the current working directory
# (presumably the repository root that holds the `etnn` package - TODO confirm).
parent_of_parent_dir = os.path.abspath(os.path.join(os.path.pardir, os.path.pardir))

# Put that directory first on the import path. The guard keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if not sys.path or sys.path[0] != parent_of_parent_dir:
    sys.path.insert(0, parent_of_parent_dir)

In [2]:
import optuna
import numpy as np
from sklearn.metrics import r2_score
import torch
from torch.utils.data import random_split, DataLoader

from etnn import TreeNode
from etnn.tools.training_tools import ConfigStore
from etnn.data.ferris_wheel import load_pure_ferris_wheel_dataset_single_node
from etnn.tools.training import train_epoch, eval_epoch
from etnn.nn.layer_framework import LayerManagementFramework
from etnn.tools.training_tools import seeding_all
from etnn.routines.run_config import choice_trainloader, choice_loss, choice_optim

## Control constants

In [3]:
# Path handed to the dataset loader as `dataset_path` inside `objective`.
dataset_path = "../../datasets/"
# Fractions of the dataset assigned to the test and validation splits in
# `objective`; the remainder (1 - test_perc - val_perc) is the training split.
test_perc = 0.3
val_perc = 0.21
# Number of trailing per-epoch validation R2 scores averaged into the objective value.
stability_count = 5
# Stored as `label_type` in the run configuration.
label = "tree-advanced" # although irrelevant

## Defining parameter search

In [4]:
def objective(trial):
    """Optuna objective: train a model on a single-node dataset and score it.

    Besides ``trial``, this function reads several notebook-level names that
    must be defined before a study is run: ``node_type``, ``size_elem`` and
    ``normalized`` (set in the study cells) as well as ``dataset_path``,
    ``test_perc``, ``val_perc``, ``stability_count`` and ``label`` (set in the
    control-constants cell).

    Parameters
    ----------
    trial
        Optuna trial used to sample ``hidden_dim``, ``k``, ``learning_rate``
        and ``batcher`` and to report the validation score after every epoch.

    Returns
    -------
    float
        Mean of the last ``stability_count`` per-epoch validation R2 scores.
    """
    # run configuration; the four `trial.suggest_*` entries are the searched hyper-parameters
    config = ConfigStore(
        in_dim=15,
        hidden_dim=trial.suggest_int("hidden_dim", 16, 512, step=16),
        out_dim=1,
        k=trial.suggest_int("k", 1, 5),
        # -1 marks the normalized variant (see `normalize` in the loader call below)
        dataset=-1 if normalized else 0,
        ds_size=10_000,
        num_gondolas=-1,
        num_part_pg=-1,
        loss_name='mse',
        optimizer_name='adam',
        num_max_epochs=30, # real: 100
        learning_rate=trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True),
        batch_size=1024,
        early_stop_tol=5,
        use_equal_batcher=trial.suggest_categorical("batcher", [True, False]),
        seed=420,
        label_type=label,
        final_label_factor=5/1000
    )

    # loading dataset
    dataset, df_index = load_pure_ferris_wheel_dataset_single_node(
        node_type=node_type,
        num_elem=size_elem,
        num_to_generate=config.ds_size,
        dataset_path=dataset_path,
        final_label_factor=config.final_label_factor,
        # plain boolean instead of the former mixed `True ... else 0`
        normalize=(config.dataset == -1)
    )

    # split into train / validation / test with a seeded generator;
    # the test part is split off but intentionally unused during the search
    generator = torch.Generator().manual_seed(config.seed)
    train_ds, val_ds, _ = random_split(
        dataset,
        [1 - test_perc - val_perc, val_perc, test_perc],
        generator=generator
    )

    # loaders
    train_loader = choice_trainloader(config, df_index, train_ds)
    val_loader = DataLoader(val_ds, batch_size=4 * config.batch_size, shuffle=False)

    # define device
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # set seed for reproducibility
    seeding_all(config.seed)

    # define model: a single node of the requested type holding `size_elem` "E" elements
    model = LayerManagementFramework(
        in_dim=config.in_dim,
        tree=TreeNode(node_type, [TreeNode("E", size_elem)]),
        hidden_dim=config.hidden_dim,
        out_dim=config.out_dim,
        k=config.k
    ).to(device)

    # learning tools
    criterion = choice_loss(config)
    optimizer = choice_optim(config, model)

    # per-epoch validation R2 scores
    score_list = []

    # train for specified number of epochs
    for epoch in range(config.num_max_epochs):
        # the training return values are not needed for the objective
        train_epoch(
            model,
            train_loader,
            optimizer,
            device,
            criterion
        )

        _, val_true_y, val_pred_y = eval_epoch(
            model,
            val_loader,
            device,
            criterion
        )

        # calc r2 score, remember it and report it to optuna
        score = r2_score(y_true=val_true_y, y_pred=val_pred_y)
        score_list.append(score)
        trial.report(score, epoch)

    # objective: mean of the last `stability_count` r2 scores
    # (averaging instead of taking only the last one for stability purposes)
    obj = np.array(score_list)[-stability_count:]
    return np.mean(obj)

## Parameter search

### S

In [10]:
# Search for node type "S" on the non-normalized dataset.
# `objective` reads node_type, size_elem and normalized from the notebook
# namespace, so they have to be assigned before the study is optimized.
node_type, size_elem = "S", 10
normalized = False
n_trials = 1

study = optuna.create_study(
    study_name=f"Study node type {node_type}, normalized: {normalized}",
    directions=['maximize'],
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

[I 2023-12-10 16:22:32,155] A new study created in memory with name: Study node type S, normalized: True


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 16:23:34,763] Trial 0 finished with value: 0.7580406634944465 and parameters: {}. Best is trial 0 with value: 0.7580406634944465.


In [None]:
# Dump every trial of the current `study` to a CSV-formatted file
# (the file name is built from the active settings and has no extension).
trials_df = study.trials_dataframe()
trials_df.to_csv(f"single-node_t-{node_type}_n-{size_elem}_normalized-{normalized}")

In [13]:
# Search for node type "S" on the normalized dataset.
# `objective` reads node_type, size_elem and normalized from the notebook
# namespace, so they have to be assigned before the study is optimized.
node_type, size_elem = "S", 10
normalized = True
n_trials = 1

study = optuna.create_study(
    study_name=f"Study node type {node_type}, normalized: {normalized}",
    directions=['maximize'],
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

[I 2023-12-10 16:38:37,643] A new study created in memory with name: Study node type S, normalized: True


  0%|          | 0/1 [00:00<?, ?it/s]


100%|██████████| 5000/5000 [00:00<00:00, 166665.50it/s]

  0%|          | 0/5000 [00:00<?, ?it/s][A
  0%|          | 10/5000 [00:00<00:50, 98.88it/s][A
  0%|          | 20/5000 [00:00<00:52, 94.76it/s][A
  1%|          | 30/5000 [00:01<03:30, 23.66it/s][A
  2%|▏         | 95/5000 [00:01<00:45, 107.36it/s][A
  3%|▎         | 159/5000 [00:01<00:25, 192.45it/s][A
  4%|▍         | 225/5000 [00:01<00:17, 279.11it/s][A
  6%|▌         | 294/5000 [00:01<00:12, 364.55it/s][A
  7%|▋         | 361/5000 [00:01<00:10, 432.81it/s][A
  9%|▊         | 427/5000 [00:01<00:09, 487.98it/s][A
 10%|▉         | 491/5000 [00:01<00:08, 526.46it/s][A
 11%|█         | 561/5000 [00:01<00:07, 573.03it/s][A
 13%|█▎        | 630/5000 [00:01<00:07, 605.30it/s][A
 14%|█▍        | 698/5000 [00:02<00:06, 626.46it/s][A
 15%|█▌        | 765/5000 [00:02<00:06, 634.06it/s][A
 17%|█▋        | 832/5000 [00:02<00:06, 640.85it/s][A
 18%|█▊        | 900/5000 [00:02<00:06, 648.56it/s][A
 19%|█▉        | 968/5000

[I 2023-12-10 16:39:19,281] Trial 0 finished with value: 0.6851436042564666 and parameters: {}. Best is trial 0 with value: 0.6851436042564666.


In [None]:
# Dump every trial of the current `study` to a CSV-formatted file
# (the file name is built from the active settings and has no extension).
trials_df = study.trials_dataframe()
trials_df.to_csv(f"single-node_t-{node_type}_n-{size_elem}_normalized-{normalized}")

### Q

In [10]:
# Search for node type "Q" on the non-normalized dataset.
# `objective` reads node_type, size_elem and normalized from the notebook
# namespace, so they have to be assigned before the study is optimized.
node_type, size_elem = "Q", 10
normalized = False
n_trials = 1

study = optuna.create_study(
    study_name=f"Study node type {node_type}, normalized: {normalized}",
    directions=['maximize'],
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

[I 2023-12-10 16:22:32,155] A new study created in memory with name: Study node type S, normalized: True


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 16:23:34,763] Trial 0 finished with value: 0.7580406634944465 and parameters: {}. Best is trial 0 with value: 0.7580406634944465.


In [None]:
# Dump every trial of the current `study` to a CSV-formatted file
# (the file name is built from the active settings and has no extension).
trials_df = study.trials_dataframe()
trials_df.to_csv(f"single-node_t-{node_type}_n-{size_elem}_normalized-{normalized}")

In [5]:
# Search for node type "Q" on the normalized dataset.
# `objective` reads node_type, size_elem and normalized from the notebook
# namespace, so they have to be assigned before the study is optimized.
node_type, size_elem = "Q", 10
normalized = True
n_trials = 1

study = optuna.create_study(
    study_name=f"Study node type {node_type}, normalized: {normalized}",
    directions=['maximize'],
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

[I 2023-12-10 16:44:51,891] A new study created in memory with name: Study node type Q, normalized: True


  0%|          | 0/1 [00:00<?, ?it/s]


100%|██████████| 10000/10000 [00:00<00:00, 140849.13it/s]

  0%|          | 0/10000 [00:00<?, ?it/s][A
  0%|          | 12/10000 [00:00<01:31, 109.09it/s][A
  0%|          | 23/10000 [00:00<04:00, 41.45it/s] [A
  1%|          | 59/10000 [00:00<01:25, 115.68it/s][A
  1%|▏         | 127/10000 [00:00<00:38, 255.48it/s][A
  2%|▏         | 195/10000 [00:00<00:26, 366.37it/s][A
  3%|▎         | 264/10000 [00:00<00:21, 454.38it/s][A
  3%|▎         | 333/10000 [00:01<00:18, 517.45it/s][A
  4%|▍         | 402/10000 [00:01<00:16, 566.21it/s][A
  5%|▍         | 470/10000 [00:01<00:15, 599.04it/s][A
  5%|▌         | 537/10000 [00:01<00:15, 617.97it/s][A
  6%|▌         | 604/10000 [00:01<00:14, 630.77it/s][A
  7%|▋         | 672/10000 [00:01<00:14, 643.02it/s][A
  7%|▋         | 739/10000 [00:01<00:14, 650.55it/s][A
  8%|▊         | 806/10000 [00:01<00:14, 656.32it/s][A
  9%|▊         | 873/10000 [00:01<00:13, 656.49it/s][A
  9%|▉         | 941/10000 [00:01<00:13, 659.55it/s][A
 1

[W 2023-12-10 16:45:10,966] Trial 0 failed with parameters: {'hidden_dim': 288, 'k': 5, 'learning_rate': 0.0010602539173998138, 'batcher': False} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\johan\anaconda3\envs\P2\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\johan\AppData\Local\Temp\ipykernel_31488\1852015164.py", line 89, in objective
    _, _, _ = train_epoch(
  File "D:\DATEN\P2_EquivariantTreeNN\etnn\tools\training.py", line 47, in train_epoch
    prediction = model(batch_data).flatten()
  File "C:\Users\johan\anaconda3\envs\P2\lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\johan\anaconda3\envs\P2\lib\site-packages\torch\nn\modules\module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "D:\DATEN\P2_EquivariantTreeNN\etnn\nn\l

KeyboardInterrupt: 

In [None]:
# Dump every trial of the current `study` to a CSV-formatted file
# (the file name is built from the active settings and has no extension).
trials_df = study.trials_dataframe()
trials_df.to_csv(f"single-node_t-{node_type}_n-{size_elem}_normalized-{normalized}")

### C

In [10]:
# Search for node type "C" on the non-normalized dataset.
# `objective` reads node_type, size_elem and normalized from the notebook
# namespace, so they have to be assigned before the study is optimized.
node_type, size_elem = "C", 10
normalized = False
n_trials = 1

study = optuna.create_study(
    study_name=f"Study node type {node_type}, normalized: {normalized}",
    directions=['maximize'],
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

[I 2023-12-10 16:22:32,155] A new study created in memory with name: Study node type S, normalized: True


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2023-12-10 16:23:34,763] Trial 0 finished with value: 0.7580406634944465 and parameters: {}. Best is trial 0 with value: 0.7580406634944465.


In [None]:
# Dump every trial of the current `study` to a CSV-formatted file
# (the file name is built from the active settings and has no extension).
trials_df = study.trials_dataframe()
trials_df.to_csv(f"single-node_t-{node_type}_n-{size_elem}_normalized-{normalized}")

In [6]:
# Search for node type "C" on the normalized dataset.
# `objective` reads node_type, size_elem and normalized from the notebook
# namespace, so they have to be assigned before the study is optimized.
node_type, size_elem = "C", 10
normalized = True
n_trials = 1

study = optuna.create_study(
    study_name=f"Study node type {node_type}, normalized: {normalized}",
    directions=['maximize'],
)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

[I 2023-12-10 16:45:23,908] A new study created in memory with name: Study node type C, normalized: True


  0%|          | 0/1 [00:00<?, ?it/s]


100%|██████████| 10000/10000 [00:00<00:00, 185186.34it/s]

  0%|          | 0/10000 [00:00<?, ?it/s][A
  0%|          | 9/10000 [00:00<02:01, 82.56it/s][A
  0%|          | 21/10000 [00:00<05:59, 27.78it/s][A
  1%|          | 51/10000 [00:00<02:10, 76.16it/s][A
  1%|          | 113/10000 [00:00<00:53, 185.99it/s][A
  2%|▏         | 182/10000 [00:00<00:32, 298.30it/s][A
  2%|▏         | 249/10000 [00:01<00:25, 388.10it/s][A
  3%|▎         | 316/10000 [00:01<00:21, 460.52it/s][A
  4%|▍         | 382/10000 [00:01<00:18, 512.70it/s][A
  4%|▍         | 448/10000 [00:01<00:17, 552.51it/s][A
  5%|▌         | 514/10000 [00:01<00:16, 580.18it/s][A
  6%|▌         | 579/10000 [00:01<00:15, 597.82it/s][A
  6%|▋         | 643/10000 [00:01<00:15, 608.79it/s][A
  7%|▋         | 708/10000 [00:01<00:14, 620.30it/s][A
  8%|▊         | 772/10000 [00:01<00:14, 624.65it/s][A
  8%|▊         | 839/10000 [00:02<00:14, 635.45it/s][A
  9%|▉         | 905/10000 [00:02<00:14, 642.24it/s][A
 10%|▉

[W 2023-12-10 16:45:42,441] Trial 0 failed with parameters: {'hidden_dim': 64, 'k': 5, 'learning_rate': 0.005149163663500805, 'batcher': True} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\johan\anaconda3\envs\P2\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\johan\AppData\Local\Temp\ipykernel_31488\1852015164.py", line 97, in objective
    _, val_true_y, val_pred_y = eval_epoch(
  File "D:\DATEN\P2_EquivariantTreeNN\etnn\tools\training.py", line 106, in eval_epoch
    for batch_data, batch_label in eval_loader:
  File "C:\Users\johan\anaconda3\envs\P2\lib\site-packages\torch\utils\data\dataloader.py", line 630, in __next__
    data = self._next_data()
  File "C:\Users\johan\anaconda3\envs\P2\lib\site-packages\torch\utils\data\dataloader.py", line 674, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\

KeyboardInterrupt: 

In [None]:
# Dump every trial of the current `study` to a CSV-formatted file
# (the file name is built from the active settings and has no extension).
trials_df = study.trials_dataframe()
trials_df.to_csv(f"single-node_t-{node_type}_n-{size_elem}_normalized-{normalized}")