# Full test of ETNN baseline configs determined by Optuna

## Importing section

In [None]:
import os
import sys

from sklearn.metrics import r2_score
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm

from etnn.nn.baseline import create_baseline_model, calc_params
from etnn.tools.training import train_epoch, eval_epoch

sys.path.insert(0, os.path.pardir)

In [1]:
import pandas as pd
import torch

from etnn.routines.run_config import run_config, choice_optim, choice_loss, choice_trainloader, acquire_config_idx, \
    choice_dataset

import matplotlib.pyplot as plt

from etnn.tools.training_tools import ConfigStore, seeding_all, EpochControl, AccuracyManager

## Definitions section

In [None]:
def run_baseline_custom(
        model,
        config,
        device: str = "cuda" if torch.cuda.is_available() else 'cpu',
        dataset_path: str = "../../datasets"
):
    """
    Train a baseline model on a 70/30 train/test split and record per-epoch metrics.

    The dataset, train loader, loss and optimizer are all selected through the ``choice_*`` helpers
    based on ``config``. Training always runs for ``config.num_max_epochs`` epochs; no early
    stopping is performed in this function.

    :param model: torch module to train; it is moved to ``device`` before the optimizer is built
    :param config: configuration object; this function reads ``seed``, ``batch_size`` and
        ``num_max_epochs`` from it and forwards it to the ``choice_*`` helpers
    :param device: device the model is trained on, defaults to cuda if available, else cpu
    :param dataset_path: path handed to ``choice_dataset``
    :return: dataframe with the columns ``epoch``, ``train_loss``, ``test_loss``, ``train_r2``
        and ``test_r2`` (one row per epoch)
    """
    # share of the dataset that is held out for testing
    holdout_fraction = 0.3

    # dataset selection
    dataset, df_index = choice_dataset(config, dataset_path)

    # seeded split into train and test part
    split_rng = torch.Generator().manual_seed(config.seed)
    train_ds, test_ds = random_split(
        dataset,
        [1 - holdout_fraction, holdout_fraction],
        generator=split_rng
    )

    # loaders: the train loader depends on the config, the test loader is a plain unshuffled one
    train_loader = choice_trainloader(config, df_index, train_ds)
    test_loader = DataLoader(test_ds, batch_size=4 * config.batch_size, shuffle=False)

    # loss function
    criterion = choice_loss(config)

    # seeding happens after the loaders are built and before the optimizer is created
    seeding_all(config.seed)

    # the model has to live on the target device before the optimizer is created for it
    model = model.to(device)
    optimizer = choice_optim(config, model)

    # per-epoch measurements, keyed by the column name they end up in
    history = {
        "train_loss": [],
        "test_loss": [],
        "train_r2": [],
        "test_r2": []
    }

    # train for N epochs, evaluating on the test split after every epoch
    for _ in tqdm(range(config.num_max_epochs)):
        train_mean_loss, train_true_y, train_pred_y = train_epoch(
            model, train_loader, optimizer, device, criterion
        )
        test_mean_loss, test_true_y, test_pred_y = eval_epoch(
            model, test_loader, device, criterion
        )

        history["train_loss"].append(train_mean_loss)
        history["test_loss"].append(test_mean_loss)
        history["train_r2"].append(r2_score(y_true=train_true_y, y_pred=train_pred_y))
        history["test_r2"].append(r2_score(y_true=test_true_y, y_pred=test_pred_y))

    # fuse the measurements into one table (epochs are numbered starting at 1)
    epochs = list(range(1, config.num_max_epochs + 1))
    return pd.DataFrame({"epoch": epochs, **history})

In [None]:
def create_plot(
        df,
        what: str = "r2"
) -> None:
    """
    Plot the per-epoch curves of one measure for the train, test and val modes.

    The frame may contain measurements of several configs (distinguished by a ``config_id``
    column) or of a single run without such a column, e.g. the frame returned by
    ``run_baseline_custom``. Modes for which the frame has no ``{mode}_{what}`` column are
    skipped instead of raising, so frames without validation measurements can be plotted too.

    :param df: dataframe with an ``epoch`` column and ``{mode}_{what}`` columns
    :param what: name of the measure to plot, e.g. ``"r2"`` or ``"loss"``
    :return: None
    """
    plt.figure()

    # build (label prefix, sub frame) pairs: one per config id, or the whole frame if the
    # measurements are not tagged with a config id
    if "config_id" in df.columns:
        groups = [
            (f"{config_id}_", df[df.config_id == config_id])
            for config_id in df.config_id.unique()
        ]
    else:
        groups = [("", df)]

    for prefix, sub_df in groups:
        # plot train, test and val - as far as they are present
        for mode in ['train', 'test', 'val']:
            column = f"{mode}_{what}"
            if column not in sub_df.columns:
                continue

            plt.plot(sub_df.epoch, sub_df[column], label=f"{prefix}{mode}-{what}")

    plt.title(f"{what} plot")
    plt.xlabel("epoch")
    plt.ylabel("score")
    # fixed y range for r2 so that plots of different runs are visually comparable
    if what == "r2":
        plt.ylim(-1, +1)
    plt.legend()
    plt.show()

## Tree advanced label

In [None]:
# define label to show
label = "tree_advanced"

# load config storage (study results, one row per trial)
config_table = pd.read_csv("study_label-tree-advanced_baseline.csv")

# sort config storage so that the rows with the highest 'value' come first
config_table = config_table.sort_values(by=['value'], ascending=False)

# iterate over top 3 best configs (or fewer if the table has less than 3 rows)
# NOTE: the rank index must not share its name with the layer loop below, otherwise the output
# file name is derived from the layer index and files of different ranks overwrite each other
for rank_idx in range(min(3, len(config_table))):
    # get entry from dataframe
    entry = config_table.iloc[rank_idx]

    # print information to console
    print(f"Rank {rank_idx+1} parameter configuration: {entry}")

    # setup config
    config = ConfigStore(
        in_dim=15,
        hidden_dim=int(entry.params_hidden_dim),
        out_dim=1,
        k=int(entry.params_k),
        dataset=0,
        ds_size=10_000,
        num_gondolas=10,
        num_part_pg=5,
        loss_name='mse',
        optimizer_name='adam',
        num_max_epochs=300,
        learning_rate=float(entry.params_learning_rate),
        batch_size=1024,
        early_stop_tol=5,
        use_equal_batcher=bool(entry.params_batcher),
        seed=420,
        label_type=label,
        final_label_factor=1/1000
    )

    # build model: flatten the input and stack linear layers on top of it
    layer_list = [torch.nn.Flatten()]
    features = config.in_dim * config.num_gondolas * config.num_part_pg

    # for each layer create a linear layer and relu (except last one)
    for layer_idx in range(int(entry.params_n_layers)-1):
        # determine new feature dimension
        new_features = int(entry[f"params_n_dim_{layer_idx}"])

        # add layer and relu to list
        layer_list += [torch.nn.Linear(features, new_features), torch.nn.ReLU()]

        # set the new feature to be the current feature
        features = new_features

    # set the last layer - this one must map to the out dimension
    layer_list += [torch.nn.Linear(features, config.out_dim)]
    model = torch.nn.Sequential(*layer_list)

    # run config and retrieve measurements
    df = run_baseline_custom(
        model=model,
        config=config,
    )

    # save measurements (file name carries the zero-based rank index)
    df.to_csv(f"baseline_label-{label}_rank-{rank_idx}.csv")

    # plot results
    create_plot(df, what="r2")
    create_plot(df, what="loss")

## Tree label

In [None]:
# define label to show
label = "tree"

# load config storage (study results, one row per trial)
config_table = pd.read_csv("study_label-tree_baseline.csv")

# sort config storage so that the rows with the highest 'value' come first
config_table = config_table.sort_values(by=['value'], ascending=False)

# iterate over top 3 best configs (or fewer if the table has less than 3 rows)
# NOTE: the rank index must not share its name with the layer loop below, otherwise the output
# file name is derived from the layer index and files of different ranks overwrite each other
for rank_idx in range(min(3, len(config_table))):
    # get entry from dataframe
    entry = config_table.iloc[rank_idx]

    # print information to console
    print(f"Rank {rank_idx+1} parameter configuration: {entry}")

    # setup config
    config = ConfigStore(
        in_dim=15,
        hidden_dim=int(entry.params_hidden_dim),
        out_dim=1,
        k=int(entry.params_k),
        dataset=0,
        ds_size=10_000,
        num_gondolas=10,
        num_part_pg=5,
        loss_name='mse',
        optimizer_name='adam',
        num_max_epochs=300,
        learning_rate=float(entry.params_learning_rate),
        batch_size=1024,
        early_stop_tol=5,
        use_equal_batcher=bool(entry.params_batcher),
        seed=420,
        label_type=label,
        final_label_factor=1/1000
    )

    # build model: flatten the input and stack linear layers on top of it
    layer_list = [torch.nn.Flatten()]
    features = config.in_dim * config.num_gondolas * config.num_part_pg

    # for each layer create a linear layer and relu (except last one)
    for layer_idx in range(int(entry.params_n_layers)-1):
        # determine new feature dimension
        new_features = int(entry[f"params_n_dim_{layer_idx}"])

        # add layer and relu to list
        layer_list += [torch.nn.Linear(features, new_features), torch.nn.ReLU()]

        # set the new feature to be the current feature
        features = new_features

    # set the last layer - this one must map to the out dimension
    layer_list += [torch.nn.Linear(features, config.out_dim)]
    model = torch.nn.Sequential(*layer_list)

    # run config and retrieve measurements
    df = run_baseline_custom(
        model=model,
        config=config,
    )

    # save measurements (file name carries the zero-based rank index)
    df.to_csv(f"baseline_label-{label}_rank-{rank_idx}.csv")

    # plot results
    create_plot(df, what="r2")
    create_plot(df, what="loss")

## Default label

In [None]:
# define label to show
label = "default"

# load config storage (study results, one row per trial)
config_table = pd.read_csv("study_label-default_baseline.csv")

# sort config storage so that the rows with the highest 'value' come first
config_table = config_table.sort_values(by=['value'], ascending=False)

# iterate over top 3 best configs (or fewer if the table has less than 3 rows)
# NOTE: the rank index must not share its name with the layer loop below, otherwise the output
# file name is derived from the layer index and files of different ranks overwrite each other
for rank_idx in range(min(3, len(config_table))):
    # get entry from dataframe
    entry = config_table.iloc[rank_idx]

    # print information to console
    print(f"Rank {rank_idx+1} parameter configuration: {entry}")

    # setup config
    config = ConfigStore(
        in_dim=15,
        hidden_dim=int(entry.params_hidden_dim),
        out_dim=1,
        k=int(entry.params_k),
        dataset=0,
        ds_size=10_000,
        num_gondolas=10,
        num_part_pg=5,
        loss_name='mse',
        optimizer_name='adam',
        num_max_epochs=300,
        learning_rate=float(entry.params_learning_rate),
        batch_size=1024,
        early_stop_tol=5,
        use_equal_batcher=bool(entry.params_batcher),
        seed=420,
        label_type=label,
        final_label_factor=1/1000
    )

    # build model: flatten the input and stack linear layers on top of it
    layer_list = [torch.nn.Flatten()]
    features = config.in_dim * config.num_gondolas * config.num_part_pg

    # for each layer create a linear layer and relu (except last one)
    for layer_idx in range(int(entry.params_n_layers)-1):
        # determine new feature dimension
        new_features = int(entry[f"params_n_dim_{layer_idx}"])

        # add layer and relu to list
        layer_list += [torch.nn.Linear(features, new_features), torch.nn.ReLU()]

        # set the new feature to be the current feature
        features = new_features

    # set the last layer - this one must map to the out dimension
    layer_list += [torch.nn.Linear(features, config.out_dim)]
    model = torch.nn.Sequential(*layer_list)

    # run config and retrieve measurements
    df = run_baseline_custom(
        model=model,
        config=config,
    )

    # save measurements (file name carries the zero-based rank index)
    df.to_csv(f"baseline_label-{label}_rank-{rank_idx}.csv")

    # plot results
    create_plot(df, what="r2")
    create_plot(df, what="loss")