# Evaluation of other results

This notebook was written to run on the TU Ilmenau cluster with my specific setup.
Without access to the cluster, it reads the cached JSON summaries instead and can still be used to visualize the effect of changing the 'spectra_dim'
of the models and to investigate whether adding an 'MLP+AddNorm' block to each message passing layer improves performance.

A summary of all results is also stored in the directory `/research/other_data`.

In [None]:
from __future__ import annotations

import os
import re
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorboard.backend.event_processing import event_accumulator

import optimetal.utils as utils
import optimetal.factory as factory
from optimetal.data.loader import load_torch_data
utils.load_plot_style()

def load_tb_scalars(logdir: str) -> dict:
    """
    Collect every scalar time series stored in the tensorboard event files of a run.
    Input:
        logdir:     Directory that is handed to the tensorboard event accumulator
    Output:
        tb_log:     Dict mapping each scalar tag to the list of its logged values,
                    e.g. to inspect training and validation loss curves
    """
    # a size guidance of 0 is documented by tensorboard as "keep all events"
    accumulator = event_accumulator.EventAccumulator(
        logdir,
        size_guidance={event_accumulator.SCALARS: 0},
    )
    accumulator.Reload()
    scalar_tags = accumulator.Tags().get("scalars", [])
    # only the values are kept, steps and wall times of the events are dropped
    return {
        tag: [event.value for event in accumulator.Scalars(tag)]
        for tag in scalar_tags
    }

def _parse_dir_token(pattern: str, study_dir: str) -> str:
    """
    Return the first capture group of 'pattern' inside the directory name 'study_dir'.
    Raises a ValueError naming the directory if the name does not follow the expected scheme.
    """
    match = re.search(pattern, study_dir)
    if match is None:
        raise ValueError(f"Pattern '{pattern}' not found in directory name '{study_dir}'")
    return match.group(1)

def load_results(study_path: str, mode: str) -> dict:
    """
    Load the results of all finished training runs of a study.
    Input:
        study_path:     Path to the root directory containing subdirectories from model training
        mode:           Supported modes are 'spectra_dim' and 'mlp_addnorm'
    Output:
        results:        Nested dict of the form results[outer][inner] = list of run entries.
                        For 'spectra_dim' the keys are the hidden width and the spectra multiplier,
                        for 'mlp_addnorm' they are 'residual'/'no_residual' and the number of
                        message passing layers (all parsed from the directory name as strings).
                        Each run entry holds "seed", "num_parameter", "lr", "val_loss",
                        "eps_loss" and "drude_loss".
    Raises:
        ValueError:     If the mode is unsupported, a directory name cannot be parsed,
                        or a finished run has no 'val/loss' scalars in its tensorboard log
    """

    # fail fast on an unsupported mode instead of after loading the first model
    if mode not in ("spectra_dim", "mlp_addnorm"):
        raise ValueError(f"Unsupported mode: {mode:s}")
    # gather all study directories, sorted so that the order of the entries is reproducible
    study_dirs = sorted(d for d in os.listdir(study_path) if os.path.isdir(os.path.join(study_path, d)))
    # initialize nested structure for results
    results = {}
    # iterate through each study directory and load results
    print(f"Loading scaling law results for {len(study_dirs):d} models")
    for study_dir in study_dirs:
        # path setup and checks
        study_dir_path = os.path.join(study_path, study_dir)
        val_loss_path = os.path.join(study_dir_path, "val_loss.txt")
        best_model_path = os.path.join(study_dir_path, "best_model.pt")
        if not os.path.exists(val_loss_path) or not os.path.exists(best_model_path):
            print(f"Skipping {study_dir_path:s}, probably still running")
            continue
        # parse everything encoded in the directory name before the expensive loading steps
        seed = _parse_dir_token(r"seed(\d+)", study_dir)
        if mode == "spectra_dim":
            outer_key = _parse_dir_token(r"hidden(\d+)", study_dir)
            inner_key = _parse_dir_token(r"spectra_multiplier(\d+\.?\d*)", study_dir)
        else:
            outer_key = "no_residual" if "no_residual" in study_dir else "residual"
            inner_key = _parse_dir_token(r"mp(\d+\.?\d*)", study_dir)
        # get the number of model parameters by rebuilding the model from its stored config
        best_model_dict = load_torch_data(best_model_path)
        config_dict = best_model_dict["config_dict"]
        model_config = config_dict.architecture
        model = factory.create_model(model_config)
        num_parameter = utils.get_model_parameters(model)
        # load the data from the tensorboard log and validation loss file
        best_val_loss = float(np.loadtxt(val_loss_path))
        tb_log = load_tb_scalars(study_dir_path)
        val_loss = tb_log.get("val/loss", [])
        if len(val_loss) == 0:
            raise ValueError(f"No 'val/loss' scalars found in {study_dir_path:s}")
        # the partial losses are reported at the step with the lowest validation loss
        min_idx = np.argmin(val_loss)
        best_eps_loss = tb_log.get("val/eps", [])[min_idx]
        best_drude_loss = tb_log.get("val/drude", [])[min_idx]
        result_entry = {
            "seed": seed,
            "num_parameter": num_parameter,
            "lr": config_dict.optimizer["lr"],
            "val_loss": best_val_loss,
            "eps_loss": best_eps_loss,
            "drude_loss": best_drude_loss,
        }
        # insert the data into the nested structure
        results.setdefault(outer_key, {}).setdefault(inner_key, []).append(result_entry)
    return results

def fmt_param(v: float, decimals_if_needed: int = 1) -> str:
    """
    Parameter count formatter function.
    Input:
        v:                      Parameter count
        decimals_if_needed:     Number of decimals used when a 'k' or 'M' suffix is attached
    Output:
        "---" for non-finite values, the rounded integer for values that stay below 1000,
        otherwise the value in thousands ('k') or millions ('M')
    """
    if not np.isfinite(v):
        return "---"
    v = float(v)
    # pick the unit from the *rounded* value, so that e.g. 999_999 is shown
    # as "1.0M" instead of "1000.0k" and 999.6 as "1.0k" instead of "1000"
    if v >= 1_000_000 or round(v / 1_000.0, decimals_if_needed) >= 1_000.0:
        val, unit = v / 1_000_000.0, "M"
    elif v >= 1_000 or round(v) >= 1_000:
        val, unit = v / 1_000.0, "k"
    else:
        return f"{int(round(v)):d}"
    s = f"{val:.{decimals_if_needed:d}f}"
    return f"{s:s}{unit:s}"

def fmt_loss(m: float, s: float) -> str:
    """
    Format a mean loss 'm' and its standard deviation 's' as a LaTeX math string
    with three decimals each. Returns "---" if either value is not finite.
    """
    both_finite = np.isfinite(m) and np.isfinite(s)
    if not both_finite:
        return "---"
    return rf"${m:.3f} \pm {s:.3f}$"

def power_law_with_floor(x: float, alpha: float, x0: float, l_0: float) -> float:
    """
    Parameter scaling power law with a constant floor: l_0 + (x0 / x) ** alpha.
    Input:
        x:          Value at which the law is evaluated
        alpha:      Exponent of the power law
        x0:         Scale that is divided by 'x'
        l_0:        Constant offset added to the power law term
    """
    ratio = x0 / x
    power_term = ratio ** alpha
    return l_0 + power_term

# Influence of the 'MLP+AddNorm' block on the validation loss

In [None]:
# root directory of the raw MLP+AddNorm training runs
study_path = "/scratch/magr4985/MLP_AddNorm"

# directory holding the cached result summaries
output_dir = "./other_data"
os.makedirs(output_dir, exist_ok=True)

# The raw study directory is only parsed if it is available and no cached JSON summary
# exists yet. In every other case the results are read from the JSON file.
json_path = os.path.join(output_dir, "mlp_addnorm_results.json")
use_cached_json = os.path.exists(json_path) or not os.path.exists(study_path)
if use_cached_json:
    print("Loading results from JSON file")
    with open(json_path, "r") as f:
        results = json.load(f)
else:
    print(f"Study path {study_path:s} exists, loading results from there")
    results = load_results(study_path, mode="mlp_addnorm")
    # cache the freshly parsed results for later runs
    with open(json_path, "w") as f:
        json.dump(results, f, indent=4)

# directory that receives the generated figures and tables
fig_dir = "./other_data/results"
os.makedirs(fig_dir, exist_ok=True)

In [None]:
def _summarize_runs(runs_by_key: dict) -> tuple:
    """
    Reduce the runs of one architecture to arrays sorted by the integer value of the key.
    Input:
        runs_by_key:    Dict mapping a key (number of message passing layers as string) to a list of run entries
    Output:
        x:              Keys converted to floats
        num_parameter:  Parameter count taken from the first run of every key
        y_mean, y_std:  Mean and standard deviation (np.std defaults) of "val_loss" over the runs of every key
    """
    keys_sorted = sorted(runs_by_key, key=int)
    x = np.array([int(k) for k in keys_sorted], dtype=float)
    num_parameter = np.array([runs_by_key[k][0]["num_parameter"] for k in keys_sorted], dtype=float)
    val_losses = [[entry["val_loss"] for entry in runs_by_key[k]] for k in keys_sorted]
    y_mean = np.array([np.mean(v) for v in val_losses], dtype=float)
    y_std = np.array([np.std(v) for v in val_losses], dtype=float)
    return x, num_parameter, y_mean, y_std

# prepare the data for the model trained without residual MLP+AddNorm
x_no_residual, num_parameter_no_residual, y_mean_no_residual, y_std_no_residual = _summarize_runs(results["no_residual"])

# prepare the data for the model trained with residual MLP+AddNorm
x, num_parameter, y_mean, y_std = _summarize_runs(results["residual"])

# both architectures share the table columns and the x-axis, so a mismatch
# would silently attach the losses to the wrong number of layers
if not np.array_equal(x_no_residual, x):
    raise ValueError("The 'residual' and 'no_residual' results cover different numbers of message passing layers")

# make a latex table
col_labels = [f"{int(v)}" for v in x]
params_no_residual = [fmt_param(v) for v in num_parameter_no_residual]
losses_no_residual = [f"{v:.3f}" for v in y_mean_no_residual]
stds_no_residual = [f"{v:.3f}" for v in y_std_no_residual]
params = [fmt_param(v) for v in num_parameter]
losses = [f"{v:.3f}" for v in y_mean]
stds = [f"{v:.3f}" for v in y_std]
row_index = pd.MultiIndex.from_product(
    [["MP-only", r"MP with MLP+\textsc{Add\&Norm}"], [r"$N$", r"$L_\mathrm{val}$", r"$\sigma(L_\mathrm{val})$"]],
    names=["architecture", "metric"],
)
col_index = pd.MultiIndex.from_arrays(
    [[r"$N_\mathrm{MP}$"] * len(col_labels), col_labels]
)
df = pd.DataFrame(
    [params_no_residual, losses_no_residual, stds_no_residual, params, losses, stds],
    index=row_index,
    columns=col_index,
)
latex = df.to_latex(
    escape=False,
    index_names=False,
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
    column_format=r"c@{\hspace{1em}}l@{\hspace{1em}}" + r"c@{\hspace{1em}}"*len(col_labels),
)
# replace the full-width \cline between the architectures, independent of the number of columns
latex = re.sub(r"\\cline\{1-\d+\}", r"\\midrule", latex)
print("MLP+AddNorm Table:")
print(latex)
with open(os.path.join(fig_dir, "mlp_addnorm.txt"), "w") as f:
    f.write(latex)

# plot the data
fig, ax = plt.subplots(figsize=(3, 3))
ax.errorbar(
    x=x,
    y=y_mean_no_residual,
    yerr=y_std_no_residual,
    fmt="o",
    markersize=4,
    markeredgecolor="tab:orange",
    markerfacecolor="tab:orange",
    ecolor="tab:orange",
    capsize=4,
    linestyle="none",
)
ax.plot(x, y_mean_no_residual, "-", color="tab:orange", label="MP-only")
ax.errorbar(
    x=x,
    y=y_mean,
    yerr=y_std,
    fmt="o",
    markersize=4,
    markeredgecolor="tab:blue",
    markerfacecolor="tab:blue",
    ecolor="tab:blue",
    capsize=4,
    linestyle="none",
)
ax.plot(x, y_mean, "-", color="tab:blue", label=r"MP with MLP+\textsc{Add\&Norm}")
ax.set_xticks(list(range(1,9)))
ax.set_ylim([1.09, 1.54])
ax.set_xlabel(r"$N_\mathrm{MP}$")
ax.set_ylabel(r"$L_\mathrm{val}$")
ax.legend(handlelength=1.25)
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "mlp_addnorm.pdf"))

# quantify the relative improvement of the mean validation loss per number of layers
print(f"Mean improvement   = {100 * np.mean(1 - (y_mean / y_mean_no_residual)):.2f}%")
print(f"Median improvement = {100 * np.median(1 - (y_mean / y_mean_no_residual)):.2f}%")

# Influence of the 'spectra_dim' on the validation loss

In [None]:
# root directory of the raw 'spectra_dim' training runs
study_path = "/scratch/magr4985/Spectra_Dim"

# directory holding the cached result summaries
output_dir = "./other_data"
os.makedirs(output_dir, exist_ok=True)

# The raw study directory is only parsed if it is available and no cached JSON summary
# exists yet. In every other case the results are read from the JSON file.
json_path = os.path.join(output_dir, "spectra_dim_results.json")
use_cached_json = os.path.exists(json_path) or not os.path.exists(study_path)
if use_cached_json:
    print("Loading results from JSON file")
    with open(json_path, "r") as f:
        results = json.load(f)
else:
    print(f"Study path {study_path:s} exists, loading results from there")
    results = load_results(study_path, mode="spectra_dim")
    # cache the freshly parsed results for later runs
    with open(json_path, "w") as f:
        json.dump(results, f, indent=4)

# directory that receives the generated figures and tables
fig_dir = "./other_data/results"
os.makedirs(fig_dir, exist_ok=True)

In [None]:
# prepare the data: runs are grouped by spectra multiplier and sorted by its integer value
data = results["256"] # used a fixed hidden dimension
keys_sorted = sorted(data, key=int)
x = np.array([int(k) for k in keys_sorted], dtype=float)
# the parameter count is taken from the first run of every multiplier
num_parameter = np.array([data[k][0]["num_parameter"] for k in keys_sorted], dtype=float)
y_mean = np.array([np.mean([entry["val_loss"] for entry in data[k]]) for k in keys_sorted], dtype=float)
y_std = np.array([np.std([entry["val_loss"]  for entry in data[k]]) for k in keys_sorted], dtype=float)

# make a latex table
col_labels = [f"{int(v)}" for v in x]
params = [fmt_param(v) for v in num_parameter]
losses = [f"{v:.3f}" for v in y_mean]
stds = [f"{v:.3f}" for v in y_std]
df = pd.DataFrame(
    [params, losses, stds],
    index=[r"$N$", r"$L_\mathrm{val}$", r"$\sigma(L_\mathrm{val})$"],
    columns=pd.MultiIndex.from_arrays([[r"$m_\mathrm{spectra}$"]*len(col_labels), col_labels])
)
# escape=False keeps the LaTeX math in the labels intact regardless of the
# pandas default, matching the table of the MLP+AddNorm study
latex = df.to_latex(
    escape=False,
    multicolumn=True,
    multicolumn_format="c",
    column_format=r"l@{\hspace{1em}}" + r"c@{\hspace{1em}}"*len(col_labels),
)
print("Spectra Dimension Table:")
print(latex)
with open(os.path.join(fig_dir, "spectra_dim.txt"), "w") as f:
    f.write(latex)

# parameter scaling law parameters from "scaling_results.ipynb"
alpha = 0.58
x0 = 10**4.33
l_0 = 1.03

# plot the data
fig, ax = plt.subplots(figsize=(3, 3))
# evaluate the scaling law at the parameter count of every model instead of
# connecting only the first and last point, which would draw a straight chord
ax.plot(
    x,
    power_law_with_floor(num_parameter, alpha=alpha, x0=x0, l_0=l_0),
    "-",
    color="tab:orange",
    label=r"$L(N)$",
)
ax.errorbar(
    x=x,
    y=y_mean,
    yerr=y_std,
    fmt="o",
    markersize=4,
    markeredgecolor="tab:blue",
    markerfacecolor="tab:blue",
    ecolor="tab:blue",
    capsize=4,
    linestyle="none",
    label=r"Data"
)
ax.plot(x, y_mean, "-", color="tab:blue")
ax.set_xticks(list(range(1,9)))
# NOTE(review): the table header uses a lowercase "spectra" subscript, confirm which spelling is intended
ax.set_xlabel(r"$m_\mathrm{Spectra}$")
ax.set_ylabel(r"$L_\mathrm{val}$")
ax.legend(handlelength=1.25)
fig.tight_layout()
fig.savefig(os.path.join(fig_dir, "spectra_dim.pdf"))