# Load libraries

In [1]:
# load pandas and numpy
import pandas as pd
import numpy as np
import re
import os
import sys
from typing import Sequence, Dict

# Parser function

In [2]:
# function to load final model results: averaged across random seeds / folds
def avg_results(model_names: Sequence[str], 
                model_names_with_covs: Sequence[str] = None,
                time_steps: int = 12)->Sequence[Dict[str, Dict[str, np.array]]]:
    """
    Parse per-model result files into arrays of accuracy and probabilistic-fit metrics.

    Parameters
    ----------
    model_names: Sequence[str]
        paths to the result files of the models without covariates;
        the i-th path fills the i-th row of every returned array
    model_names_with_covs: Sequence[str], optional
        paths to the result files of the models with covariates;
        when None, no 'covs' entries are added to the output
    time_steps: int
        number of leading calibration-error values that are averaged
    NOTE: model_names and model_names_with_covs should be in the same order

    Output
    ------
    Returns the tuple (dict_errors, dict_likelihoods):
    dict_errors:
        key1: 'id' / 'ood', key2: 'no_covs' / 'covs'
        value: array of shape (number of paths, 2);
        column 0 is the square root of the first number on the
        "... median of (MSE, MAE):" line (i.e. RMSE), column 1 is the second number (MAE)
    dict_likelihoods:
        key1: 'id' / 'ood', key2: 'no_covs' / 'covs'
        value: array of shape (number of paths, 2);
        column 0 is the first number on the "... likelihoods:" line,
        column 1 is the mean of the first `time_steps` numbers on the
        "... calibration errors:" line

    Rows of files that do not exist, and cells of metrics that a file does
    not report, are left as NaN.
    """
    def parser(paths):
        # one row per result file, pre-filled with NaN
        arr_id_errors = np.full((len(paths), 2), np.nan)
        arr_ood_errors = arr_id_errors.copy()
        arr_id_likelihoods = arr_id_errors.copy()
        arr_ood_likelihoods = arr_id_errors.copy()
        # enumerate ties every path to its own row; looking the row up with
        # list.index() would send a repeated path to the first matching row only
        for row, path in enumerate(paths):
            if not os.path.isfile(path):
                continue
            with open(path, 'r') as f:
                for line in f:
                    if line.startswith('ID median of (MSE, MAE):'):
                        id_mse_mae = re.findall(r'\d+\.\d+', line)
                        arr_id_errors[row, 0] = np.sqrt(float(id_mse_mae[0]))
                        arr_id_errors[row, 1] = float(id_mse_mae[1])
                    elif line.startswith('OOD median of (MSE, MAE):'):
                        ood_mse_mae = re.findall(r'\d+\.\d+', line)
                        arr_ood_errors[row, 0] = np.sqrt(float(ood_mse_mae[0]))
                        arr_ood_errors[row, 1] = float(ood_mse_mae[1])
                    elif line.startswith('ID likelihoods:'):
                        id_likelihoods = re.findall(r'-?\d+\.\d+', line)
                        arr_id_likelihoods[row, 0] = float(id_likelihoods[0])
                    elif line.startswith('OOD likelihoods:'):
                        ood_likelihoods = re.findall(r'-?\d+\.\d+', line)
                        arr_ood_likelihoods[row, 0] = float(ood_likelihoods[0])
                    elif line.startswith('ID calibration errors:'):
                        id_calib = re.findall(r'-?\d+\.\d+', line)
                        arr_id_likelihoods[row, 1] = np.mean([float(x) for x in id_calib[:time_steps]])
                    elif line.startswith('OOD calibration errors:'):
                        ood_calib = re.findall(r'-?\d+\.\d+', line)
                        arr_ood_likelihoods[row, 1] = np.mean([float(x) for x in ood_calib[:time_steps]])
        return arr_id_errors, arr_ood_errors, arr_id_likelihoods, arr_ood_likelihoods

    results_no_covs = parser(model_names)
    dict_errors = {'id': {'no_covs': results_no_covs[0]},
                   'ood': {'no_covs': results_no_covs[1]}}
    dict_likelihoods = {'id': {'no_covs': results_no_covs[2]},
                        'ood': {'no_covs': results_no_covs[3]}}

    if model_names_with_covs is not None:
        results_covs = parser(model_names_with_covs)
        dict_errors['id']['covs'] = results_covs[0]
        dict_errors['ood']['covs'] = results_covs[1]
        dict_likelihoods['id']['covs'] = results_covs[2]
        dict_likelihoods['ood']['covs'] = results_covs[3]
    
    return dict_errors, dict_likelihoods

# Table 1: accuracy

In [7]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['arima', 'linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

def color(x):
    """
    Format a percent change as a two-column-wide LaTeX table cell.

    Positive values are printed in red with an explicit '+' sign; everything
    else (zero, negative, NaN) is printed in blue. The value is rounded to two
    decimals with round(), so trailing zeros are not padded.
    """
    value = str(round(x, 2))
    # raw strings throughout: '\%' in a normal string is an invalid escape sequence
    if x > 0:
        return r'\multicolumn{2}{c}{\textcolor{red}{+' + value + r'\%}}'
    # the leading space of this branch is kept so the printed table is unchanged
    return r' \multicolumn{2}{c}{\textcolor{blue}{' + value + r'\%}}'

# For every model, print the relative change (in percent) of the errors
# when going from the ID to the OOD set, using the runs without covariates.
for model in models:
    print(model)
    model_names = [f'../output/{model}_{dataset}.txt' for dataset in datasets]
    if model in ['arima', 'gluformer', 'latentode']: # no covariates
        model_names_with_covs = None
    else:
        model_names_with_covs = [f'../output/{model}_covariates_{dataset}.txt' for dataset in datasets]
    dict_errors, _ = avg_results(model_names, model_names_with_covs)

    errors_id = dict_errors['id']['no_covs']
    errors_ood = dict_errors['ood']['no_covs']
    diff_errors_no_covs = (errors_ood - errors_id) / errors_id * 100
    cells = [color(x) for x in diff_errors_no_covs.reshape(-1).tolist()]
    print(' & '.join(cells))

arima
\multicolumn{2}{c}{\textcolor{red}{+11.59\%}} & \multicolumn{2}{c}{\textcolor{red}{+12.01\%}} & \multicolumn{2}{c}{\textcolor{red}{+2.02\%}} & \multicolumn{2}{c}{\textcolor{red}{+1.49\%}} & \multicolumn{2}{c}{\textcolor{red}{+38.56\%}} & \multicolumn{2}{c}{\textcolor{red}{+31.81\%}} &  \multicolumn{2}{c}{\textcolor{blue}{-4.79\%}} &  \multicolumn{2}{c}{\textcolor{blue}{-5.08\%}} & \multicolumn{2}{c}{\textcolor{red}{+18.47\%}} & \multicolumn{2}{c}{\textcolor{red}{+18.56\%}}
linreg
\multicolumn{2}{c}{\textcolor{red}{+2.51\%}} & \multicolumn{2}{c}{\textcolor{red}{+1.29\%}} & \multicolumn{2}{c}{\textcolor{red}{+1.27\%}} & \multicolumn{2}{c}{\textcolor{red}{+1.38\%}} & \multicolumn{2}{c}{\textcolor{red}{+30.01\%}} & \multicolumn{2}{c}{\textcolor{red}{+19.41\%}} & \multicolumn{2}{c}{\textcolor{red}{+6.46\%}} & \multicolumn{2}{c}{\textcolor{red}{+4.58\%}} & \multicolumn{2}{c}{\textcolor{red}{+14.5\%}} & \multicolumn{2}{c}{\textcolor{red}{+14.22\%}}
xgboost
 \multicolumn{2}{c}{\textcolor

In [15]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['arima', 'linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

def color(x):
    """
    Format a percent change as a colored LaTeX snippet (single table cell).

    Positive values are printed in red with an explicit '+' sign; everything
    else (zero, negative, NaN) is printed in blue. The value is rounded to two
    decimals with round(), so trailing zeros are not padded.
    """
    value = str(round(x, 2))
    # raw strings throughout: '\%' in a normal string is an invalid escape sequence
    if x > 0:
        return r'\textcolor{red}{+' + value + r'\%}'
    return r'\textcolor{blue}{' + value + r'\%}'

# For every model that supports covariates, print the relative change (in percent)
# of the ID errors when covariates are added.
for model in models:
    if model in ['arima', 'gluformer', 'latentode']: # no covariates
        continue
    model_names = [f'../output/{model}_{dataset}.txt' for dataset in datasets]
    model_names_with_covs = [f'../output/{model}_covariates_{dataset}.txt' for dataset in datasets]
    dict_errors, _ = avg_results(model_names, model_names_with_covs)

    print(model)
    errors_plain = dict_errors['id']['no_covs']
    errors_covs = dict_errors['id']['covs']
    diff_errors = ((errors_covs - errors_plain) / errors_plain * 100).tolist()
    # flatten row by row: one cell per (dataset, metric)
    cells = []
    for row in diff_errors:
        for value in row:
            cells.append(color(value))
    print(' & '.join(cells))

linreg
\textcolor{blue}{-14.82\%} & \textcolor{blue}{-13.34\%} & \textcolor{red}{+5.54\%} & \textcolor{red}{+5.75\%} & \textcolor{red}{+2.84\%} & \textcolor{red}{+0.61\%} & \textcolor{red}{+6.17\%} & \textcolor{red}{+5.09\%} & \textcolor{blue}{-1.54\%} & \textcolor{blue}{-1.08\%}
xgboost
\textcolor{red}{+8.53\%} & \textcolor{red}{+3.22\%} & \textcolor{blue}{-0.72\%} & \textcolor{blue}{-0.5\%} & \textcolor{blue}{-1.36\%} & \textcolor{blue}{-2.8\%} & \textcolor{red}{+6.22\%} & \textcolor{red}{+7.11\%} & \textcolor{red}{+0.99\%} & \textcolor{red}{+1.36\%}
nhits
\textcolor{red}{+17.43\%} & \textcolor{red}{+21.29\%} & \textcolor{red}{+53.14\%} & \textcolor{red}{+59.48\%} & \textcolor{red}{+74.38\%} & \textcolor{red}{+89.1\%} & \textcolor{red}{+6.21\%} & \textcolor{red}{+8.05\%} & \textcolor{red}{+0.88\%} & \textcolor{red}{+0.93\%}
tft
\textcolor{red}{+6.86\%} & \textcolor{red}{+12.21\%} & \textcolor{red}{+15.86\%} & \textcolor{red}{+15.93\%} & \textcolor{red}{+0.35\%} & \textcolor{red}{+0.0

In [18]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['arima', 'linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

def color(x):
    """
    Format a percent change as a two-column-wide LaTeX table cell.

    Positive values are printed in red with an explicit '+' sign; everything
    else (zero, negative, NaN) is printed in blue. The value is rounded to two
    decimals with round(), so trailing zeros are not padded.
    """
    value = str(round(x, 2))
    # raw strings throughout: '\%' in a normal string is an invalid escape sequence
    if x > 0:
        return r'\multicolumn{2}{c}{\textcolor{red}{+' + value + r'\%}}'
    # the leading space of this branch is kept so the printed table is unchanged
    return r' \multicolumn{2}{c}{\textcolor{blue}{' + value + r'\%}}'

# Print the LaTeX rows of the accuracy table, one group of rows per model.
# All LaTeX snippets are raw strings: sequences such as '\c' or '\m' in a normal
# string are invalid escape sequences.

_CMIDRULES = r'\cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}'
_DELTA_HEADER = r'\multicolumn{3}{c|}{$\min \Delta$(ID, OD)\%}&'

def _print_values(label, values):
    """Print `label` followed by all entries of `values` (flattened row by row) with two decimals."""
    print(label + '&'.join([f'{x:.2f}' for x in values.reshape(-1).tolist()]))

def _mean_pct_change(new, base):
    """Percent change of `new` relative to `base`, averaged over axis 1 (the two error columns)."""
    return np.mean((new - base) / base * 100, axis=1)

def _print_colored(changes):
    """Print one colored cell per entry of `changes`."""
    print(' & '.join([color(x) for x in changes.reshape(-1).tolist()]))

for model in models:
    model_names = [f'../output/{model}_{dataset}.txt' for dataset in datasets]

    if model in ['arima', 'gluformer', 'latentode']: # no covariates
        model_names_with_covs = None
        dict_errors, _ = avg_results(model_names, model_names_with_covs)
        id_plain = dict_errors['id']['no_covs']
        ood_plain = dict_errors['ood']['no_covs']

        print(r'\multirow{2}{*}{\rotatebox{90}{' + model[:3].upper() + r'}} & \crossmark & ')
        _print_values(r'ID & ', id_plain)
        print(r'\\')

        print(r'& \crossmark & ')
        _print_values(r'OOD & ', ood_plain)
        print(r'\\')
        print(r'\midrule')
        print(r'\rowcolor{lightgray}')

        print(_DELTA_HEADER)
        _print_colored(_mean_pct_change(ood_plain, id_plain))
        print(r'\\')
        print(r'\midrule')

    else:
        model_names_with_covs = [f'../output/{model}_covariates_{dataset}.txt' for dataset in datasets]
        dict_errors, _ = avg_results(model_names, model_names_with_covs)
        id_plain = dict_errors['id']['no_covs']
        id_covs = dict_errors['id']['covs']
        ood_plain = dict_errors['ood']['no_covs']
        ood_covs = dict_errors['ood']['covs']

        # ID rows: without / with covariates, then the change caused by covariates
        print(r'\multirow{6}{*}{\rotatebox{90}{' + model[:3].upper() + r'}} & \crossmark & ')
        _print_values(r'ID & ', id_plain)
        print(r'\\')

        print(r'& \checkmark & ')
        _print_values(r'ID & ', id_covs)
        print(r'\\')
        print(_CMIDRULES)

        print(r'& \multicolumn{2}{c|}{Improv.} &')
        _print_colored(_mean_pct_change(id_covs, id_plain))
        print(r'\\')
        print(_CMIDRULES)

        # OOD rows: same layout (this branch labels them 'OD', as before)
        print(r'& \crossmark &')
        _print_values(r'OD & ', ood_plain)
        print(r'\\')

        print(r'& \checkmark & ')
        _print_values(r'OD & ', ood_covs)
        print(r'\\')
        print(_CMIDRULES)

        print(r'& \multicolumn{2}{c|}{Improv.} &')
        _print_colored(_mean_pct_change(ood_covs, ood_plain))
        print(r'\\')
        print(r'\midrule')
        print(r'\rowcolor{lightgray}')

        # summary row: the smaller of the two ID -> OOD changes (without / with covariates)
        print(_DELTA_HEADER)
        diff_errors_no_covs = _mean_pct_change(ood_plain, id_plain)
        diff_errors_covs = _mean_pct_change(ood_covs, id_covs)
        _print_colored(np.minimum(diff_errors_no_covs, diff_errors_covs))
        print(r'\\')
        print(r'\midrule')

\multirow{2}{*}{\rotatebox{90}{ARI}} & \crossmark & 
ID & 10.53&8.67&5.80&4.80&13.53&11.06&8.63&7.34&13.40&11.25
\\
& \crossmark & 
OOD & 11.75&9.71&5.91&4.87&18.75&14.58&8.22&6.97&15.87&13.34
\\
\midrule
\rowcolor{lightgray}
\multicolumn{3}{c|}{$\min \Delta$(ID, OD)\%}&
\multicolumn{2}{c}{\textcolor{red}{+11.8\%}} & \multicolumn{2}{c}{\textcolor{red}{+1.75\%}} & \multicolumn{2}{c}{\textcolor{red}{+35.18\%}} &  \multicolumn{2}{c}{\textcolor{blue}{-4.94\%}} & \multicolumn{2}{c}{\textcolor{red}{+18.51\%}}
\\
\midrule
\multirow{6}{*}{\rotatebox{90}{LIN}} & \crossmark & 
ID & 11.68&9.71&5.26&4.35&12.07&9.97&7.38&6.33&13.60&11.46
\\
& \checkmark & 
ID & 9.95&8.41&5.56&4.60&12.41&10.03&7.84&6.66&13.39&11.34
\\
\cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}
& \multicolumn{2}{c|}{Improv.} &
 \multicolumn{2}{c}{\textcolor{blue}{-14.08\%}} & \multicolumn{2}{c}{\textcolor{red}{+5.65\%}} & \multicolumn{2}{c}{\textcolor{red}{+1.73\%}} & \multicolu

# Table 2: probabilistic fit

In [8]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

# raw strings throughout: '\%' in a normal string is an invalid escape sequence

def color_min(x):
    """Format a percent change where an increase is colored red (with '+') and anything else blue."""
    value = str(round(x, 2))
    if x > 0:
        return r'\textcolor{red}{+' + value + r'\%}'
    return r'\textcolor{blue}{' + value + r'\%}'

def color_max(x):
    """Format a percent change where an increase is colored blue (with '+') and anything else red."""
    value = str(round(x, 2))
    if x > 0:
        return r'\textcolor{blue}{+' + value + r'\%}'
    return r'\textcolor{red}{' + value + r'\%}'

# For every model, print the relative change (in percent) of likelihood and
# calibration error when going from the ID to the OOD set (runs without covariates).
for model in models:
    model_names = [f'../output/{model}_{dataset}.txt' for dataset in datasets]
    if model in ['arima', 'gluformer', 'latentode']: # no covariates
        model_names_with_covs = None
    else:
        model_names_with_covs = [f'../output/{model}_covariates_{dataset}.txt' for dataset in datasets]
    _, dict_errors = avg_results(model_names, model_names_with_covs)

    print(model)
    fit_id = dict_errors['id']['no_covs']
    fit_ood = dict_errors['ood']['no_covs']
    diff_errors_no_covs = (fit_ood - fit_id) / np.abs(fit_id) * 100
    # column 0 (likelihood) is colored with color_max, column 1 (calibration) with color_min
    cells = []
    for likelihood_change, calibration_change in diff_errors_no_covs.tolist():
        cells.append(color_max(likelihood_change))
        cells.append(color_min(calibration_change))
    print(' & '.join(cells))

linreg
\textcolor{red}{-0.65\%} & \textcolor{red}{+24.98\%} & \textcolor{blue}{+0.33\%} & \textcolor{red}{+2.93\%} & \textcolor{red}{-0.02\%} & \textcolor{blue}{-7.62\%} & \textcolor{blue}{+0.33\%} & \textcolor{red}{+4.43\%} & \textcolor{red}{-0.85\%} & \textcolor{blue}{-3.1\%}
xgboost
\textcolor{red}{-0.88\%} & \textcolor{red}{+67.42\%} & \textcolor{blue}{+0.59\%} & \textcolor{blue}{-4.55\%} & \textcolor{blue}{+3.16\%} & \textcolor{red}{+5.02\%} & \textcolor{blue}{+1.16\%} & \textcolor{blue}{-14.37\%} & \textcolor{red}{-0.83\%} & \textcolor{red}{+2.45\%}
gluformer
\textcolor{blue}{+6.72\%} & \textcolor{red}{+106.76\%} & \textcolor{red}{-50.33\%} & \textcolor{blue}{-29.83\%} & \textcolor{blue}{+45.64\%} & \textcolor{red}{+83.63\%} & \textcolor{blue}{+7.69\%} & \textcolor{red}{+9.24\%} & \textcolor{blue}{+3.33\%} & \textcolor{red}{+6.23\%}
latentode
\textcolor{red}{-13.67\%} & \textcolor{red}{+6.5\%} & \textcolor{blue}{+15.89\%} & \textcolor{blue}{-4.01\%} & \textcolor{blue}{+42.14\%} &

  diff_errors_no_covs = (dict_errors['ood']['no_covs'] - dict_errors['id']['no_covs']) / np.abs(dict_errors['id']['no_covs']) * 100


In [9]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

# raw strings throughout: '\%' in a normal string is an invalid escape sequence

def color_min(x):
    """Format a percent change where an increase is colored red (with '+') and anything else blue."""
    value = str(round(x, 2))
    if x > 0:
        return r'\textcolor{red}{+' + value + r'\%}'
    return r'\textcolor{blue}{' + value + r'\%}'

def color_max(x):
    """Format a percent change where an increase is colored blue (with '+') and anything else red."""
    value = str(round(x, 2))
    if x > 0:
        return r'\textcolor{blue}{+' + value + r'\%}'
    return r'\textcolor{red}{' + value + r'\%}'

# For every model that supports covariates, print the relative change (in percent)
# of the ID likelihood and calibration error when covariates are added.
for model in models:
    if model in ['arima', 'gluformer', 'latentode']: # no covariates
        continue
    model_names = [f'../output/{model}_{dataset}.txt' for dataset in datasets]
    model_names_with_covs = [f'../output/{model}_covariates_{dataset}.txt' for dataset in datasets]
    _, dict_errors = avg_results(model_names, model_names_with_covs)

    print(model)
    fit_plain = dict_errors['id']['no_covs']
    fit_covs = dict_errors['id']['covs']
    diff_errors = (fit_covs - fit_plain) / np.abs(fit_plain) * 100
    # column 0 (likelihood) is colored with color_max, column 1 (calibration) with color_min
    cells = []
    for likelihood_change, calibration_change in diff_errors.tolist():
        cells.append(color_max(likelihood_change))
        cells.append(color_min(calibration_change))
    print(' & '.join(cells))

linreg
\textcolor{blue}{+0.15\%} & \textcolor{red}{+5.99\%} & \textcolor{blue}{+0.25\%} & \textcolor{red}{+24.47\%} & \textcolor{red}{-0.41\%} & \textcolor{red}{+11.39\%} & \textcolor{red}{-7.67\%} & \textcolor{red}{+97.36\%} & \textcolor{blue}{+0.13\%} & \textcolor{blue}{-1.67\%}
xgboost
\textcolor{red}{-1.22\%} & \textcolor{red}{+0.59\%} & \textcolor{blue}{+0.12\%} & \textcolor{blue}{-7.2\%} & \textcolor{blue}{+0.13\%} & \textcolor{blue}{-6.98\%} & \textcolor{red}{-0.31\%} & \textcolor{blue}{-1.3\%} & \textcolor{red}{-0.15\%} & \textcolor{blue}{-4.62\%}
nhits
\textcolor{red}{-3.63\%} & \textcolor{blue}{-37.79\%} & \textcolor{red}{-1.68\%} & \textcolor{red}{+91.12\%} & \textcolor{red}{-4.19\%} & \textcolor{blue}{-21.68\%} & \textcolor{red}{-0.07\%} & \textcolor{blue}{-24.77\%} & \textcolor{red}{-0.01\%} & \textcolor{blue}{-5.46\%}
tft
\textcolor{red}{nan\%} & \textcolor{red}{+94.6\%} & \textcolor{red}{nan\%} & \textcolor{red}{+114.61\%} & \textcolor{red}{nan\%} & \textcolor{red}{+7.57

  diff_errors = (dict_errors['id']['covs'] - dict_errors['id']['no_covs']) / np.abs(dict_errors['id']['no_covs']) * 100


In [4]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

# raw strings throughout: '\%' in a normal string is an invalid escape sequence

def color_min(x):
    """Format a percent change where an increase is colored red (with '+') and anything else blue."""
    value = str(round(x, 2))
    if x > 0:
        return r'\textcolor{red}{+' + value + r'\%}'
    return r'\textcolor{blue}{' + value + r'\%}'

def color_max(x):
    """Format a percent change where an increase is colored blue (with '+') and anything else red."""
    value = str(round(x, 2))
    if x > 0:
        return r'\textcolor{blue}{+' + value + r'\%}'
    return r'\textcolor{red}{' + value + r'\%}'

# Print the LaTeX rows of the probabilistic-fit table, one group of rows per model.
# All LaTeX snippets are raw strings: sequences such as '\c' or '\m' in a normal
# string are invalid escape sequences.

_CMIDRULES = r'\cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}'
_DELTA_HEADER = r'\multicolumn{3}{c|}{$\min \Delta$(ID, OD)\%}&'

def _print_values(label, values):
    """Print `label` followed by all entries of `values` (flattened row by row) with two decimals."""
    print(label + '&'.join([f'{x:.2f}' for x in values.reshape(-1).tolist()]))

def _pct_change_abs(new, base):
    """Percent change of `new` relative to |base| (the base values can be negative)."""
    return (new - base) / np.abs(base) * 100

def _print_fit_changes(changes):
    """Print each row of `changes` as two cells: column 0 via color_max, column 1 via color_min."""
    cells = []
    for x in changes.tolist():
        cells.append(color_max(x[0]))
        cells.append(color_min(x[1]))
    print(' & '.join(cells))

for model in models:
    model_names = [f'../output/{model}_{dataset}.txt' for dataset in datasets]

    if model in ['arima', 'gluformer', 'latentode']: # no covariates
        model_names_with_covs = None
        _, dict_errors = avg_results(model_names, model_names_with_covs)
        id_plain = dict_errors['id']['no_covs']
        ood_plain = dict_errors['ood']['no_covs']

        print(r'\multirow{2}{*}{\rotatebox{90}{' + model[:3].upper() + r'}} & \crossmark & ')
        _print_values(r'ID & ', id_plain)
        print(r'\\')

        print(r'& \crossmark & ')
        _print_values(r'OOD & ', ood_plain)
        print(r'\\')
        print(r'\midrule')
        print(r'\rowcolor{lightgray}')

        print(_DELTA_HEADER)
        _print_fit_changes(_pct_change_abs(ood_plain, id_plain))
        print(r'\\')
        print(r'\midrule')

    else:
        model_names_with_covs = [f'../output/{model}_covariates_{dataset}.txt' for dataset in datasets]
        _, dict_errors = avg_results(model_names, model_names_with_covs)
        id_plain = dict_errors['id']['no_covs']
        id_covs = dict_errors['id']['covs']
        ood_plain = dict_errors['ood']['no_covs']
        ood_covs = dict_errors['ood']['covs']

        # ID rows: without / with covariates, then the change caused by covariates
        print(r'\multirow{6}{*}{\rotatebox{90}{' + model[:3].upper() + r'}} & \crossmark & ')
        _print_values(r'ID & ', id_plain)
        print(r'\\')

        print(r'& \checkmark & ')
        _print_values(r'ID & ', id_covs)
        print(r'\\')
        print(_CMIDRULES)

        print(r'& \multicolumn{2}{c|}{Improv.} &')
        _print_fit_changes(_pct_change_abs(id_covs, id_plain))
        print(r'\\')
        print(_CMIDRULES)

        # OOD rows: same layout (this branch labels them 'OD', as before)
        print(r'& \crossmark &')
        _print_values(r'OD & ', ood_plain)
        print(r'\\')

        print(r'& \checkmark & ')
        _print_values(r'OD & ', ood_covs)
        print(r'\\')
        print(_CMIDRULES)

        print(r'& \multicolumn{2}{c|}{Improv.} &')
        _print_fit_changes(_pct_change_abs(ood_covs, ood_plain))
        print(r'\\')
        print(r'\midrule')
        print(r'\rowcolor{lightgray}')

        # summary row: per dataset, the larger likelihood change (column 0) and the
        # smaller calibration change (column 1) of the runs without / with covariates
        print(_DELTA_HEADER)
        diff_errors_no_covs = _pct_change_abs(ood_plain, id_plain)
        diff_errors_covs = _pct_change_abs(ood_covs, id_covs)
        diff_errors = np.empty_like(diff_errors_no_covs)
        diff_errors[:, 0] = np.maximum(diff_errors_no_covs[:, 0], diff_errors_covs[:, 0])
        diff_errors[:, 1] = np.minimum(diff_errors_no_covs[:, 1], diff_errors_covs[:, 1])
        _print_fit_changes(diff_errors)
        print(r'\\')
        print(r'\midrule')

\multirow{6}{*}{\rotatebox{90}{LIN}} & \crossmark & 
ID & -9.89&0.12&-9.19&0.15&-10.10&0.18&-9.56&0.10&-10.14&0.11
\\
& \checkmark & 
ID & -9.87&0.13&-9.17&0.19&-10.15&0.21&-10.30&0.19&-10.12&0.11
\\
\cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}
& \multicolumn{2}{c|}{Improv.} &
\textcolor{blue}{+0.15\%} & \textcolor{red}{+5.99\%} & \textcolor{blue}{+0.25\%} & \textcolor{red}{+24.47\%} & \textcolor{red}{-0.41\%} & \textcolor{red}{+11.39\%} & \textcolor{red}{-7.67\%} & \textcolor{red}{+97.36\%} & \textcolor{blue}{+0.13\%} & \textcolor{blue}{-1.67\%}
\\
\cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}
& \crossmark &
OD & -9.95&0.15&-9.16&0.15&-10.11&0.17&-9.53&0.10&-10.22&0.11
\\
& \checkmark & 
OD & -10.24&0.55&-9.16&0.17&-12.08&0.48&-10.42&0.23&-11.13&0.21
\\
\cmidrule(lr){4-5} \cmidrule(lr){6-7} \cmidrule(lr){8-9} \cmidrule(lr){10-11} \cmidrule(lr){12-13}
& \multicolumn{2}{c|}{Improv.

  diff_errors = (dict_errors['id']['covs'] - dict_errors['id']['no_covs']) / np.abs(dict_errors['id']['no_covs']) * 100
  diff_errors = (dict_errors['ood']['covs'] - dict_errors['ood']['no_covs']) / np.abs(dict_errors['ood']['no_covs']) * 100
  diff_errors_no_covs = (dict_errors['ood']['no_covs'] - dict_errors['id']['no_covs']) / np.abs(dict_errors['id']['no_covs']) * 100
  diff_errors_covs = (dict_errors['ood']['covs'] - dict_errors['id']['covs']) / np.abs(dict_errors['id']['covs']) * 100


# Table 1: short version

In [3]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['arima', 'linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

# One (dataset x metric) array of ID errors per model, runs without covariates only
model_errors = [
    avg_results([f'../output/{model}_{dataset}.txt' for dataset in datasets], None)[0]['id']['no_covs']
    for model in models
]

In [4]:
# Average the two error columns per (model, dataset). The result gets its own name:
# rebinding `model_errors` would make a second run of this cell average the
# already-averaged array and then fail on the 2-D indexing below.
mean_errors = np.mean(np.array(model_errors), axis=-1)
# find best models with lowest error (argsort along the model axis)
best_models = np.argsort(mean_errors, axis=0)
best_models_set = []
for i, dataset in enumerate(datasets):
    best_model = models[best_models[0, i]]
    print(f'{dataset}: {best_model}')
    best_models_set.append(best_model)
print(set(best_models_set))

iglu: arima
colas: linreg
dubosson: linreg
hall: latentode
weinstock: transformer
{'latentode', 'transformer', 'arima', 'linreg'}


# Table 2: short version

In [17]:
datasets = ['iglu', 'colas', 'dubosson', 'hall', 'weinstock'] # iglu is Broll in the paper, otherwise alphabetical order
models = ['arima', 'linreg', 'xgboost', 'gluformer', 'latentode',  'nhits', 'tft', 'transformer']

# One (dataset x [likelihood, calibration]) array of ID values per model, runs without covariates only
model_errors = [
    avg_results([f'../output/{model}_{dataset}.txt' for dataset in datasets], None)[1]['id']['no_covs']
    for model in models
]

In [26]:
model_likelihood = np.array(model_errors)[:, :, 0]
model_cal = np.array(model_errors)[:, :, 1]

# find best models with highest likelihood
# missing values and exact zeros are pushed to the bottom of the ranking
# (models that have exactly 0 likelihood do not support likelihood)
unusable = np.isnan(model_likelihood) | (model_likelihood == 0)
model_likelihood[unusable] = -np.inf
best_models = np.argsort(model_likelihood, axis=0)
best_models_set = []
for i, dataset in enumerate(datasets):
    winner = models[best_models[-1, i]]  # last entry of an ascending sort = highest
    print(f'{dataset}: {winner}')
    best_models_set.append(winner)
print(set(best_models_set))

# find best models with lowest cal error
model_cal[np.isnan(model_cal)] = np.inf
best_models = np.argsort(model_cal, axis=0)
best_models_set = []
for i, dataset in enumerate(datasets):
    winner = models[best_models[0, i]]  # first entry of an ascending sort = lowest
    print(f'{dataset}: {winner}')
    best_models_set.append(winner)
print(set(best_models_set))



iglu: gluformer
colas: gluformer
dubosson: gluformer
hall: gluformer
weinstock: gluformer
{'gluformer'}
iglu: gluformer
colas: tft
dubosson: gluformer
hall: gluformer
weinstock: tft
{'gluformer', 'tft'}


# Clearing the results

In [None]:
import os

# Set the directory path to the folder containing the output files
# NOTE(review): the table cells read from '../output' while this cell uses './output' -- confirm which one is intended
folder_path = './output'

# Only result files whose name starts with one of these model names are cleaned
model_prefixes = ('transformer', 'tft', 'linreg', 'xgboost', 'nhits')

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    # listdir also returns sub-directories; those cannot be opened as text files
    if not filename.startswith(model_prefixes) or not os.path.isfile(file_path):
        continue

    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Search backwards for the last line starting with "Best value: "
    last_best = None
    for i in range(len(lines) - 1, -1, -1):
        if lines[i].startswith('Best value: '):
            last_best = i
            break

    # No such line, or nothing follows it: the file is left untouched
    if last_best is None or last_best == len(lines) - 1:
        continue

    # Keep everything up to and including the "Best value: " line
    with open(file_path, 'w') as f:
        f.writelines(lines[:last_best + 1])


In [32]:
# Citation keys filed under the label `open`
# (presumably: works evaluated on openly available data -- TODO confirm).
# NOTE(review): this assignment shadows the builtin open() for the rest of the
# kernel session; cells that call open(...) (avg_results, the clean-up cell)
# raise TypeError when executed after this cell.
# Some keys are listed more than once (e.g. 'vandoornMachineLearningbasedGlucose2021');
# the visible cells only consume this list through set(...).
open = ['heBloodGlucoseConcentration2020',
'eren-orukluHypoglycemiaPredictionSubjectSpecific2010',
'balakrishnanPersonalizedHybridModels2013',
'sunPredictingBloodGlucose2018',
'indrawanBloodGlucosePrediction2021',
'georgaEvaluationShorttermPredictors2015',
'dengDeepTransferLearning2021',
'vandoornMachineLearningbasedGlucose2021',
'zhuDeepLearningAlgorithma',
'martinssonBloodGlucosePrediction2020',
'vandoornMachineLearningbasedGlucose2021',
'munoz-organeroDeepPhysiologicalModel2020',
'jaloliLongtermPredictionBlood2022',
'foxDeepMultiOutputForecasting2018a',
'armandpourDeepPersonalizedGlucose2021a',
'sergazinov2022gluformer',
'jaloliLongtermPredictionBlood2022',
]
# Citation keys filed under the label `closed`
# (presumably the counterpart of `open` -- TODO confirm).
# Also contains repeated keys (e.g. 'liConvolutionalRecurrentNeural2020').
closed = [
'liConvolutionalRecurrentNeural2020', 
'langaricaMetalearningApproachPersonalized2023', 
'liuEnhancingBloodGlucose2018',
'reymannBloodGlucoseLevel2016', 
'boirouxOvernightControlBlood2012',
'otoomRealTimeStatisticalModeling2015',
'boirouxOvernightControlBlood2012', 
'bockTherapyParameterbasedModel2015', 
'calm2011comparison', 
'de2012prediction', 
'fang2015new', 
'laguna2014identification',
'xuBloodGlucosePrediction2022', 
'liConvolutionalRecurrentNeural2020', 
'prendinForecastingGlucoseLevels2021', 
'alibertiMultiPatientDataDrivenApproach2019', 
'liuEnhancingBloodGlucose2018', 
'benaliContinuousBloodGlucose2018', 
'shi2015glucose',
'prendinForecastingGlucoseLevels2021', 
'yangARIMAModelAdaptive2019', 
'sudharsanHypoglycemiaPredictionUsing2015', 
'hidalgoDataBasedPrediction2017', 
'efendic2014short', 
'botwey2014multi', 
'wang2013novel', 
'zarkogianni2014neuro',
'gyukBloodGlucoseLevel2019', 
'novaraNonlinearBlindIdentification2016', 
'bockTherapyParameterbasedModel2015', 
'chen2010modeling', 
'duun2013model', 
'laguna2014experimental', 
'wu2011physiological', 
'zhang2016data'
]

In [42]:
# Citation keys filed under the label `nocode`
# (presumably: works that published no code -- TODO confirm).
# NOTE(review): 'langaricaMetalearningApproachPersonalized2023' appears both here
# and in `pseudocode` below -- confirm which group it belongs to.
nocode = ['liConvolutionalRecurrentNeural2020', 
'langaricaMetalearningApproachPersonalized2023', 
'liuEnhancingBloodGlucose2018', 
'bockTherapyParameterbasedModel2015', 
'balakrishnanPersonalizedHybridModels2013', 
'armandpourDeepPersonalizedGlucose2021a',
'hidalgoDataBasedPrediction2017', 
'georgaEvaluationShorttermPredictors2015', 
'sudharsanHypoglycemiaPredictionUsing2015', 
'novaraNonlinearBlindIdentification2016', 
'otoomRealTimeStatisticalModeling2015', 
'boirouxOvernightControlBlood2012', 
'eren-orukluHypoglycemiaPredictionSubjectSpecific2010', 
'yangARIMAModelAdaptive2019',
'benaliContinuousBloodGlucose2018', 
'zhuDeepLearningAlgorithma', 
'alibertiMultiPatientDataDrivenApproach2019', 
'prendinForecastingGlucoseLevels2021', 
'gyukBloodGlucoseLevel2019',
'vandoornMachineLearningbasedGlucose2021', 
'jaloliLongtermPredictionBlood2022',
'reymannBloodGlucoseLevel2016', 
'dengDeepTransferLearning2021',
'botwey2014multi',
'calm2011comparison',
'chen2010modeling',
'de2012prediction',
'duun2013model',
'efendic2014short',
'fang2015new',
'laguna2014experimental',
'laguna2014identification',
'shi2015glucose',
'sunPredictingBloodGlucose2018',
'wang2013novel',
'wu2011physiological',
'zarkogianni2014neuro',
'zhang2016data'
]
# Citation keys filed under the label `pseudocode`
# (presumably: works that give pseudocode only -- TODO confirm).
pseudocode = [
    'munoz-organeroDeepPhysiologicalModel2020', 
    'foxDeepMultiOutputForecasting2018a', 
    'indrawanBloodGlucosePrediction2021', 
    'heBloodGlucoseConcentration2020', 
    'langaricaMetalearningApproachPersonalized2023', 
    'xuBloodGlucosePrediction2022'
]
# Citation keys filed under the label `code`
# (presumably: works with released code -- TODO confirm).
code = [
    'sergazinov2022gluformer', 
    'martinssonBloodGlucosePrediction2020'
]

In [43]:
# All keys that carry a code-availability label / a data-availability label
code_nocode = set(code) | set(pseudocode) | set(nocode)
open_closed = set(open) | set(closed)

In [44]:
# keys with a code label but no open/closed label (expected to be empty)
code_nocode - open_closed

set()

In [49]:
# de-duplicated `code` keys as one comma-separated string
', '.join(set(code))

'sergazinov2022gluformer, martinssonBloodGlucosePrediction2020'

In [47]:
# number of distinct keys in `nocode`
len({*nocode})

38