# Load libraries

In [1]:
# Load third-party libraries (pandas, numpy) and standard-library helpers (re, os, sys)
import pandas as pd
import numpy as np
import re
import os
import sys

# Read and create table for no-covariates models

In [22]:
models = ['nhits', 'tft', 'linreg', 'xgboost', 'transformer']
datasets = ['weinstock', 'dubosson', 'colas', 'iglu', 'hall']
time_steps = 12

# Matches float-like tokens: "12.34", "77." (how numpy prints whole floats),
# ".5", and scientific notation such as "1.2e-05" or "1e-05". Bare integers and
# digits glued to identifiers (e.g. the "64" in "float64") are deliberately not
# matched, so only real metric values are picked up.
FLOAT_RE = re.compile(
    r'(?<![\w.])[-+]?(?:\d+\.\d*|\.\d+|\d+(?=[eE][-+]?\d))(?:[eE][-+]?\d+)?(?!\w)'
)


def read_results_tables(models, datasets, filename_template, time_steps):
    """Collect ID/OOD metrics from result text files into four tables.

    Each file is scanned line by line for lines starting with
    ``"<split> median of (MSE, MAE):"``, ``"<split> likelihoods:"`` or
    ``"<split> calibration errors:"`` where ``<split>`` is ``ID`` or ``OOD``.
    If a prefix occurs several times in one file, the last occurrence wins.

    Parameters
    ----------
    models : list of str
        Row labels; also substituted into ``filename_template``.
    datasets : list of str
        Each dataset yields the columns ``"<dataset> ID"`` and ``"<dataset> OOD"``.
    filename_template : str
        ``str.format`` template with ``{model}`` and ``{dataset}`` fields.
    time_steps : int
        Calibration errors are averaged over at most the first ``time_steps``
        values found on the line.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mse, mae, likelihood, calibration)``; float dtype, columns sorted
        alphabetically. Cells stay NaN when the file is missing or when the
        matching line does not contain enough numbers.
    """
    columns = sorted(f'{dataset} {split}'
                     for dataset in datasets for split in ('ID', 'OOD'))

    def empty_table():
        # dtype=float keeps the tables numeric (NaN for missing) instead of object.
        return pd.DataFrame(index=models, columns=columns, dtype=float)

    mse, mae, likelihood, calibration = (empty_table() for _ in range(4))

    for model in models:
        for dataset in datasets:
            filename = filename_template.format(model=model, dataset=dataset)
            if not os.path.isfile(filename):
                continue
            with open(filename, 'r') as f:
                for line in f:
                    # "<split> <label>: <payload>"
                    split, _, rest = line.partition(' ')
                    if split not in ('ID', 'OOD'):
                        continue
                    label, colon, payload = rest.partition(':')
                    if not colon:
                        continue
                    values = [float(token) for token in FLOAT_RE.findall(payload)]
                    column = f'{dataset} {split}'
                    if label == 'median of (MSE, MAE)':
                        # Need both numbers; otherwise leave NaN rather than
                        # crash or mis-assign MAE into the MSE slot.
                        if len(values) >= 2:
                            mse.loc[model, column] = values[0]
                            mae.loc[model, column] = values[1]
                    elif label == 'likelihoods':
                        if values:
                            likelihood.loc[model, column] = values[0]
                    elif label == 'calibration errors':
                        if values:
                            calibration.loc[model, column] = np.mean(values[:time_steps])
    return mse, mae, likelihood, calibration


# Build the tables for the no-covariates runs (columns already sorted).
df_mse, df_mae, df_likelihood, df_calibration = read_results_tables(
    models, datasets, './output/{model}_{dataset}.txt', time_steps
)


In [23]:
# Display the MAE table built by the cell above (rows: models, columns:
# "<dataset> ID" / "<dataset> OOD"). NaN cells mean no result file or no
# matching metric line was found for that model/dataset pair.
df_mae

Unnamed: 0,colas ID,colas OOD,dubosson ID,dubosson OOD,hall ID,hall OOD,iglu ID,iglu OOD,weinstock ID,weinstock OOD
nhits,5.086399,4.791936,14.372684,14.865962,6.590668,6.675364,12.401434,11.538443,10.967913,12.290077
tft,,,,,,,,,,
linreg,4.314887,4.370194,10.014688,12.119988,6.290309,6.180986,8.483003,9.545128,11.358229,12.922933
xgboost,5.453732,5.294375,18.631481,14.966013,6.517771,6.475905,11.473368,8.942711,11.610262,13.040385
transformer,,,,,,,,,,


# Read and create table for with-covariates models

In [24]:
models = ['nhits', 'tft', 'linreg', 'xgboost', 'transformer']
datasets = ['weinstock', 'dubosson', 'colas', 'iglu', 'hall']
time_steps = 12

# Matches float-like tokens: "12.34", "77." (how numpy prints whole floats),
# ".5", and scientific notation such as "1.2e-05" or "1e-05". Bare integers and
# digits glued to identifiers (e.g. the "64" in "float64") are deliberately not
# matched, so only real metric values are picked up.
FLOAT_RE = re.compile(
    r'(?<![\w.])[-+]?(?:\d+\.\d*|\.\d+|\d+(?=[eE][-+]?\d))(?:[eE][-+]?\d+)?(?!\w)'
)


# Defined in this cell so the cell can be run on its own.
def read_results_tables(models, datasets, filename_template, time_steps):
    """Collect ID/OOD metrics from result text files into four tables.

    Each file is scanned line by line for lines starting with
    ``"<split> median of (MSE, MAE):"``, ``"<split> likelihoods:"`` or
    ``"<split> calibration errors:"`` where ``<split>`` is ``ID`` or ``OOD``.
    If a prefix occurs several times in one file, the last occurrence wins.

    Parameters
    ----------
    models : list of str
        Row labels; also substituted into ``filename_template``.
    datasets : list of str
        Each dataset yields the columns ``"<dataset> ID"`` and ``"<dataset> OOD"``.
    filename_template : str
        ``str.format`` template with ``{model}`` and ``{dataset}`` fields.
    time_steps : int
        Calibration errors are averaged over at most the first ``time_steps``
        values found on the line.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(mse, mae, likelihood, calibration)``; float dtype, columns sorted
        alphabetically. Cells stay NaN when the file is missing or when the
        matching line does not contain enough numbers.
    """
    columns = sorted(f'{dataset} {split}'
                     for dataset in datasets for split in ('ID', 'OOD'))

    def empty_table():
        # dtype=float keeps the tables numeric (NaN for missing) instead of object.
        return pd.DataFrame(index=models, columns=columns, dtype=float)

    mse, mae, likelihood, calibration = (empty_table() for _ in range(4))

    for model in models:
        for dataset in datasets:
            filename = filename_template.format(model=model, dataset=dataset)
            if not os.path.isfile(filename):
                continue
            with open(filename, 'r') as f:
                for line in f:
                    # "<split> <label>: <payload>"
                    split, _, rest = line.partition(' ')
                    if split not in ('ID', 'OOD'):
                        continue
                    label, colon, payload = rest.partition(':')
                    if not colon:
                        continue
                    values = [float(token) for token in FLOAT_RE.findall(payload)]
                    column = f'{dataset} {split}'
                    if label == 'median of (MSE, MAE)':
                        # Need both numbers; otherwise leave NaN rather than
                        # crash or mis-assign MAE into the MSE slot.
                        if len(values) >= 2:
                            mse.loc[model, column] = values[0]
                            mae.loc[model, column] = values[1]
                    elif label == 'likelihoods':
                        if values:
                            likelihood.loc[model, column] = values[0]
                    elif label == 'calibration errors':
                        if values:
                            calibration.loc[model, column] = np.mean(values[:time_steps])
    return mse, mae, likelihood, calibration


# Build the tables for the with-covariates runs (columns already sorted).
df_mse, df_mae, df_likelihood, df_calibration = read_results_tables(
    models, datasets, './output/{model}_covariates_{dataset}.txt', time_steps
)

In [25]:
# Display the MSE table built by the cell above (rows: models, columns:
# "<dataset> ID" / "<dataset> OOD"). NaN cells mean no result file or no
# matching metric line was found for that model/dataset pair.
df_mse

Unnamed: 0,colas ID,colas OOD,dubosson ID,dubosson OOD,hall ID,hall OOD,iglu ID,iglu OOD,weinstock ID,weinstock OOD
nhits,77.689651,54.967499,546.605957,786.21936,67.086929,73.348373,191.314301,160.568817,,
tft,,,,,,,,,,
linreg,35.487167,30.118378,154.04391,4.125549,61.686333,477.12555,119.0223,476.5104,179.3827,245.15425
xgboost,40.640217,40.425087,436.287598,304.15567,64.820625,67.221146,192.971893,93.557358,189.590714,241.654144
transformer,,,,,,,,,,


# Clearing the results

In [1]:
import os

# Set the directory path to the folder containing the output files
folder_path = './output'


def strip_results_after_best_value(folder_path, prefix='transformer'):
    """Drop everything after the last "Best value: " line in matching files.

    WARNING: this rewrites files in place; the removed lines are not backed up.

    Parameters
    ----------
    folder_path : str
        Directory whose entries are inspected (not recursive).
    prefix : str
        Only regular files whose name starts with this prefix are processed.

    Returns
    -------
    list of str
        Paths of the files that were actually shortened, in sorted order.
        Files without a "Best value: " line, or with nothing after the last
        one, are left untouched. A missing folder yields an empty list.
    """
    if not os.path.isdir(folder_path):
        return []

    truncated = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        # Skip other models' files and anything that is not a regular file
        # (open() would fail on a directory named e.g. "transformer_logs").
        if not filename.startswith(prefix) or not os.path.isfile(file_path):
            continue

        with open(file_path, 'r') as f:
            lines = f.readlines()

        # Index of the last line starting with the marker, if any.
        last_marker = max(
            (i for i, line in enumerate(lines) if line.startswith('Best value: ')),
            default=None,
        )
        # Nothing to cut: no marker, or the marker is already the final line.
        if last_marker is None or last_marker == len(lines) - 1:
            continue

        with open(file_path, 'w') as f:
            f.writelines(lines[:last_marker + 1])
        truncated.append(file_path)
    return truncated


truncated_files = strip_results_after_best_value(folder_path)
