# Load libraries

In [2]:
# load pandas and numpy
import pandas as pd
import numpy as np
import os
import sys

# Read and create table for no-covariates models

In [3]:
# Parse the per-run logs output/<model>_<dataset>.txt into one table:
# one row per model, one column per <dataset>_<ID|OOD>_<RMSE|MAE>.
models = ['nhits', 'tft', 'linreg', 'xgboost', 'transformer']
datasets = ['weinstock', 'dubosson', 'colas', 'iglu', 'hall']


def parse_metric_line(line):
    """Extract ``(split, rmse, mae)`` from one line of a results log.

    Two line layouts are recognised, selected by the line's prefix:

    * ``Key: median RS ID|OOD (MSE, MAE) stats`` - the two numbers sit between
      ``'mean': [array([`` and ``])], 'std'``.
    * ``RS ID|OOD (MSE, MAE) errors stats`` - the two numbers sit between
      ``'median': array([[`` and ``]]), 'min'``.

    The first number is read as the MSE and converted to RMSE with a square
    root; the second is the MAE and is returned unchanged.

    Parameters
    ----------
    line : str
        One raw line of the log file.

    Returns
    -------
    tuple or None
        ``(split, rmse, mae)`` with ``split`` equal to ``'ID'`` or ``'OOD'``,
        or ``None`` when the line matches neither layout.

    Raises
    ------
    IndexError
        If the prefix matches but the expected start marker is absent.
    ValueError
        If the extracted fields cannot be converted to float.
    """
    layouts = (
        ('Key: median RS ID (MSE, MAE) stats', 'ID', "'mean': [array([", "])], 'std'"),
        ('Key: median RS OOD (MSE, MAE) stats', 'OOD', "'mean': [array([", "])], 'std'"),
        ('RS ID (MSE, MAE) errors stats', 'ID', "'median': array([[", "]]), 'min'"),
        ('RS OOD (MSE, MAE) errors stats', 'OOD', "'median': array([[", "]]), 'min'"),
    )
    for prefix, split, start, end in layouts:
        if line.startswith(prefix):
            values = line.split(start)[1].split(end)[0].split(', ')
            return split, np.sqrt(float(values[0])), float(values[1])
    return None


# Column order: all ID_RMSE columns, then ID_MAE, OOD_RMSE, OOD_MAE.
metrics = {f'{d}_{suffix}': {}
           for suffix in ('ID_RMSE', 'ID_MAE', 'OOD_RMSE', 'OOD_MAE')
           for d in datasets}
for model in models:
    for dataset in datasets:
        path = f'output/{model}_{dataset}.txt'
        # Skip runs without a log file, as the with-covariates cell does; the
        # corresponding entries are simply left out of the table (NaN / no row)
        # instead of aborting the whole cell.
        if not os.path.exists(path):
            continue
        with open(path, 'r') as f:
            for line in f:
                parsed = parse_metric_line(line)
                if parsed is None:
                    continue
                split, rmse, mae = parsed
                # If a file has several matching lines for the same split,
                # the last one wins.
                metrics[f'{dataset}_{split}_RMSE'][model] = rmse
                metrics[f'{dataset}_{split}_MAE'][model] = mae
results = pd.DataFrame(metrics)

In [4]:
# Build the in-distribution (ID) table with datasets in presentation order
# (iglu, colas, dubosson, weinstock, hall) and RMSE before MAE for each one.
id_dataset_order = ['iglu', 'colas', 'dubosson', 'weinstock', 'hall']
id_columns = [f'{name}_ID_{metric}'
              for name in id_dataset_order
              for metric in ('RMSE', 'MAE')]
results_ID = results[id_columns]
# Export as a LaTeX table; the relative path resolves against the current
# working directory.
results_ID.to_latex('results_ID.txt', float_format='%.2f')

  results_ID.to_latex('results_ID.txt', float_format='%.2f')


In [5]:
# Display the ID table: one row per model, RMSE/MAE column pairs per dataset.
results_ID

Unnamed: 0,iglu_ID_RMSE,iglu_ID_MAE,colas_ID_RMSE,colas_ID_MAE,dubosson_ID_RMSE,dubosson_ID_MAE,weinstock_ID_RMSE,weinstock_ID_MAE,hall_ID_RMSE,hall_ID_MAE
nhits,14.039449,12.038409,6.397849,5.458222,17.186538,14.454458,13.852764,11.905748,8.318731,7.244681
tft,18.323062,14.217684,9.344972,6.892426,29.377873,22.343989,25.970159,19.512877,12.193347,9.714242
linreg,10.749203,9.050826,5.073029,4.232943,12.257894,10.014687,13.525581,11.358228,7.566525,6.377251
xgboost,13.126929,11.473368,6.389259,5.453731,20.536512,18.631482,13.976567,11.98296,7.550505,6.517771
transformer,16.305057,14.206382,6.081671,5.1649,16.160907,13.45761,12.849728,10.801733,7.761739,6.599094


In [29]:
# Build the out-of-distribution (OOD) table with datasets in presentation order
# (iglu, colas, dubosson, weinstock, hall) and RMSE before MAE for each one.
ood_dataset_order = ['iglu', 'colas', 'dubosson', 'weinstock', 'hall']
ood_columns = [f'{name}_OOD_{metric}'
               for name in ood_dataset_order
               for metric in ('RMSE', 'MAE')]
results_OOD = results[ood_columns]
# Export as a LaTeX table; the relative path resolves against the current
# working directory.
results_OOD.to_latex('results_OOD.txt', float_format='%.2f')

In [30]:
# Display the OOD table: one row per model, RMSE/MAE column pairs per dataset.
results_OOD

Unnamed: 0,iglu_OOD_RMSE,iglu_OOD_MAE,colas_OOD_RMSE,colas_OOD_MAE,dubosson_OOD_RMSE,dubosson_OOD_MAE,weinstock_OOD_RMSE,weinstock_OOD_MAE,hall_OOD_RMSE,hall_OOD_MAE
nhits,14.390728,12.278319,5.987807,5.106096,19.708129,16.853706,15.155061,12.963381,8.432969,7.229441
tft,19.431707,14.786398,8.974353,6.616918,28.992374,21.636499,27.333931,20.479208,11.998969,9.521888
linreg,11.625791,9.556995,5.156734,4.310159,15.747926,12.119989,15.338169,12.922932,7.411944,6.249571
xgboost,10.004812,8.94271,6.175686,5.294375,17.054236,14.966012,15.783859,13.501627,7.441405,6.475905
transformer,16.546987,14.888153,5.97323,5.106396,17.060662,13.947294,14.483493,12.173853,7.855519,6.809893


# Read and create table for with-covariates models

In [31]:
# Parse the per-run logs output/<model>_<dataset>.txt of the with-covariates
# models into one table: one row per model, one column per
# <dataset>_<ID|OOD>_<RMSE|MAE>.
models = ['nhits_covariates', 'tft_covariates', 'linreg_covariates', 'xgboost_covariates', 'transformer_covariates']
datasets = ['weinstock', 'dubosson', 'colas', 'iglu', 'hall']


def parse_metric_line(line):
    """Extract ``(split, rmse, mae)`` from one line of a results log.

    Two line layouts are recognised, selected by the line's prefix:

    * ``Key: median RS ID|OOD (MSE, MAE) stats`` - the two numbers sit between
      ``'mean': [array([`` and ``])], 'std'``.
    * ``RS ID|OOD (MSE, MAE) errors stats`` - the two numbers sit between
      ``'median': array([[`` and ``]]), 'min'``.

    The first number is read as the MSE and converted to RMSE with a square
    root; the second is the MAE and is returned unchanged.

    Parameters
    ----------
    line : str
        One raw line of the log file.

    Returns
    -------
    tuple or None
        ``(split, rmse, mae)`` with ``split`` equal to ``'ID'`` or ``'OOD'``,
        or ``None`` when the line matches neither layout.

    Raises
    ------
    IndexError
        If the prefix matches but the expected start marker is absent.
    ValueError
        If the extracted fields cannot be converted to float.
    """
    layouts = (
        ('Key: median RS ID (MSE, MAE) stats', 'ID', "'mean': [array([", "])], 'std'"),
        ('Key: median RS OOD (MSE, MAE) stats', 'OOD', "'mean': [array([", "])], 'std'"),
        ('RS ID (MSE, MAE) errors stats', 'ID', "'median': array([[", "]]), 'min'"),
        ('RS OOD (MSE, MAE) errors stats', 'OOD', "'median': array([[", "]]), 'min'"),
    )
    for prefix, split, start, end in layouts:
        if line.startswith(prefix):
            values = line.split(start)[1].split(end)[0].split(', ')
            return split, np.sqrt(float(values[0])), float(values[1])
    return None


# Column order: all ID_RMSE columns, then ID_MAE, OOD_RMSE, OOD_MAE.
metrics = {f'{d}_{suffix}': {}
           for suffix in ('ID_RMSE', 'ID_MAE', 'OOD_RMSE', 'OOD_MAE')
           for d in datasets}
for model in models:
    for dataset in datasets:
        path = f'output/{model}_{dataset}.txt'
        # Not every model/dataset pair has a log; skip the missing ones so the
        # table is still built (their entries stay empty).
        if not os.path.exists(path):
            continue
        with open(path, 'r') as f:
            for line in f:
                parsed = parse_metric_line(line)
                if parsed is None:
                    continue
                split, rmse, mae = parsed
                # If a file has several matching lines for the same split,
                # the last one wins.
                metrics[f'{dataset}_{split}_RMSE'][model] = rmse
                metrics[f'{dataset}_{split}_MAE'][model] = mae
results = pd.DataFrame(metrics)

In [35]:
# Build the in-distribution (ID) table for the with-covariates models, with
# datasets in presentation order (iglu, colas, dubosson, weinstock, hall) and
# RMSE before MAE for each one.
covs_id_dataset_order = ['iglu', 'colas', 'dubosson', 'weinstock', 'hall']
covs_id_columns = [f'{name}_ID_{metric}'
                   for name in covs_id_dataset_order
                   for metric in ('RMSE', 'MAE')]
results_ID = results[covs_id_columns]
# Export as a LaTeX table; the relative path resolves against the current
# working directory.
results_ID.to_latex('results_covs_ID.txt', float_format='%.2f')

In [36]:
# Display the with-covariates ID table. In the output below, tft_covariates has
# empty (NaN) colas and weinstock entries - presumably those log files were
# absent and skipped by the parsing cell; confirm before reporting.
results_ID

Unnamed: 0,iglu_ID_RMSE,iglu_ID_MAE,colas_ID_RMSE,colas_ID_MAE,dubosson_ID_RMSE,dubosson_ID_MAE,weinstock_ID_RMSE,weinstock_ID_MAE,hall_ID_RMSE,hall_ID_MAE
nhits_covariates,19.494204,17.788871,7.418236,6.416239,35.846536,32.96083,13.486854,11.545949,8.538321,7.450222
linreg_covariates,13.221546,10.659633,6.2322,5.113108,12.576938,10.01187,13.393383,11.335308,8.772095,7.49264
xgboost_covariates,17.347108,15.183563,6.351102,5.503934,20.887499,18.553341,13.769194,11.768311,8.051126,7.017419
transformer_covariates,13.675433,11.837907,8.444415,7.459136,22.431571,19.297483,15.137501,13.318898,8.191996,7.122951
tft_covariates,18.597581,15.11744,,,26.028856,20.877859,,,11.446118,9.338712


In [33]:
# Build the out-of-distribution (OOD) table for the with-covariates models,
# with datasets in presentation order (iglu, colas, dubosson, weinstock, hall)
# and RMSE before MAE for each one.
covs_ood_dataset_order = ['iglu', 'colas', 'dubosson', 'weinstock', 'hall']
covs_ood_columns = [f'{name}_OOD_{metric}'
                    for name in covs_ood_dataset_order
                    for metric in ('RMSE', 'MAE')]
results_OOD = results[covs_ood_columns]
# Export as a LaTeX table; the relative path resolves against the current
# working directory.
results_OOD.to_latex('results_covs_OOD.txt', float_format='%.2f')

In [34]:
# Display the with-covariates OOD table.
# NOTE(review): in the output below, linreg_covariates reports iglu and
# dubosson OOD errors in the tens of thousands while every other entry is
# below 40 - verify these runs before using the table.
results_OOD

Unnamed: 0,iglu_OOD_RMSE,iglu_OOD_MAE,colas_OOD_RMSE,colas_OOD_MAE,dubosson_OOD_RMSE,dubosson_OOD_MAE,weinstock_OOD_RMSE,weinstock_OOD_MAE,hall_OOD_RMSE,hall_OOD_MAE
nhits_covariates,22.743501,20.948529,7.080619,6.190252,38.949433,35.071201,15.103465,12.908895,9.181907,8.126172
linreg_covariates,22620.560824,7713.66129,5.485037,4.525691,24387.455238,11270.3723,15.657402,13.159746,30.585474,17.229848
xgboost_covariates,11.593113,10.675918,6.493228,5.606705,17.440058,15.456431,15.545229,13.430659,8.19885,7.108005
transformer_covariates,16.932616,15.237198,7.453818,6.67496,29.284154,25.987484,16.610092,14.544551,8.418855,7.187983
tft_covariates,18.28228,14.669054,,,23.248962,18.337863,,,11.73673,9.580188
