In [1]:
import os
import sys

os.chdir("..")
sys.path.append("..")

In [2]:
import itertools
import yaml

import numpy as np
import pandas as pd
import torch

from bokeh.io.export import export_svg
from bokeh.layouts import row
from bokeh.plotting import figure
from bokeh.transform import log_cmap, linear_cmap
from bokeh.util.hex import hexbin, cartesian_to_axial
from gluonts.dataset.common import ListDataset
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.loader import ValidationDataLoader
from gluonts.dataset.repository.datasets import get_dataset
from gluonts.time_feature import (
    HourOfDay,
    DayOfWeek,
    DayOfMonth,
    DayOfYear,
    MonthOfYear
)
from gluonts.torch.batchify import batchify
from gluonts.transform import (
    AddObservedValuesIndicator,
    AddTimeFeatures,
    Chain,
    InstanceSplitter,
    ValidationSplitSampler
)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.seasonal import STL
from tqdm import tqdm

from src.models.utils import get_model
from src.utils.data_loading import load_features, load_score, load_test_data
from src.utils.evaluation import score_batch
from src.utils.features import decomps_and_features
from src.utils.transformations import manipulate_trend_component, manipulate_seasonal_determination



In [3]:
def load_generated_data(prefix, generated_test_datadir, len_test_data):
    data = [0 for _ in range(len_test_data)]
    for f in os.listdir(generated_test_datadir):
        if f.startswith(prefix):
            file_name = f.split(".")[0]  # slice off .npy from file name
            ts_idx = int(file_name[len(prefix):])  # the remaining charachters after prefix is always the time series id
            data[ts_idx] = np.load(os.path.join(generated_test_datadir, f))

    data = np.array(data)  # [len(original_test), num_manipulations, 4]
    return data


def load_generated_features_and_data(suffix, suffix_prefix_to_fname, suffix_suffix_to_index, original_config, generated_datadir, len_test_data):
    f_suffix = get_file_suffix(suffix, suffix_prefix_to_fname)
    data_prefix = f"ts_{f_suffix}"
    feature_prefix = f"feat_{f_suffix}"
    gen_ts = load_generated_data(data_prefix, generated_datadir, len_test_data)
    gen_features = load_generated_data(feature_prefix, generated_datadir, len_test_data)

    idx = suffix_suffix_to_index[suffix[-3:]]
    gen_ts = gen_ts[:, idx, :].reshape([-1, original_config["context_length"] + original_config["prediction_length"]])
    gen_features = gen_features[:, idx, :].reshape([-1, 4])  # flatten the two first dimensions
    return gen_ts, gen_features


def create_gen_dataloader(generated_data, dataset, context_length, prediction_length, batch_size):
    original_dataset = get_dataset(dataset)
    
    # nbeats doesn't do any covariates so we don't care that the starting points of forecasts are wrong here
    list_data = [{"start": original["start"], "target": generated, "feat_static_cat": original["feat_static_cat"], "item_id": original["item_id"]}
                 for original, generated in zip(itertools.cycle(original_dataset.test), generated_data)]
    generated_dataset = ListDataset(list_data, freq=original_dataset.metadata.freq)
    
    transformation = Chain([
        AddObservedValuesIndicator(
            target_field=FieldName.TARGET,
            output_field=FieldName.OBSERVED_VALUES,
        ),
        AddTimeFeatures(
            start_field=FieldName.START,
            target_field=FieldName.TARGET,
            output_field=FieldName.FEAT_TIME,
            pred_length=prediction_length,
            time_features=[HourOfDay(), DayOfWeek(), DayOfMonth(), DayOfYear(), MonthOfYear()]
        ),
        InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            instance_sampler=ValidationSplitSampler(min_future=prediction_length),
            past_length=context_length,
            future_length=prediction_length,
            time_series_fields=[FieldName.FEAT_TIME, FieldName.OBSERVED_VALUES]
        )
    ])
    dataloader = ValidationDataLoader(
        generated_dataset,
        batch_size=batch_size,
        stack_fn=batchify,
        transform=transformation,
        num_workers=1
    )
    return dataloader


def organize_scores(all_scores, metric):
    mape = []
    smape = []
    mase = []
    seasonal_mase = []
    mse = []
    mae = []
    for score in all_scores:
        mape.append(score[0])
        smape.append(score[1])
        mase.append(score[2])
        seasonal_mase.append(score[3])
        mse.append(score[4])
        mae.append(score[5])


    mape = np.vstack(mape)
    smape = np.vstack(smape)
    mase = np.vstack(mase)
    seasonal_mase = np.vstack(seasonal_mase)
    mse = np.vstack(mse)
    mae = np.vstack(mae)
    
    return {"mape": mape, "smape": smape, "mase": mase, "seasonal_mase": seasonal_mase, "mse": mse, "mae": mae}[metric]


def get_scores(original_model, new_model, original_config, new_config, gen_ts, dataset, metric):

    gen_dataloader = create_gen_dataloader(gen_ts, dataset, original_config["trainer_args"]["context_length"],
                                           original_config["trainer_args"]["prediction_length"], original_config["trainer_args"]["batch_size"])

    original_gen_scores = []
    new_gen_scores = []
    original_model.eval()
    new_model.eval()
    for batch in tqdm(gen_dataloader):
        original_preds = original_model.predict(batch)[:, :, 0]
        new_preds = new_model.predict(batch)[:, :, 0]

        context = batch["past_target"].unsqueeze(dim=-1).numpy()
        target = batch["future_target"].numpy()

        original_gen_scores.append(score_batch(target, original_preds, context, original_config["sp"]))
        new_gen_scores.append(score_batch(target, new_preds, context, new_config["sp"]))

    original_gen_scores = organize_scores(original_gen_scores, metric)
    new_gen_scores = organize_scores(new_gen_scores, metric)
    
    return original_gen_scores, new_gen_scores

In [4]:
def create_instance_space(datadir):
    train_features = load_features(datadir, train=True)
    test_features = load_features(datadir, train=False)
    scaler = StandardScaler()
    norm_train_features = scaler.fit_transform(train_features)
    norm_test_features = scaler.transform(test_features)

    pca = PCA(n_components=2)
    train_pca_data = pca.fit_transform(norm_train_features)
    test_pca_data = pca.transform(norm_test_features)
    return pca, scaler, test_pca_data

In [5]:
def create_color_arr(scores, quantiles=None):
    if quantiles is None:
        quantiles = np.nanquantile(scores, [0.25, 0.75])
    
    low = scores < quantiles[0]
    high = scores > quantiles[1]
    medium = np.logical_and(~low, ~high)
    
    colors = np.empty_like(scores)
    colors[low] = 0
    colors[medium] = 1
    colors[high] = 2
    
    return colors


def color_bin(bins, pca_data, colors, column_name):
    q, r = cartesian_to_axial(pca_data[:, 0], pca_data[:, 1], 0.1, "pointytop")
    df = pd.DataFrame(dict(r=r, q=q))
    groups = df.groupby(["q", "r"])
    
    for (q, r), indexes in groups.groups.items():
        color = np.nanmean(colors[indexes])
        bins.loc[(bins["q"] == q) & (bins["r"] == r), column_name] = color
    
    return bins


def get_fig(title, xrange, yrange):
    p = figure(title=title, tools="", match_aspect=True, x_range=xrange, y_range=yrange)
    p.output_backend = "svg"
    p.title.align = "center"
    p.grid.visible = False
    return p


def create_and_plot_hexbin(original_scores, new_scores, pca_data, figdir, dataset, suffix, model, metric, limits):
    # create a seperate folder for each dataset and model
    figdir = os.path.join(figdir, dataset, model)
    if not os.path.isdir(figdir):
        os.makedirs(figdir, exist_ok=True)
    
    # create hexbins
    bins = hexbin(pca_data[:, 0], pca_data[:, 1], 0.1)
    bins["original_colors"] = np.nan
    bins["new_colors"] = np.nan
    
    orig_quantiles = np.nanquantile(original_scores, [0.25, 0.75])
    original_colors = create_color_arr(original_scores, orig_quantiles)
    new_colors = create_color_arr(new_scores, orig_quantiles)

    bins = color_bin(bins, pca_data, original_colors, "original_colors")
    bins = color_bin(bins, pca_data, new_colors, "new_colors")
    
    # plot original model
    if limits is not None:
        xrange = limits["xrange"]
        yrange = limits["yrange"]
    else:
        xrange = None
        yrange = None
    
    p = get_fig(f"{model} trained with original training data", xrange, yrange)
    p.hex_tile(q="q", r="r", size=0.1, line_color=None, source=bins,
                fill_color=linear_cmap("original_colors", "Viridis256", min(bins.original_colors), max(bins.original_colors)))

    export_svg(p, filename=os.path.join(figdir, f"{dataset}_{suffix}_{model}_orig_{metric}_hexbin.svg"))
    
    # plot new model
    p = get_fig(f"{model} trained with augmented training data", xrange, yrange)
    p.hex_tile(q="q", r="r", size=0.1, line_color=None, source=bins,
                fill_color=linear_cmap("new_colors", "Viridis256", min(bins.new_colors), max(bins.new_colors)))

    export_svg(p, filename=os.path.join(figdir, f"{dataset}_{suffix}_{model}_OOD_{metric}_hexbin.svg"))

In [6]:
def get_file_suffix(suffix, suffix_prefix_to_fname):
    for prefix in suffix_prefix_to_fname.keys():
        if suffix.startswith(prefix):
            return suffix_prefix_to_fname[prefix]

In [7]:
def generate_data(dataset, config):
    test_data = load_test_data(dataset, config["context_length"] + config["prediction_length"])
    
    trend_str_inc_ts = []
    trend_str_dec_ts = []
    trend_lin_inc_ts = []
    trend_lin_dec_ts = []
    trend_slope_inc_ts = []
    trend_slope_dec_ts = []
    seas_str_inc_ts = []
    seas_str_dec_ts = []

    trend_str_inc_feat = []
    trend_str_dec_feat = []
    trend_lin_inc_feat = []
    trend_lin_dec_feat = []
    trend_slope_inc_feat = []
    trend_slope_dec_feat = []
    seas_str_inc_feat = []
    seas_str_dec_feat = []
    for ts in tqdm(test_data):
        decomp = decomps_and_features([ts], config["sp"])[0][0]
        
        inc_str = manipulate_trend_component(decomp.trend, f=100, g=1, h=1, m=0) + decomp.seasonal + decomp.resid
        dec_str = manipulate_trend_component(decomp.trend, f=0.01, g=1, h=1, m=0) + decomp.seasonal + decomp.resid
        
        inc_lin = manipulate_trend_component(decomp.trend, f=1, g=1, h=100, m=0) + decomp.seasonal + decomp.resid
        dec_lin = manipulate_trend_component(decomp.trend, f=1, g=1, h=0.01, m=0) + decomp.seasonal + decomp.resid
        
        inc_slope = manipulate_trend_component(decomp.trend, f=1, g=1, h=1, m=-1) + decomp.seasonal + decomp.resid
        dec_slope = manipulate_trend_component(decomp.trend, f=1, g=1, h=1, m=1) + decomp.seasonal + decomp.resid
        
        generated_ts = [inc_str, dec_str, inc_lin, dec_lin, inc_slope, dec_slope]
        if config["sp"] > 1:
            inc_seas = manipulate_seasonal_determination(decomp.seasonal, k=100) + decomp.trend + decomp.resid
            dec_seas = manipulate_seasonal_determination(decomp.seasonal, k=0.01) + decomp.trend + decomp.resid
            generated_ts.extend([inc_seas, dec_seas])
            
        _, features = decomps_and_features(generated_ts, config["sp"])
        
        trend_str_inc_ts.append(inc_str)
        trend_str_dec_ts.append(dec_str)
        trend_lin_inc_ts.append(inc_lin)
        trend_lin_dec_ts.append(dec_lin)
        trend_slope_inc_ts.append(inc_slope)
        trend_slope_dec_ts.append(dec_slope)
        
        trend_str_inc_feat.append(features[0])
        trend_str_dec_feat.append(features[1])
        trend_lin_inc_feat.append(features[2])
        trend_lin_dec_feat.append(features[3])
        trend_slope_inc_feat.append(features[4])
        trend_slope_dec_feat.append(features[5])
        
        if config["sp"] > 1:
            seas_str_inc_ts.append(inc_seas)
            seas_str_dec_ts.append(dec_seas)
            seas_str_inc_feat.append(features[6])
            seas_str_dec_feat.append(features[7])
    
    trend_str_inc_ts = np.array(trend_str_inc_ts)
    trend_str_dec_ts = np.array(trend_str_dec_ts)
    trend_lin_inc_ts = np.array(trend_lin_inc_ts)
    trend_lin_dec_ts = np.array(trend_lin_dec_ts)
    trend_slope_inc_ts = np.array(trend_slope_inc_ts)
    trend_slope_dec_ts = np.array(trend_slope_dec_ts)
    seas_str_inc_ts = np.array(seas_str_inc_ts)
    seas_str_dec_ts = np.array(seas_str_dec_ts)
    
    trend_str_inc_feat = np.array(trend_str_inc_feat)
    trend_str_dec_feat = np.array(trend_str_dec_feat)
    trend_lin_inc_feat = np.array(trend_lin_inc_feat)
    trend_lin_dec_feat = np.array(trend_lin_dec_feat)
    trend_slope_inc_feat = np.array(trend_slope_inc_feat)
    trend_slope_dec_feat = np.array(trend_slope_dec_feat)
    seas_str_inc_feat = np.array(seas_str_inc_feat)
    seas_str_dec_feat = np.array(seas_str_dec_feat)
    
    ts_dict = {"trend_str_inc": trend_str_inc_ts, "trend_str_dec": trend_str_dec_ts,
               "lin_inc": trend_lin_inc_ts, "lin_dec": trend_lin_dec_ts,
               "slope_inc": trend_slope_inc_ts, "slope_dec": trend_slope_dec_ts,
               "seas_inc": seas_str_inc_ts, "seas_dec": seas_str_dec_ts}
    
    feat_dict = {"trend_str_inc": trend_str_inc_feat, "trend_str_dec": trend_str_dec_feat,
                "lin_inc": trend_lin_inc_feat, "lin_dec": trend_lin_dec_feat,
                "slope_inc": trend_slope_inc_feat, "slope_dec": trend_slope_dec_feat,
                "seas_inc": seas_str_inc_feat, "seas_dec": seas_str_dec_feat}
    
    return ts_dict, feat_dict

In [8]:
metric = "smape"
create_plots = True
models = ["feedforward", "seq2seq", "nbeats_g", "tcn", "transformer"]
model_names = ["Fully-connected", "LSTM", "N-BEATS", "TCN", "Transformer"]

dataset_suffixes = {
    "electricity_nips": ["seas_dec", "slope_dec", "slope_inc"],
    "traffic_nips": ["slope_inc", "trend_str_dec"],  # "seas_dec", "slope_dec", 
    "m4_hourly": ["slope_dec", "slope_inc"],  # "seas_dec"
    "m4_daily": ["seas_inc"],
    "m4_weekly": ["trend_str_dec"],  # "slope_dec", "slope_inc", 
    "m4_monthly": ["slope_inc", "lin_dec", "trend_str_dec"],  # "seas_dec", "slope_dec", 
    "m4_quarterly": ["lin_dec", "slope_dec"],  # "seas_dec", "seas_inc", 
    "m4_yearly": ["lin_dec"]  # "slope_dec", "slope_inc"
}
plot_limits = {
    "electricity_nips": {"slope_dec": {"xrange": [-5, 6], "yrange": [-8, 6]}, "slope_inc": {"xrange": [-3, 8], "yrange": [-3, 8]}},
    "traffic_nips": {"lin_dec": {"xrange": [-4, 5], "yrange": [-4, 5]}},
    "m4_hourly": {"slope_dec": {"xrange": [-10, 5], "yrange": [-12, 6]}},
    "m4_daily": {},
    "m4_weekly": {},
    "m4_monthly": {"lin_dec": {"xrange": [-5, 5], "yrange": [-5, 5]}, "slope_dec": {"xrange": [-5, 8], "yrange": [-5, 8]}, "trend_str_dec": {"xrange": [-5, 5], "yrange": [-5, 8]}},
    "m4_quarterly": {"lin_dec": {"xrange": [-4, 5], "yrange": [-4, 5]}, "slope_dec": {"xrange": [-4, 5], "yrange": [-4, 5]}},
    "m4_yearly": {"slope_dec": {"xrange": [-10, 10], "yrange": [-15, 5]}, "slope_inc": {"xrange": [-10, 10], "yrange": [-5, 15]}, "lin_dec": {"xrange": [-5, 5], "yrange": [-5, 5]}},
}
suffix_prefix_to_fname = {"seas": "seasonal_str", "slope": "trend_slope", "lin": "trend_lin", "trend_str": "trend_str"}
suffix_suffix_to_index = {"inc": 98, "dec": -1}

figdir = "figures/OOD"
if not os.path.isdir(figdir):
    os.makedirs(figdir, exist_ok=True)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
scores_dict = {key: {} for key in dataset_suffixes.keys()}

for dataset in dataset_suffixes.keys():
    print(f"Calculating scores and creating plots for {dataset}...")
    datadir = f"data/{dataset}"
    generated_datadir = os.path.join(f"/datadrive2/whatif/{dataset}", "generated", "test")
    
    #create instance space
    pca, scaler, test_pca_data = create_instance_space(datadir)
    
    # load the config and score of some random model to get metadata
    original_experiment_dir = f"experiments/{dataset}/nbeats_g"
    with open(os.path.join(original_experiment_dir, "config.yaml"), "r") as f:
        original_config = yaml.load(f, Loader=yaml.FullLoader)
        
    original_scores = load_score(original_experiment_dir, metric)
    
    if dataset in ["m4_daily", "m4_weekly", "m4_yearly"]:
        generated_data, generated_features = generate_data(dataset, original_config)
    
    for suffix in dataset_suffixes[dataset]:
        # load data and features
        if suffix == "all":
            gen_ts = []
            gen_features = []
            for suffix in dataset_suffixes[dataset]:
                if suffix == "all":
                    continue
                
                if dataset in ["m4_daily", "m4_weekly", "m4_yearly"]:
                    gen_ts_suffix = generated_data[suffix]
                    gen_features_suffix = generated_features[suffix]
                else:
                    gen_ts_suffix, gen_features_suffix = load_generated_features_and_data(suffix, suffix_prefix_to_fname, suffix_suffix_to_index,
                                                                                          original_config, generated_datadir, original_scores.shape[0])
                gen_ts.append(gen_ts_suffix)
                gen_features.append(gen_features_suffix)

            gen_ts = np.vstack(gen_ts)
            gen_features = np.vstack(gen_features)
        else:
            if dataset in ["m4_daily", "m4_weekly", "m4_yearly"]:
                gen_ts = generated_data[suffix]
                gen_features = generated_features[suffix]
            else:
                gen_ts, gen_features = load_generated_features_and_data(suffix, suffix_prefix_to_fname, suffix_suffix_to_index,
                                                                        original_config, generated_datadir, original_scores.shape[0])

        for model, name in zip(models, model_names):
            # load original model
            original_experiment_dir = f"experiments/{dataset}/{model}"
            with open(os.path.join(original_experiment_dir, "config.yaml"), "r") as f:
                original_config = yaml.load(f, Loader=yaml.FullLoader)

            original_model = get_model(original_config["model_name"])(**original_config["model_args"], device=device, path=original_config["path"]).to(device)
            original_model.load_state_dict(torch.load(os.path.join(original_config["path"], "model.pth")))
            original_scores = load_score(original_experiment_dir, metric)
        
            # load new model
            new_experiment_dir = f"experiments/{dataset}/{model}_gen_{suffix}"
            with open(os.path.join(new_experiment_dir, "config.yaml"), "r") as f:
                new_config = yaml.load(f, Loader=yaml.FullLoader)
                
            new_model = get_model(new_config["model_name"])(**new_config["model_args"], device=device, path=new_config["path"]).to(device)
            new_model.load_state_dict(torch.load(os.path.join(new_config["path"], "model.pth")))
            new_scores = load_score(new_experiment_dir, metric)
            
            # evaluate models on OOD test data
            original_gen_scores, new_gen_scores = get_scores(original_model, new_model, original_config, new_config, gen_ts, dataset, metric)
            
            if name not in scores_dict[dataset].keys():
                scores_dict[dataset][name] = {suffix: {"original model": {"orig": np.nanmean(original_scores), "ood": np.nanmean(original_gen_scores)},
                                                       "new model": {"orig": np.nanmean(new_scores), "ood": np.nanmean(new_gen_scores)}}}
            else:
                scores_dict[dataset][name][suffix] = {"original model": {"orig": np.nanmean(original_scores), "ood": np.nanmean(original_gen_scores)},
                                                      "new model": {"orig": np.nanmean(new_scores), "ood": np.nanmean(new_gen_scores)}}
            
            # concatenate scores on original test data and ood test, and calculate mean per time series
            original_scores_concat = np.concatenate([original_scores, original_gen_scores], axis=0)
            new_scores_concat = np.concatenate([new_scores, new_gen_scores], axis=0)
            original_ts_scores = np.nanmean(original_scores_concat, axis=-1)
            new_ts_scores = np.nanmean(new_scores_concat, axis=-1)
            
            # transform generated features to instance space and concatenate with original test data
            norm_gen_features = scaler.transform(gen_features)
            gen_pca_data = pca.transform(norm_gen_features)
            concatenated_pca_data = np.concatenate([test_pca_data, gen_pca_data], axis=0)
            
            if create_plots:
                limits = plot_limits[dataset].get(suffix)
                create_and_plot_hexbin(original_ts_scores, new_ts_scores, concatenated_pca_data, figdir, dataset, suffix, name, metric, limits)

Calculating scores and creating plots for electricity_nips...


6it [00:01,  3.94it/s]
6it [00:02,  2.92it/s]
6it [00:01,  3.07it/s]
6it [00:03,  1.60it/s]
6it [00:02,  2.05it/s]
6it [00:01,  4.38it/s]
6it [00:02,  2.92it/s]
6it [00:01,  3.17it/s]
6it [00:03,  1.60it/s]
6it [00:02,  2.02it/s]
6it [00:01,  4.26it/s]
6it [00:02,  2.89it/s]
6it [00:01,  3.17it/s]
6it [00:03,  1.58it/s]
6it [00:02,  2.04it/s]


Calculating scores and creating plots for traffic_nips...


14it [00:03,  3.85it/s]
14it [00:05,  2.67it/s]
14it [00:04,  2.85it/s]
14it [00:09,  1.50it/s]
14it [00:07,  1.79it/s]
14it [00:03,  3.88it/s]
14it [00:04,  2.92it/s]
14it [00:04,  2.83it/s]
14it [00:09,  1.47it/s]
14it [00:07,  1.81it/s]


Calculating scores and creating plots for m4_hourly...


1it [00:00,  5.41it/s]
1it [00:00,  3.40it/s]
1it [00:00,  3.85it/s]
1it [00:00,  1.55it/s]
1it [00:00,  2.86it/s]
1it [00:00,  5.74it/s]
1it [00:00,  3.85it/s]
1it [00:00,  4.27it/s]
1it [00:00,  1.85it/s]
1it [00:00,  2.98it/s]
  6%|▌         | 243/4227 [00:00<00:01, 2426.24it/s]

Calculating scores and creating plots for m4_daily...
Loading test data


100%|██████████| 4227/4227 [00:02<00:00, 1464.68it/s]
100%|██████████| 4227/4227 [03:16<00:00, 21.48it/s]
9it [00:00,  9.14it/s]
9it [00:01,  6.99it/s]
9it [00:01,  5.18it/s]
9it [00:01,  5.66it/s]
9it [00:01,  4.88it/s]
 33%|███▎      | 119/359 [00:00<00:00, 1187.09it/s]

Calculating scores and creating plots for m4_weekly...
Loading test data


100%|██████████| 359/359 [00:00<00:00, 1261.42it/s]
100%|██████████| 359/359 [00:11<00:00, 31.24it/s]
1it [00:00,  8.54it/s]
1it [00:00,  7.24it/s]
1it [00:00,  5.38it/s]
1it [00:00,  6.30it/s]
1it [00:00,  6.03it/s]


Calculating scores and creating plots for m4_monthly...


94it [00:12,  7.54it/s]
94it [00:16,  5.64it/s]
94it [00:20,  4.61it/s]
94it [00:21,  4.36it/s]
94it [00:22,  4.25it/s]
94it [00:12,  7.67it/s]
94it [00:16,  5.64it/s]
94it [00:20,  4.57it/s]
94it [00:21,  4.38it/s]
94it [00:21,  4.28it/s]
94it [00:12,  7.75it/s]
94it [00:16,  5.65it/s]
94it [00:20,  4.58it/s]
94it [00:21,  4.37it/s]
94it [00:21,  4.32it/s]


Calculating scores and creating plots for m4_quarterly...


47it [00:04, 10.56it/s]
47it [00:05,  8.19it/s]
47it [00:08,  5.32it/s]
47it [00:06,  7.27it/s]
47it [00:07,  6.60it/s]
47it [00:04, 10.31it/s]
47it [00:06,  7.75it/s]
47it [00:08,  5.30it/s]
47it [00:06,  7.60it/s]
47it [00:07,  6.11it/s]
  1%|          | 250/23000 [00:00<00:09, 2495.17it/s]

Calculating scores and creating plots for m4_yearly...
Loading test data


100%|██████████| 23000/23000 [00:09<00:00, 2315.77it/s]
100%|██████████| 23000/23000 [12:25<00:00, 30.86it/s]
45it [00:05,  8.57it/s]
45it [00:06,  7.25it/s]
45it [00:09,  4.65it/s]
45it [00:06,  6.88it/s]
45it [00:07,  6.06it/s]


In [10]:
suffix_to_transformation = {"trend_str_inc": " (f=100)", "trend_str_dec": " (f=0.01)",
                            "slope_inc": " (m=1)", "slope_dec": " (m=-1)",
                            "lin_inc": " (h=100)", "lin_dec": " (h=0.01)",
                            "seas_inc": " (k=100)", "seas_dec": " (k=0.01)"}

df = pd.DataFrame()
for dataset in scores_dict.keys():
    for suffix in dataset_suffixes[dataset]:
        # create a multi index for each transformation in the dataset, with one row for original data and one row for generated data
        row_name = dataset + suffix_to_transformation[suffix]
        tuples = [(row_name, "original test set"), (row_name, ("generated test set"))]
        index = pd.MultiIndex.from_tuples(tuples)
        
        columns = []
        old_original_scores = []
        new_original_scores = []
        old_generated_scores = []
        new_generated_scores = []
        for model in scores_dict[dataset]:
            old_model_col = (model, "old")
            new_model_col =  (model, "augmented")
            columns.append(old_model_col)
            columns.append(new_model_col)
            
            old_original_scores.append(scores_dict[dataset][model][suffix]["original model"]["orig"])
            old_generated_scores.append(scores_dict[dataset][model][suffix]["original model"]["ood"])
            new_original_scores.append(scores_dict[dataset][model][suffix]["new model"]["orig"])
            new_generated_scores.append(scores_dict[dataset][model][suffix]["new model"]["ood"])
        
        
        zipped_original = []
        for old, new in zip(old_original_scores, new_original_scores):
            zipped_original.append(np.round(old, 3))
            
            percentage = np.round((np.abs(old - new) / old) * 100, 3)
            percentage_str = "+" + str(percentage) if new >= old else "-" + str(percentage)
            new = f"{np.round(new, 3)} ({percentage_str}%)"
            zipped_original.append(new)
            
            
        zipped_generated = []
        for old, new in zip(old_generated_scores, new_generated_scores):
            zipped_generated.append(np.round(old, 3))
            
            percentage = np.round((np.abs(old - new) / old) * 100, 3)
            percentage_str = "+" + str(percentage) if new >= old else "-" + str(percentage)
            new = f"{np.round(new, 3)} ({percentage_str}%)"
            zipped_generated.append(new)
        
        suffix_df = pd.DataFrame(np.vstack([zipped_original, zipped_generated]), columns=columns)
        suffix_df.index = index
        suffix_df.columns = pd.MultiIndex.from_tuples(suffix_df.columns)
        df = pd.concat([df, suffix_df])

In [11]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Fully-connected,Fully-connected,LSTM,LSTM,N-BEATS,N-BEATS,TCN,TCN,Transformer,Transformer
Unnamed: 0_level_1,Unnamed: 1_level_1,old,augmented,old,augmented,old,augmented,old,augmented,old,augmented
electricity_nips (k=0.01),original test set,0.121,0.138 (+14.624%),0.116,0.121 (+4.182%),0.096,0.099 (+3.003%),0.115,0.14 (+21.752%),0.111,0.117 (+5.53%)
electricity_nips (k=0.01),generated test set,0.087,0.069 (-20.622%),0.109,0.097 (-11.105%),0.086,0.062 (-27.453%),0.095,0.084 (-10.907%),0.095,0.079 (-16.837%)
electricity_nips (m=-1),original test set,0.121,0.112 (-7.42%),0.116,0.115 (-1.041%),0.096,0.097 (+1.202%),0.115,0.122 (+5.976%),0.111,0.114 (+2.886%)
electricity_nips (m=-1),generated test set,1.372,0.535 (-60.974%),1.392,1.079 (-22.473%),0.823,0.522 (-36.667%),1.183,0.695 (-41.278%),1.276,0.617 (-51.666%)
electricity_nips (m=1),original test set,0.121,0.109 (-9.738%),0.116,0.122 (+4.889%),0.096,0.097 (+1.374%),0.115,0.127 (+10.712%),0.111,0.12 (+8.19%)
electricity_nips (m=1),generated test set,0.194,0.053 (-72.449%),0.226,0.109 (-51.589%),0.123,0.056 (-55.038%),0.203,0.08 (-60.759%),0.167,0.081 (-51.553%)
traffic_nips (m=1),original test set,0.168,0.173 (+2.69%),0.186,0.201 (+8.111%),0.089,0.095 (+6.914%),0.171,0.224 (+31.106%),0.207,0.563 (+172.249%)
traffic_nips (m=1),generated test set,0.156,0.067 (-57.065%),0.255,0.204 (-19.94%),0.123,0.055 (-55.549%),0.327,0.147 (-55.149%),0.409,0.287 (-29.751%)
traffic_nips (f=0.01),original test set,0.168,0.159 (-5.34%),0.186,0.173 (-6.947%),0.089,0.095 (+6.851%),0.171,0.398 (+132.73%),0.207,0.555 (+168.215%)
traffic_nips (f=0.01),generated test set,0.257,0.239 (-6.942%),0.524,0.432 (-17.654%),0.225,0.219 (-2.309%),0.35,0.337 (-3.588%),0.642,0.55 (-14.414%)


In [12]:
print(df.to_latex(multirow=True, column_format="|ll|cc|cc|cc|cc|cc|", multicolumn_format="c"))

\begin{tabular}{|ll|cc|cc|cc|cc|cc|}
\toprule
                   &                    & \multicolumn{2}{c}{Fully-connected} & \multicolumn{2}{c}{LSTM} & \multicolumn{2}{c}{N-BEATS} & \multicolumn{2}{c}{TCN} & \multicolumn{2}{c}{Transformer} \\
                   &                    &             old &         augmented &    old &         augmented &     old &         augmented &    old &         augmented &         old &          augmented \\
\midrule
\multirow{2}{*}{electricity\_nips (k=0.01)} & original test set &           0.121 &  0.138 (+14.624\%) &  0.116 &   0.121 (+4.182\%) &   0.096 &   0.099 (+3.003\%) &  0.115 &   0.14 (+21.752\%) &       0.111 &     0.117 (+5.53\%) \\
                   & generated test set &           0.087 &  0.069 (-20.622\%) &  0.109 &  0.097 (-11.105\%) &   0.086 &  0.062 (-27.453\%) &  0.095 &  0.084 (-10.907\%) &       0.095 &   0.079 (-16.837\%) \\
\cline{1-12}
\multirow{2}{*}{electricity\_nips (m=-1)} & original test set &           0.121 &    0.1

In [13]:
average_changes_per_transform = {dataset: {} for dataset in dataset_suffixes.keys()}
std_changes_per_transform = {dataset: {} for dataset in dataset_suffixes.keys()}
median_changes_per_transform = {dataset: {} for dataset in dataset_suffixes.keys()}

old_degen_average_changes_per_transform = {dataset: {} for dataset in dataset_suffixes.keys()}
old_degen_std_changes_per_transform = {dataset: {} for dataset in dataset_suffixes.keys()}
old_degen_median_changes_per_transform = {dataset: {} for dataset in dataset_suffixes.keys()}

per_model_percentages = {model_name: {"original": [], "generated": [], "ood_degen": []} for model_name in model_names}

all_old = []
all_generated = []
all_ood_degen = []
for dataset in scores_dict.keys():
    for suffix in dataset_suffixes[dataset]:
        old_percentages = []
        generated_percentages = []
        ood_degen_percentages = []
        for model in scores_dict[dataset]:
            old_original_score = scores_dict[dataset][model][suffix]["original model"]["orig"]
            old_generated_score = scores_dict[dataset][model][suffix]["original model"]["ood"]
            new_original_score = scores_dict[dataset][model][suffix]["new model"]["orig"]
            new_generated_score = scores_dict[dataset][model][suffix]["new model"]["ood"]
            
            old_percentage = (np.abs(old_original_score - new_original_score) / old_original_score) * 100
            generated_percentage = (np.abs(old_generated_score - new_generated_score) / old_generated_score) * 100
            ood_degeneration_percentage = (np.abs(old_original_score - old_generated_score) / old_original_score) * 100
            
            old_percentage = old_percentage if new_original_score >= old_original_score else -old_percentage
            generated_percentage = generated_percentage if new_generated_score >= old_generated_score else -generated_percentage
            ood_degeneration_percentage = ood_degeneration_percentage if old_generated_score >= old_original_score else -ood_degeneration_percentage
            
            old_percentages.append(old_percentage)
            generated_percentages.append(generated_percentage)
            ood_degen_percentages.append(ood_degeneration_percentage)
            
            per_model_percentages[model]["original"].append(old_percentage)
            per_model_percentages[model]["generated"].append(generated_percentage)
            per_model_percentages[model]["ood_degen"].append(ood_degeneration_percentage)
            
            all_old.append(old_percentage)
            all_generated.append(generated_percentage)
            all_ood_degen.append(ood_degeneration_percentage)

        average_changes_per_transform[dataset][suffix] = {"original test set": np.mean(old_percentages), "generated test set": np.mean(generated_percentages)}
        std_changes_per_transform[dataset][suffix] = {"original test set": np.std(old_percentages), "generated test set": np.std(generated_percentages)}
        median_changes_per_transform[dataset][suffix] = {"original test set": np.median(old_percentages), "generated test set": np.median(generated_percentages)}
        
        old_degen_average_changes_per_transform[dataset][suffix] = np.mean(ood_degen_percentages)
        old_degen_std_changes_per_transform[dataset][suffix] = np.std(ood_degen_percentages)
        old_degen_median_changes_per_transform[dataset][suffix] = np.median(ood_degen_percentages)


average_per_model_percentages = {model_name: {"original": [], "generated": [], "ood": []} for model_name in model_names}
std_per_model_percentages = {model_name: {"original": [], "generated": [], "ood": []} for model_name in model_names}
median_per_model_percentages = {model_name: {"original": [], "generated": [], "ood": []} for model_name in model_names} 
for model in per_model_percentages:
    average_per_model_percentages[model]["original"] = np.mean(per_model_percentages[model]["original"])
    average_per_model_percentages[model]["generated"] = np.mean(per_model_percentages[model]["generated"])
    average_per_model_percentages[model]["ood"] = np.mean(per_model_percentages[model]["ood_degen"])
    
    std_per_model_percentages[model]["original"] = np.std(per_model_percentages[model]["original"])
    std_per_model_percentages[model]["generated"] = np.std(per_model_percentages[model]["generated"])
    std_per_model_percentages[model]["ood"] = np.mean(per_model_percentages[model]["ood_degen"])
    
    median_per_model_percentages[model]["original"] = np.median(per_model_percentages[model]["original"])
    median_per_model_percentages[model]["generated"] = np.median(per_model_percentages[model]["generated"])
    median_per_model_percentages[model]["ood"] = np.mean(per_model_percentages[model]["ood_degen"])

# Differences between the augmented and the old models

In [14]:
for dataset in average_changes_per_transform:
    for suffix in average_changes_per_transform[dataset]:
        print(dataset)
        print(f"\t{suffix_to_transformation[suffix]}")
        print(f"\t\tAverage percentage change on original test set:  {np.round(average_changes_per_transform[dataset][suffix]['original test set'], 3)}")
        print(f"\t\tAverage percentage change on generated test set: {np.round(average_changes_per_transform[dataset][suffix]['generated test set'], 3)}")
        print()
        print(f"\t\tStandard deviation of percentage change on original test set:  {np.round(std_changes_per_transform[dataset][suffix]['original test set'], 3)}")
        print(f"\t\tStandard deviation of percentage change on generated test set: {np.round(std_changes_per_transform[dataset][suffix]['generated test set'], 3)}")
        print()
        print(f"\t\tMedian percentage change on original test set:  {np.round(median_changes_per_transform[dataset][suffix]['original test set'], 3)}")
        print(f"\t\tMedian percentage change on generated test set: {np.round(median_changes_per_transform[dataset][suffix]['generated test set'], 3)}")

electricity_nips
	 (k=0.01)
		Average percentage change on original test set:  9.818
		Average percentage change on generated test set: -17.385

		Standard deviation of percentage change on original test set:  7.24
		Standard deviation of percentage change on generated test set: 6.222

		Median percentage change on original test set:  5.53
		Median percentage change on generated test set: -16.837
electricity_nips
	 (m=-1)
		Average percentage change on original test set:  0.321
		Average percentage change on generated test set: -42.612

		Standard deviation of percentage change on original test set:  4.497
		Standard deviation of percentage change on generated test set: 13.129

		Median percentage change on original test set:  1.202
		Median percentage change on generated test set: -41.278
electricity_nips
	 (m=1)
		Average percentage change on original test set:  3.085
		Average percentage change on generated test set: -58.277

		Standard deviation of percentage change on original tes

In [15]:
for model in average_per_model_percentages:
    print(model)
    print(f"\tAverage percentage change on original test set:  {np.round(average_per_model_percentages[model]['original'], 3)}")
    print(f"\tAverage percentage change on generated test set: {np.round(average_per_model_percentages[model]['generated'], 3)}")
    print()
    print(f"\tStandard deviation of percentage change on original test set:  {np.round(std_per_model_percentages[model]['original'], 3)}")
    print(f"\tStandard deviation of percentage change on generated test set: {np.round(std_per_model_percentages[model]['generated'], 3)}")
    print()
    print(f"\tMedian percentage change on original test set:  {np.round(median_per_model_percentages[model]['original'], 3)}")
    print(f"\tMedian percentage change on generated test set: {np.round(median_per_model_percentages[model]['generated'], 3)}")

Fully-connected
	Average percentage change on original test set:  3.43
	Average percentage change on generated test set: -39.286

	Standard deviation of percentage change on original test set:  7.139
	Standard deviation of percentage change on generated test set: 23.687

	Median percentage change on original test set:  2.69
	Median percentage change on generated test set: -42.231
LSTM
	Average percentage change on original test set:  4.169
	Average percentage change on generated test set: -31.866

	Standard deviation of percentage change on original test set:  6.53
	Standard deviation of percentage change on generated test set: 21.127

	Median percentage change on original test set:  4.182
	Median percentage change on generated test set: -22.473
N-BEATS
	Average percentage change on original test set:  5.76
	Average percentage change on generated test set: -38.025

	Standard deviation of percentage change on original test set:  7.588
	Standard deviation of percentage change on generate

In [16]:
print(f"Average percentage change on original test set:  {np.round(np.mean(all_old), 3)}")
print(f"Average percentage change on generated test set: {np.round(np.mean(all_generated), 3)}")
print()
print(f"Standard deviation of percentage change on original test set:  {np.round(np.std(all_old), 3)}")
print(f"Standard deviation of percentage change on generated test set: {np.round(np.std(all_generated), 3)}")
print()
print(f"Median percentage change on original test set:  {np.round(np.median(all_old), 3)}")
print(f"Median percentage change on generated test set: {np.round(np.median(all_generated), 3)}")

Average percentage change on original test set:  13.839
Average percentage change on generated test set: -35.475

Standard deviation of percentage change on original test set:  32.255
Standard deviation of percentage change on generated test set: 21.513

Median percentage change on original test set:  5.976
Median percentage change on generated test set: -29.751


# How much did the performance of the original models degenerate when faced with OOD data?

In [17]:
for dataset in average_changes_per_transform:
    for suffix in average_changes_per_transform[dataset]:
        print(dataset)
        print(f"\t{suffix_to_transformation[suffix]}")
        print(f"\t\tAverage OOD percentage change:  {np.round(old_degen_average_changes_per_transform[dataset][suffix], 3)}")
        print(f"\t\tStandard deviation of OOD percentage change:  {np.round(old_degen_std_changes_per_transform[dataset][suffix], 3)}")
        print(f"\t\tMedian OOD percentage change:  {np.round(old_degen_median_changes_per_transform[dataset][suffix], 3)}")

electricity_nips
	 (k=0.01)
		Average OOD percentage change:  -15.425
		Standard deviation of OOD percentage change:  7.205
		Median OOD percentage change:  -14.879
electricity_nips
	 (m=-1)
		Average OOD percentage change:  974.4
		Standard deviation of OOD percentage change:  122.014
		Median OOD percentage change:  1036.313
electricity_nips
	 (m=1)
		Average OOD percentage change:  62.216
		Standard deviation of OOD percentage change:  22.779
		Median OOD percentage change:  60.757
traffic_nips
	 (m=1)
		Average OOD percentage change:  51.612
		Standard deviation of OOD percentage change:  38.671
		Median OOD percentage change:  39.193
traffic_nips
	 (f=0.01)
		Average OOD percentage change:  140.633
		Standard deviation of OOD percentage change:  56.104
		Median OOD percentage change:  153.605
m4_hourly
	 (m=-1)
		Average OOD percentage change:  1008.454
		Standard deviation of OOD percentage change:  190.645
		Median OOD percentage change:  947.977
m4_hourly
	 (m=1)
		Average OOD 

In [18]:
for model in average_per_model_percentages:
    print(model)
    print(f"\tAverage percentage change on OOD:  {np.round(average_per_model_percentages[model]['ood'], 3)}")
    print(f"\tStandard deviation of percentage change on OOD:  {np.round(std_per_model_percentages[model]['ood'], 3)}")
    print(f"\tMedian percentage change on OOD:  {np.round(median_per_model_percentages[model]['ood'], 3)}")

Fully-connected
	Average percentage change on OOD:  517.243
	Standard deviation of percentage change on OOD:  517.243
	Median percentage change on OOD:  517.243
LSTM
	Average percentage change on OOD:  545.131
	Standard deviation of percentage change on OOD:  545.131
	Median percentage change on OOD:  545.131
N-BEATS
	Average percentage change on OOD:  570.03
	Standard deviation of percentage change on OOD:  570.03
	Median percentage change on OOD:  570.03
TCN
	Average percentage change on OOD:  541.245
	Standard deviation of percentage change on OOD:  541.245
	Median percentage change on OOD:  541.245
Transformer
	Average percentage change on OOD:  529.351
	Standard deviation of percentage change on OOD:  529.351
	Median percentage change on OOD:  529.351


In [19]:
print(f"Average percentage change on OOD: {np.round(np.mean(all_ood_degen), 3)}")
print(f"Standard deviation of percentage change on OOD: {np.round(np.std(all_ood_degen), 3)}")
print(f"Median percentage change on OOD: {np.round(np.median(all_ood_degen), 3)}")

Average percentage change on OOD: 540.6
Standard deviation of percentage change on OOD: 609.38
Median percentage change on OOD: 181.734
