In [None]:
# IMPORTS
import os
import pickle
import random
import sys
import warnings

from pathlib import Path
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from river import compose
from river import time_series
from river.metrics.custom import (
    MedAE,
    SAE,
    MeanRollingSAE,
    MeanRollingMAE,
    MeanRollingMedAE,
)

sys.path.insert(1, str(Path().resolve().parent))
from functions.compose import build_model, convert_to_nested_dict  # noqa: E402
from functions.evaluate import build_fit_evaluate, progressive_val_predict  # noqa: E402

sys.path.insert(
    1, str(Path().resolve().parent / "publications/ilustrate/pc2023")
)
from plot_matplotlib import set_size, locator, formatter  # noqa: E402

# CONSTANTS
# One seed for every random number generator touched by this script; it is
# also reused further down for the optimizer and the fitted models.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
random.seed(RANDOM_STATE)


# FUNCTIONS
def get_time_features(x):
    """Return the extra features derived from ``x``: currently none.

    ``x`` is neither read nor modified; a new empty dict is returned on every
    call.

    NOTE: a time-based variant was sketched here and is disabled. It popped
    ``"time"`` from ``x``, took its ordinal, and built
    ``exp(-(time.hour - hour) ** 2)`` for ``hour`` in 1..24.
    """
    return dict()


def save_model(model, path, name=None):
    """Pickle ``model`` to ``<path>/<name>.pkl``, creating ``path`` if needed.

    Parameters
    ----------
    model
        Any picklable object.
    path
        Target directory; missing parent directories are created.
    name
        File stem for the pickle. When omitted, the function falls back to
        ``alg[0]`` from the module-level run loop (the original behaviour),
        which only works while that loop variable exists.

    Raises
    ------
    NameError
        If ``name`` is omitted and no module-level ``alg`` is defined.
    """
    if name is None:
        # Legacy fallback: relies on the global loop variable of the run
        # section. Prefer passing ``name`` explicitly.
        name = alg[0]
    os.makedirs(path, exist_ok=True)
    with open(f"{path}/{name}.pkl", "wb") as f:
        pickle.dump(model, f)


def save_results_y(df_ys, path):
    """Write ``df_ys`` to ``<path>/ys.csv`` without the index column.

    ``path`` is created (including missing parents) when it does not exist.
    """
    os.makedirs(path, exist_ok=True)
    csv_target = f"{path}/ys.csv"
    df_ys.to_csv(csv_target, index=False)


# DETECTION ALGORITHMS
# Each row: (label, SNARIMAX keyword arguments held fixed, search bounds).
# The bounds dict is handed to BayesianOptimization as ``pbounds``; its keys
# are later unpacked by convert_to_nested_dict.
# NOTE(review): the "__round" suffix presumably marks integer-rounded
# parameters - confirm in functions.compose.
_SEARCH_SPACES = (
    ("AR", {"d": 0, "q": 0}, {"SNARIMAX__p__round": (1, 100)}),
    ("MA", {"d": 0, "p": 0}, {"SNARIMAX__q__round": (1, 100)}),
    (
        "ARI",
        {"q": 0},
        {
            "SNARIMAX__p__round": (1, 100),
            "SNARIMAX__d__round": (1, 100),
        },
    ),
    (
        "IMA",
        {"p": 0},
        {
            "SNARIMAX__d__round": (1, 100),
            "SNARIMAX__q__round": (1, 100),
        },
    ),
    (
        "ARIMA",
        {},
        {
            "SNARIMAX__p__round": (1, 100),
            "SNARIMAX__d__round": (0, 100),
            "SNARIMAX__q__round": (1, 100),
        },
    ),
    (
        "SNARIMAX",
        {},
        {
            "SNARIMAX__p__round": (0, 10),
            "SNARIMAX__d__round": (0, 10),
            "SNARIMAX__q__round": (0, 10),
            "SNARIMAX__m__round": (1, 10080),
            "SNARIMAX__sp__round": (0, 10),
            "SNARIMAX__sd__round": (0, 10),
            "SNARIMAX__sq__round": (0, 10),
        },
    ),
)

# Every algorithm shares the same two-step pipeline: the feature transformer
# followed by a SNARIMAX constructor with the row's fixed arguments applied.
detection_algorithms = [
    (
        label,
        [
            partial(compose.FuncTransformer, func=get_time_features),
            partial(time_series.SNARIMAX, **fixed_kwargs),
        ],
        bounds,
    )
    for label, fixed_kwargs, bounds in _SEARCH_SPACES
]


# DATASETS
# Load the normalised BESS measurements. The "Avg. Cell Temperature" column
# is exposed under the name "is_anomaly", which is the column the run section
# below treats as the target signal.
df = pd.read_csv("data/data_BESS_norm.csv", index_col=0).rename(
    columns={"Avg. Cell Temperature": "is_anomaly"}
)
# The first CSV column becomes a timezone-aware (UTC) datetime index.
df.index = pd.to_datetime(df.index, utc=True)

dataset = dict(
    name="terra",
    data=df,
    anomaly_col="is_anomaly",
    drop=None,
)

# FIGURE: 3 x 2 grid with shared axes, one panel per detection algorithm.
fig, axs = plt.subplots(
    3,
    2,
    figsize=set_size("thesis", subplots=(3 / 2.3, 2 / 2.3)),
    sharex=True,
    sharey=True,
)
axs = axs.flatten()
# RUN
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # PREPROCESS DATA
    # Normalise the target column name to "anomaly", whether it was given as
    # a column name or as a separate Series.
    df = dataset["data"]
    if isinstance(dataset["anomaly_col"], str):
        df = df.rename(columns={dataset["anomaly_col"]: "anomaly"})
    elif isinstance(dataset["anomaly_col"], pd.Series):
        df_y = dataset["anomaly_col"]
        df["anomaly"] = df_y.rename("anomaly").values
    if dataset["drop"] is not None:
        df = df.drop(columns=dataset["drop"])
    print(
        f"\n=== {dataset['name']} === [{sum(df['anomaly'])}/{len(df)}]".ljust(
            80, "="
        )
    )

    df_ys = df[["anomaly"]].copy()
    # Reference signal on a regular 1-minute grid. It is identical for every
    # panel, so it is computed once. "1min" replaces the deprecated "1t"
    # alias and is accepted by old and new pandas alike.
    y_true_1min = df_ys.resample("1min").asfreq().anomaly

    # JSONLogger appends to files below ./.results; make sure it exists.
    os.makedirs("./.results", exist_ok=True)

    # RUN EACH MODEL AGAINST DATASET
    for alg_no, alg in enumerate(detection_algorithms):
        print(f"\n===== {alg[0]}".ljust(80, "="))
        # INITIALIZE OPTIMIZER
        # Objective: build the pipeline from alg[1], fit/evaluate it on df.
        # NOTE(review): the same MeanRollingSAE instance is bound into every
        # objective evaluation - confirm build_fit_evaluate resets or clones
        # it between calls.
        mod_fun = partial(
            build_fit_evaluate,
            alg[1],
            df,
            metric=MeanRollingSAE(),
        )

        # TUNE HYPERPARAMETERS
        optimizer = BayesianOptimization(
            f=mod_fun,
            pbounds=alg[2],
            verbose=2,
            random_state=RANDOM_STATE,
            allow_duplicate_points=True,
        )
        logger = JSONLogger(path=f"./.results/{dataset['name']}-{alg[0]}.log")
        # JSONLogger writes a record on OPTIMIZATION_STEP events only;
        # subscribing it to OPTIMIZATION_END alone produced no log output.
        optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
        # Runs with the library's default init_points / n_iter.
        # Original author's note: "Best - 25 init points".
        optimizer.maximize()
        params = convert_to_nested_dict(optimizer.max["params"])
        print(params)
        model = build_model(alg[1], params)
        if hasattr(model, "seed"):
            model.seed = RANDOM_STATE  # type: ignore
        if hasattr(model, "random_state"):
            model.random_state = RANDOM_STATE  # type: ignore
        # USE TUNED MODEL
        # PROGRESSIVE PREDICT
        metrics = [MedAE(), MeanRollingSAE()]
        y_pred, _ = progressive_val_predict(model, df, metrics=metrics)

        # LOAD RESULTS
        ax = axs[alg_no]
        ax.plot(y_true_1min)
        ax.plot(pd.DataFrame(y_pred, index=df.index).resample("1min").asfreq())
        ax.set_ylim(0, 1)
        ax.set_title(
            f"{alg[0]} ({metrics[0].__class__.__name__}: {metrics[0].get():.2f})\n"
            f"{', '.join([f'{k}: {v}' for k,v in params['SNARIMAX'].items()])}"
        )
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        ax.tick_params(axis="x", labelrotation=50, labelsize=8)

fig.tight_layout()
fig.savefig("ARIMA_opt_results.pdf")

# Show frozen best models

In [None]:
# IMPORTS
import sys
import warnings

from pathlib import Path
from functools import partial

import matplotlib.pyplot as plt
import pandas as pd

from river import compose
from river.metrics import MAE

sys.path.insert(1, str(Path().resolve().parent))
from functions.compose import build_model, convert_to_nested_dict  # noqa: E402
from functions.evaluate import progressive_val_predict  # noqa: E402

sys.path.insert(
    1, str(Path().resolve().parent / "publications/ilustrate/pc2023")
)
from plot_matplotlib import set_size, locator, formatter  # noqa: E402


# DETECTION ALGORITHMS
# Same six model families as in the tuning cell, but the third tuple element
# holds one fixed value per parameter instead of a search range.
def _frozen_steps(**snarimax_fixed):
    """Return the two pipeline steps shared by every frozen model.

    The first step wraps ``get_time_features``; the second is a SNARIMAX
    constructor with ``snarimax_fixed`` pre-applied.
    """
    return [
        partial(compose.FuncTransformer, func=get_time_features),
        partial(time_series.SNARIMAX, **snarimax_fixed),
    ]


detection_algorithms = [
    ("AR", _frozen_steps(d=0, q=0), {"SNARIMAX__p__round": 2}),
    ("MA", _frozen_steps(d=0, p=0), {"SNARIMAX__q__round": 1}),
    (
        "ARI",
        _frozen_steps(q=0),
        {
            "SNARIMAX__p__round": 1,
            "SNARIMAX__d__round": 1,
        },
    ),
    (
        "IMA",
        _frozen_steps(p=0),
        {
            "SNARIMAX__d__round": 3,
            "SNARIMAX__q__round": 1,
        },
    ),
    (
        "ARIMA",
        _frozen_steps(),
        {
            "SNARIMAX__p__round": 1,
            "SNARIMAX__d__round": 3,
            "SNARIMAX__q__round": 8,
        },
    ),
    (
        "SNARIMAX",
        _frozen_steps(),
        {
            "SNARIMAX__p__round": 1,
            "SNARIMAX__d__round": 0,
            "SNARIMAX__q__round": 0,
            "SNARIMAX__m__round": 1440,
            "SNARIMAX__sp__round": 2,
            "SNARIMAX__sd__round": 2,
            "SNARIMAX__sq__round": 0,
        },
    ),
]


# DATASETS
# Reuses the ``df`` frame that is already in scope from the previous cell;
# nothing is re-read from disk here.
dataset = dict(
    name="terra",
    data=df,
    anomaly_col="is_anomaly",
    drop=None,
)

# FIGURE: 3 x 2 grid with shared axes, one panel per detection algorithm.
fig, axs = plt.subplots(
    3,
    2,
    figsize=set_size("thesis", subplots=(3 / 2.3, 2 / 2.3)),
    sharex=True,
    sharey=True,
)
axs = axs.flatten()
# RUN
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # PREPROCESS DATA
    df = dataset["data"]
    # df.index = pd.to_timedelta(
    #     range(0, len(df)), "T"
    # ) + pd.Timestamp.utcnow().replace(microsecond=0)
    # Normalise the target column name to "anomaly", whether it was given as
    # a column name or as a separate Series.
    if isinstance(dataset["anomaly_col"], str):
        df = df.rename(columns={dataset["anomaly_col"]: "anomaly"})
    elif isinstance(dataset["anomaly_col"], pd.Series):
        df_y = dataset["anomaly_col"]
        df["anomaly"] = df_y.rename("anomaly").values
    if dataset["drop"] is not None:
        df = df.drop(columns=dataset["drop"])
    print(
        f"\n=== {dataset['name']} === [{sum(df['anomaly'])}/{len(df)}]".ljust(
            80, "="
        )
    )

    df_ys = df[["anomaly"]].copy()
    # Reference signal on a regular 1-minute grid. It is identical for every
    # panel, so it is computed once. "1min" replaces the deprecated "1t"
    # alias and is accepted by old and new pandas alike.
    y_true_1min = df_ys.resample("1min").asfreq().anomaly

    # RUN EACH MODEL AGAINST DATASET
    for alg_no, alg in enumerate(detection_algorithms):
        print(f"\n===== {alg[0]}".ljust(80, "="))
        # BUILD MODEL FROM FROZEN PARAMETERS (no optimizer in this cell)
        params = convert_to_nested_dict(alg[2])
        print(params)
        model = build_model(alg[1], params)
        if hasattr(model, "seed"):
            model.seed = RANDOM_STATE  # type: ignore
        if hasattr(model, "random_state"):
            model.random_state = RANDOM_STATE  # type: ignore
        # USE TUNED MODEL
        # PROGRESSIVE PREDICT
        # Only metrics[0] (MedAE) is reported in the panel title below.
        metrics = [
            MedAE(),
            MAE(),
            MeanRollingMAE(),
            MeanRollingMedAE(),
            SAE(),
            MeanRollingSAE(),
        ]
        y_pred, _ = progressive_val_predict(
            model, df, metrics=metrics, period=5
        )

        # LOAD RESULTS
        ax = axs[alg_no]
        ax.plot(y_true_1min, linewidth=0.7)
        ax.plot(
            pd.DataFrame(y_pred, index=df.index).resample("1min").asfreq(),
            linewidth=0.7,
        )
        ax.set_ylim(0, 1)
        ax.set_title(
            f"{alg[0]} ({metrics[0].__class__.__name__}: {metrics[0].get():.2f})\n"
            f"{', '.join([f'{k}: {v}' for k,v in params['SNARIMAX'].items()])}"
        )
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        ax.tick_params(axis="x", labelrotation=50, labelsize=8)

fig.tight_layout()
# NOTE(review): same file name as the tuning cell's figure, so this overwrites
# it - confirm that is intended.
fig.savefig("ARIMA_opt_results.pdf")