# In [None]:
# IMPORTS
import datetime as dt
import os
import sys
import warnings

from pathlib import Path
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from river import anomaly, preprocessing, utils
from river.metrics import F1, Precision, Recall, MacroF1, ClassificationReport
from sklearn.decomposition import PCA

sys.path. insert(1, str(Path().resolve().parent))
from functions.anomaly import ConditionalGaussianScorer
from functions.compose import build_model, convert_to_nested_dict
from functions.proba import MultivariateGaussian
from functions.evaluate import print_stats, progressive_val_predict

# FUNCTIONS
def tune_train_model(steps, df, val_kwargs: dict = None, **params):
    """Objective function for hyper-parameter search: MacroF1 of one model.

    Builds a model from ``steps`` and the flat ``params`` (converted with
    ``convert_to_nested_dict``), runs ``progressive_val_predict`` over ``df``
    and returns the resulting MacroF1 score.

    Args:
        steps: Pipeline description forwarded to ``build_model``.
        df: Data forwarded to ``progressive_val_predict``.
        val_kwargs: Optional extra keyword arguments for
            ``progressive_val_predict``. The mapping is never modified.
        **params: Flat hyper-parameters; entries nested under ``"Val"``
            after conversion are passed to ``progressive_val_predict`` and
            override ``val_kwargs``.

    Returns:
        The MacroF1 value, or ``0`` when evaluation raised an exception.
    """
    params = convert_to_nested_dict(params)
    model = build_model(steps, params)
    metrics = [MacroF1()]
    try:
        # Merge into a fresh dict instead of ``val_kwargs.update(...)``:
        # updating in place would leak "Val" settings of one call into the
        # next one (shared default / shared caller dict).
        run_kwargs = {**(val_kwargs or {}), **params.get("Val", {})}
        progressive_val_predict(
            model, df, metrics, print_every=0, print_final=False,
            **run_kwargs)

        return metrics[0].get()
    except Exception as e:
        # Deliberate best-effort: a parameter combination that fails is
        # reported and scored 0 so the search can carry on.
        print(e)
        return 0


def get_random_samples(df: pd.DataFrame, num_samples=10000):
    """Return at most ``num_samples`` rows of ``df``.

    A frame that already fits is returned as-is (same object); a larger one
    is down-sampled with a fixed seed (``random_state=42``) so repeated calls
    pick the same rows.
    """
    needs_sampling = len(df) > num_samples
    if not needs_sampling:
        return df
    return df.sample(n=num_samples, random_state=42)


def plot_detection(df: pd.DataFrame, y_pred):
    """Scatter-plot true and predicted anomalies of ``df`` in two dimensions.

    Nothing is drawn when ``df`` has no ``'anomaly'`` column. Otherwise at
    most 10000 rows are sampled (``get_random_samples``) and plotted on the
    current matplotlib axes:

    * with two or more feature columns, the features are reduced to two
      principal components;
    * with a single feature column, the index is used as the x axis and the
      feature as the y axis.

    Rows with ``anomaly == 0`` are drawn as default markers, rows with
    ``anomaly == 1`` as hollow red circles and rows with ``pred == 1`` as
    crosses.

    Args:
        df: Feature columns plus an ``'anomaly'`` label column. The frame is
            not modified.
        y_pred: Predictions, one per row of ``df``.

    Raises:
        ValueError: If ``df`` has no feature column besides the labels.
    """
    if 'anomaly' not in df.columns:
        return

    # ``assign`` returns a new frame, so the caller's ``df`` does not gain a
    # 'pred' column that could later be mistaken for a feature.
    df = get_random_samples(df.assign(pred=y_pred))
    features = df.drop(columns=['anomaly', 'pred'])

    if features.shape[1] >= 2:
        # Apply PCA to reduce the feature columns to 2 components
        components = PCA(n_components=2).fit_transform(features)
        df_pca = pd.DataFrame(components, columns=['PC1', 'PC2'])
    elif features.shape[1] == 1:
        # Single feature: plot it against the index. Columns are selected by
        # name rather than by position, so the column order of ``df`` does
        # not matter.
        df_pca = pd.DataFrame({
            'PC1': df.index.to_numpy(),
            'PC2': features.iloc[:, 0].to_numpy(),
        })
    else:
        raise ValueError("df has no feature columns to plot")
    df_pca['anomaly'] = df['anomaly'].to_numpy()
    df_pca['pred'] = df['pred'].to_numpy()

    # Plot the 2D scatter plot
    normal = df_pca[df_pca['anomaly'] == 0]
    anomalous = df_pca[df_pca['anomaly'] == 1]
    flagged = df_pca[df_pca['pred'] == 1]
    plt.scatter(normal['PC1'], normal['PC2'])
    plt.scatter(anomalous['PC1'], anomalous['PC2'],
                facecolors='none', edgecolors='r', linewidths=0.5)
    plt.scatter(flagged['PC1'], flagged['PC2'], marker='x', linewidths=1)
    plt.xticks(())
    plt.yticks(())


def save_results_y(df, y_pred, change_point, path):
    """Write true labels, predictions and change points to ``<path>/y.csv``.

    The CSV has the columns ``true`` (``df.anomaly``), ``pred`` (``y_pred``
    cast to int) and ``change`` (``change_point``) and no index column.

    Args:
        df: Frame providing the ``anomaly`` column.
        y_pred: Predictions; must be castable to int.
        change_point: Change-point flags.
        path: Output directory; created (including parents) if missing.
    """
    df_y = pd.concat(
        [
            pd.Series(df.anomaly.values),
            pd.Series(y_pred).astype(int),
            pd.Series(change_point),
        ],
        axis=1,
    )
    df_y.columns = ['true', 'pred', 'change']

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)
    df_y.to_csv(os.path.join(path, "y.csv"), index=False)


def save_results_metrics(metrics_res, path):
    """Write the metrics table to ``<path>/metrics.csv``.

    Args:
        metrics_res: DataFrame of metric values; its index is written as the
            first CSV column.
        path: Output directory; created (including parents) if missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(path, exist_ok=True)
    metrics_res.to_csv(os.path.join(path, "metrics.csv"))

# MODS
class QuantileFilter(anomaly.QuantileFilter):
    """``anomaly.QuantileFilter`` with a keyword-friendly constructor and a
    boolean ``predict_one``.

    Args:
        anomaly_detector: Wrapped detector, forwarded to the parent class.
        q: Quantile level, forwarded to the parent class.
        protect_anomaly_detector: Forwarded to the parent class.
    """

    def __init__(self, anomaly_detector, q: float,
                 protect_anomaly_detector=True):
        super().__init__(
            anomaly_detector=anomaly_detector,
            protect_anomaly_detector=protect_anomaly_detector,
            q=q,
        )

    def predict_one(self, *args):
        """Return whether the score of the sample reaches the quantile.

        The arguments are passed unchanged to ``score_one``.
        """
        score = self.score_one(*args)
        # A falsy quantile (None or 0) is replaced by +inf, so nothing is
        # flagged in that case.
        threshold = self.quantile.get() or np.inf
        return score >= threshold

# SETTINGS

# DETECTION ALGORITHMS
# Each entry: (display name, steps passed to build_model, parameter bounds
# passed to BayesianOptimization). An optional fourth element, when present,
# is used by the run loop as extra keyword arguments for
# progressive_val_predict.
detection_algorithms = [
    (
        "Half-Space Trees",
        [preprocessing.MinMaxScaler, [QuantileFilter, anomaly.HalfSpaceTrees]],
        {
            "QuantileFilter__q": (0.9, 0.99735),
            "HalfSpaceTrees__n_trees__round": (5, 15),
        },
    ),
    (
        "Conditional Gaussian Scorer",
        [[ConditionalGaussianScorer, [utils.Rolling, MultivariateGaussian]]],
        {
            "Rolling__window_size__round": (150, 500),
            "ConditionalGaussianScorer__grace_period__round": (50, 1000),
            "ConditionalGaussianScorer__threshold": (0.80, 0.99935),
            # NOTE(review): this key ends in "__int" while the others use
            # "__round" - confirm convert_to_nested_dict handles both.
            "ConditionalGaussianScorer__t_a__int": (50, 1000),
        },
    ),
    (
        "One-Class SVM",
        [preprocessing.StandardScaler, [QuantileFilter, anomaly.OneClassSVM]],
        {
            "QuantileFilter__q": (0.9, 0.99735),
        },
    ),
]

# DATASETS
def _read_indexed_csv(csv_path):
    """Read ``csv_path`` with its first column as index; drop rows with NaN."""
    return pd.read_csv(csv_path, index_col=0).dropna(axis=0)


def _read_archive_window(csv_path):
    """Read ``csv_path`` and keep the last 20000 of its first 80000 rows."""
    return pd.read_csv(csv_path).head(80000).tail(20000)


# Each entry describes one benchmark:
#   name        - label used in console output and result paths
#   data        - the loaded DataFrame
#   anomaly_col - label column name, or a Series holding the labels
#   drop        - column(s) removed before modelling, or None
datasets = [
    {
        "name": "Load Balancing",
        "data": _read_indexed_csv("data/load_balancing.csv"),
        "anomaly_col": "anomaly",
        "drop": "MPC:Request Status Code",
    },
    {
        "name": "YAHOO",
        "data": _read_indexed_csv("data/multivariate/yahoo_sub_5.csv"),
        "anomaly_col": "is_anomaly",
        "drop": None,
    },
    {
        "name": "SKAB",
        "data": _read_indexed_csv("data/multivariate/alldata_skab.csv"),
        "anomaly_col": "is_anomaly",
        "drop": "changepoint",
    },
    {
        "name": "Room Occupancy",
        "data": _read_indexed_csv(
            "data/multivariate/Occupancy/room-occupancy-1.test.csv"),
        "anomaly_col": "is_anomaly",
        "drop": None,
    },
    {
        "name": "Archive",
        "data": _read_archive_window(
            "data/multivariate/archive/TimeSeries.csv"),
        # Labels live in a separate file; the same row window is taken so
        # they line up with the data rows.
        "anomaly_col": _read_archive_window(
            "data/multivariate/archive/labelsTimeSeries.csv")['label'],
        "drop": None,
    },
]

# METRICS
# Prototype metrics; the run loop clones them for every evaluation.
# ClassificationReport has to stay last: the run loop pops it off the end.
metrics = [Precision(), Recall(), F1(), ClassificationReport()]

# Result table: one row per algorithm, columns in the order the run loop
# fills them (plain metrics, then macro and weighted averages).
_plain_columns = ['Precision', 'Recall', 'F1']
_macro_columns = ['MacroPrecision', 'MacroRecall', 'MacroF1']
_weighted_columns = ['WeightedPrecision', 'WeightedRecall', 'WeightedF1']
metrics_res = pd.DataFrame(
    columns=_plain_columns + _macro_columns + _weighted_columns)

# PLOT CONFIG
# The subplot grid has one column per detection algorithm (and one row per
# dataset), so the figure width follows the number of algorithms instead of
# a hard-coded three-element placeholder list.
plt.figure(figsize=(len(detection_algorithms) * 2 + 4, 12.5))
plt.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01
)
# 1-based index of the next subplot to draw into.
plot_num = 1

# RUN
# For every dataset: normalise it, then for every algorithm tune the
# hyper-parameters, evaluate the tuned model, store predictions and metrics
# under .results/ and draw one subplot.

# The optimisation log is written under .results/ before any result is
# saved, so the directory has to exist up front.
os.makedirs(".results", exist_ok=True)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for dataset in datasets:
        # PREPROCESS DATA
        # Copy so the frame stored in `datasets` keeps its original index
        # and columns.
        df = dataset["data"].copy()
        # Synthetic one-minute-spaced timestamps starting now.
        start = dt.datetime.now().replace(microsecond=0)
        df.index = pd.to_timedelta(range(0, len(df)), 'min') + start
        # Normalise the label column to the name "anomaly".
        anomaly_col = dataset["anomaly_col"]
        if isinstance(anomaly_col, str):
            df = df.rename(columns={anomaly_col: "anomaly"})
        elif isinstance(anomaly_col, pd.Series):
            df['anomaly'] = anomaly_col.values
        if dataset["drop"] is not None:
            df = df.drop(columns=dataset["drop"])
        print(f"\n=== {dataset['name']} === [{df['anomaly'].sum()}/{len(df)}]"
              .ljust(80, '='))
        dataset_dir = f".results/{dataset['name']}"
        # RUN EACH MODEL AGAINST DATASET
        for alg in detection_algorithms:
            print(f"\n===== {alg[0]}".ljust(80, '='))
            # TUNE HYPERPARAMETERS
            pbounds = alg[2]
            mod_fun = partial(tune_train_model, alg[1], df, {})
            optimizer = BayesianOptimization(
                f=mod_fun,
                pbounds=pbounds,
                verbose=2,
                random_state=1,
                allow_duplicate_points=True
            )
            logger = JSONLogger(
                path=f"./.results/{dataset['name']}-{alg[0]}.log")
            optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
            optimizer.maximize(init_points=10, n_iter=10)
            params = convert_to_nested_dict(optimizer.max["params"])
            print(params)
            model = build_model(alg[1], params)
            # USE TUNED MODEL
            metrics_ = [metric.clone() for metric in metrics]
            # PROGRESSIVE PREDICT
            # Copy the optional per-algorithm kwargs so the tuned "Val"
            # parameters of one dataset do not stick to the algorithm entry.
            kwargs = dict(alg[3]) if len(alg) == 4 else {}
            kwargs.update(params.get("Val", {}))
            y_pred, change_point, _, _ = (
                progressive_val_predict(model, df, metrics_, print_every=0,
                                        **kwargs))

            # LOAD RESULTS
            #  Save
            save_results_y(df, y_pred, change_point,
                           f"{dataset_dir}/{alg[0]}")
            # The ClassificationReport is the last metric; its macro and
            # weighted averages fill the remaining columns of the row.
            cr = metrics_.pop(-1)

            metrics_res.loc[alg[0]] = (
                [metric.get() for metric in metrics_] + [
                    cr._macro_precision.get(),
                    cr._macro_recall.get(),
                    cr._macro_f1.get(),
                    cr._weighted_precision.get(),
                    cr._weighted_recall.get(),
                    cr._weighted_f1.get()
                    ])
            #  Print
            print_stats(df, y_pred, change_point)
            plt.subplot(len(datasets), len(detection_algorithms), plot_num)
            # Plot from a copy so plotting can never add columns to the
            # frame the next algorithm is tuned and evaluated on.
            plot_detection(df.copy(), y_pred)
            plot_num += 1
        save_results_metrics(metrics_res, dataset_dir)

plt.show()