In [79]:
import pandas as pd
import glob
from matplotlib import pyplot
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.lines import Line2D

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support

import warnings
import os
import datetime
import time
import funcy
import copy



In [35]:
# Install a global "ignore" filter: no category or module is given, so every
# warning raised after this cell runs is suppressed.
warnings.filterwarnings("ignore")

In [36]:
# this is helper method for TS rendering with datapoints
# visualize FP and FN on time series
def ts_confusion_visualization(data_test, pred_val, dataset, filename, modelname):
    """Plot a test time series and mark false positives / false negatives on it.

    The figure is saved to ``../results/imgs/{modelname}_{dataset}_{filename}.png``
    (the directory is created when missing) and then closed. Nothing is returned.

    Parameters
    ----------
    data_test : mapping of columns
        Must provide 'timestamp', 'value' and 'is_anomaly' columns, each
        supporting ``.tolist()`` (e.g. a pandas DataFrame).
    pred_val : sequence
        Predicted labels (1 = anomaly, 0 = normal), aligned with the rows of
        ``data_test``. Extra or missing trailing predictions are ignored.
    dataset, filename, modelname : str
        Only used to build the output file name.
    """
    timestamps = data_test['timestamp'].tolist()
    values = data_test['value'].tolist()
    true_val = data_test['is_anomaly'].tolist()

    # Try the known timestamp layouts in order. If none of them applies (for
    # example when the timestamps are plain numbers) the raw values are plotted.
    for fmt in ('%m/%d/%Y %H:%M', '%Y-%m-%d %H:%M:%S'):
        try:
            timestamps = [datetime.datetime.strptime(ts, fmt) for ts in timestamps]
            break
        except (ValueError, TypeError):
            continue

    # Collect the coordinates of the misclassified points. zip() stops at the
    # shortest input, so a prediction list of a different length cannot raise.
    fp_x, fp_y, fn_x, fn_y = [], [], [], []
    for ts, val, actual, predicted in zip(timestamps, values, true_val, pred_val):
        if actual == 0 and predicted == 1:
            fp_x.append(ts)
            fp_y.append(val)
        elif actual == 1 and predicted == 0:
            fn_x.append(ts)
            fn_y.append(val)

    fig, ax = plt.subplots()
    try:
        ax.plot(timestamps, values, color='grey', lw=0.5, zorder=0)
        ax.scatter(fp_x, fp_y, color='r', s=5, zorder=5)
        ax.scatter(fn_x, fn_y, color='y', s=5, zorder=5)

        # Legend handles mirror what is actually drawn: a grey series line and
        # marker-only entries for the FP / FN scatter points.
        legend_elements = [
            Line2D([0], [0], color='grey', lw=2, label='Correct'),
            Line2D([0], [0], marker='o', linestyle='None', color='r', markersize=5, label='FP'),
            Line2D([0], [0], marker='o', linestyle='None', color='y', markersize=5, label='FN'),
        ]
        ax.legend(handles=legend_elements, loc='best')

        out_path = f'../results/imgs/{modelname}_{dataset}_{filename}.png'
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        fig.savefig(out_path)
    finally:
        # Release only this figure, even when plotting or saving failed, so
        # figures owned by other cells are left untouched.
        plt.close(fig)

# Set hyperparameters for train flow

In [88]:
# Identifier of the evaluated model; it selects the prediction branch and is
# written into the stats rows and image file names.
model_name = 'sr'

# Window sizes (decide on windows with Lorena):
#   anomaly_window      -- rows scored per prediction step
#   test_window         -- rows per batch when computing batched metrics
#   data_in_memory_size -- maximum number of most recent rows kept for prediction
anomaly_window = test_window = 65
data_in_memory_size = 3000

# When True the in-memory buffer starts empty instead of holding the train split.
for_optimization = False

# Drift adaptation and entropy-based thresholding switches.
use_drift_adapt = False
drift_detector = None
use_entropy = False
threshold_type = 'dynamic' if use_entropy else 'static'

# TODO discuss averaging
# Class averaging type for evaluation metrics calculations.
# 'micro': calculate metrics globally by counting the total true positives,
# false negatives and false positives.
avg_type = 'micro'

# Create SR model

In [83]:
# import SR as a module from ../utils/sr/
import sys
!{sys.executable} -m pip install ../utils/sr/

Processing c:\users\oxifl\documents\uni\anomaly_detection\utils\sr
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: msanomalydetector
  Building wheel for msanomalydetector (setup.py): started
  Building wheel for msanomalydetector (setup.py): finished with status 'done'
  Created wheel for msanomalydetector: filename=msanomalydetector-1.2-cp37-cp37m-win_amd64.whl size=85901 sha256=6ac360b45f03e900fd8120a76a40d8a4146c992707ec7ea27c01ef7072bf5551
  Stored in directory: C:\Users\oxifl\AppData\Local\Temp\pip-ephem-wheel-cache-tx9o1gm5\wheels\55\14\b0\81eaf4ba37022705843ac4c1ee41c3a2d69ded4bc0815ed2e6
Successfully built msanomalydetector
Installing collected packages: msanomalydetector
  Attempting uninstall: msanomalydetector
    Found existing installation: msanomalydetector 1.2
    Uninstalling msanomalydetector-1.2:
      Successfully uninstalled msanomalydetector-1.2
Successfully installed msa

You should consider upgrading via the 'c:\Users\oxifl\AppData\Local\Programs\Python\Python37\python.exe -m pip install --upgrade pip' command.


In [84]:
from msanomalydetector import THRESHOLD, MAG_WINDOW, SCORE_WINDOW
from msanomalydetector import DetectMode
from msanomalydetector.spectral_residual import SpectralResidual

####################################################################################
# this code is taken from
# https://github.com/microsoft/anomalydetector
# with modifications to account for sliding windows and entropy thresholding
# the modified code is the work from:
# https://github.com/nata1y/tiny-anom-det/tree/main/models/sr
####################################################################################

# Initial SR parameters from the paper, in the order in which they are read
# when the model is constructed: (threshold, mag_window, score_window, sensitivity).
sr_model_params = (
    THRESHOLD,
    MAG_WINDOW,
    SCORE_WINDOW,
    99,  # sensitivity
)

# Run experiments

In [85]:
# an example of a general dataset path
# NOTE(review): this variable is not referenced by the visible cells; the
# experiment loop builds its own path from the dataset name.
path_files_yahoo = '../datasets/Yahoo_A1Benchmark/'

In [89]:
# Experiment driver: for every time series of every selected dataset, fit SR on
# the first half, predict the second half window by window, compute F1 for the
# anomaly class, append a row to the per-dataset stats CSV and save a FP/FN plot.
# Other available datasets: 'kpi', 'NAB_realAWSCloudwatch'
for dataset in ['Yahoo_A1Benchmark']:

    train_data_path = f'../datasets/{dataset}/'

    for filename in os.listdir(train_data_path):
        f = os.path.join(train_data_path, filename)
        data = pd.read_csv(f)

        filename = filename.replace('.csv', '')
        print(f'Working with current time series: {filename} in dataset {dataset}')

        # unify column names across datasets
        data.rename(columns={'timestamps': 'timestamp', 'anomaly': 'is_anomaly'}, inplace=True)
        # keep=False removes every row whose timestamp occurs more than once
        data.drop_duplicates(subset=['timestamp'], keep=False, inplace=True)

        # timestamp preprocessing for kpi -- these are unix timestamps
        if dataset == 'kpi':
            # NOTE(review): `filename` has already lost its '.csv' suffix at this point --
            # confirm the files under test/ are stored without the extension
            data_test = pd.read_csv(os.path.join(f'../datasets/{dataset}/test/', filename))
            data_test['timestamp'] = data_test['timestamp'].apply(
                lambda x: datetime.datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))
            data['timestamp'] = data['timestamp'].apply(
                lambda x: datetime.datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S'))

            # kpi stores train and test in different ts -- merge them into one to follow structure
            data = pd.concat([data, data_test], ignore_index=True)

        # 50-50 train/test split
        data_train, data_test = np.array_split(data, 2)

        # train model #######################################################################################
        start = time.time()

        model = SpectralResidual(series=data_train[['value', 'timestamp']], use_drift=use_drift_adapt,
                                 threshold=sr_model_params[0], mag_window=sr_model_params[1],
                                 score_window=sr_model_params[2], sensitivity=sr_model_params[3],
                                 detect_mode=DetectMode.anomaly_only, dataset=dataset,
                                 filename=filename, drift_detector=drift_detector,
                                 data_in_memory_sz=data_in_memory_size, anomaly_window=anomaly_window)

        model.fit()

        end_time = time.time()

        diff = end_time - start
        print(f"Trained SR on {filename} for {diff}")

        # test model #########################################################################################
        batch_metrices_f1_entropy = []
        batch_metrices_f1_no_entropy = []
        y_pred_total_no_entropy, y_pred_total_entropy = [], []
        batches_with_anomalies = []
        idx = 0

        pred_time = []

        # bookkeeping for predictions that raised and were replaced by all-zero labels
        failed_windows = 0
        last_prediction_error = None

        if for_optimization:
            data_in_memory = pd.DataFrame([])
        else:
            data_in_memory = copy.deepcopy(data_train)

        for start in range(0, data_test.shape[0], anomaly_window):
            try:
                # current window on which TESTING and SCORING is applied
                window = data_test.iloc[start:start + anomaly_window]
                # data held in memory, calculations and predictions are performed across this window
                data_in_memory = pd.concat([data_in_memory, window])[-data_in_memory_size:]

                X, y = window['value'], window['is_anomaly']
                if y.tolist():
                    if model_name == 'sr':
                        model.__series__ = data_in_memory
                        try:
                            res = model.predict(data_in_memory, window.shape[0])
                            y_pred_noe = [1 if x else 0 for x in res['isAnomaly'].tolist()]

                            # NOTE(review): if 'isAnomaly_e' is missing from `res`, the fallback
                            # below also discards the non-entropy predictions -- confirm
                            y_pred_e = [1 if x else 0 for x in res['isAnomaly_e'].tolist()]
                        except Exception as e:
                            # best effort: a failed prediction scores the whole window as normal,
                            # but the failure is counted so that it can be reported below
                            failed_windows += 1
                            last_prediction_error = repr(e)
                            y_pred_noe = [0 for _ in range(window.shape[0])]
                            y_pred_e = [0 for _ in range(window.shape[0])]

                    idx += 1
                    y_pred_total_no_entropy += [0 if val != 1 else 1 for val in funcy.lflatten(y_pred_noe)][:window.shape[0]]
                    y_pred_total_entropy += [0 if val != 1 else 1 for val in funcy.lflatten(y_pred_e)][:window.shape[0]]
                    y_pred_noe = y_pred_noe[:window.shape[0]]
                    y_pred_e = y_pred_e[:window.shape[0]]

            except Exception as e:
                print(f"An awful exception has occured during testing: \n{repr(e)}")

        if failed_windows:
            print(f"SR prediction failed on {failed_windows} window(s) of {filename}; "
                  f"these were scored as non-anomalous. Last error: {last_prediction_error}")

        # evaluate TS ########################################################################################

        # calculate batched metrics per *test_window*
        # for test stats we calculate F1 score for each class but use the score for the anomaly label;
        # labels=[0, 1] pins the class order, so index -1 is the anomaly class even when a
        # batch (or the whole series) contains only one class
        data_reset = data_test.reset_index()['is_anomaly']
        for i in range(0, len(data_test['is_anomaly']), test_window):
            # here, met_total will be (precision, recall, f1_score, support), one entry per class
            met_total = precision_recall_fscore_support(data_reset[i:i+test_window],
                                                        y_pred_total_entropy[:data_test.shape[0]][i:i+test_window],
                                                        labels=[0, 1])
            batch_metrices_f1_entropy.append(met_total[2][-1])

            met_total = precision_recall_fscore_support(data_reset[i:i+test_window],
                                                        y_pred_total_no_entropy[:data_test.shape[0]][i:i+test_window],
                                                        labels=[0, 1])
            batch_metrices_f1_no_entropy.append(met_total[2][-1])

        met_total_no_entropy = precision_recall_fscore_support(data_test['is_anomaly'],
                                                               y_pred_total_no_entropy[:data_test.shape[0]],
                                                               labels=[0, 1])
        met_total_entropy = precision_recall_fscore_support(data_test['is_anomaly'],
                                                            y_pred_total_entropy[:data_test.shape[0]],
                                                            labels=[0, 1])

        # add entry to stats #######################################################################################

        stats_path = f'../results/scores/sr_{dataset}_stats.csv'
        try:
            stats_full = pd.read_csv(stats_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # no usable stats file yet -- start a new one
            stats_full = pd.DataFrame([])

        # DataFrame.append was removed in pandas 2.0; build a one-row frame and concatenate
        stats_entry = pd.DataFrame([{
            'model': model_name,
            'dataset': filename,
            'window': anomaly_window,
            'f1-entropy': met_total_entropy[2][-1],
            'f1-no_entropy': met_total_no_entropy[2][-1],
        }])
        if stats_full.empty:
            stats_full = stats_entry
        else:
            stats_full = pd.concat([stats_full, stats_entry], ignore_index=True)
        stats_full.to_csv(stats_path, index=False)
        print(f'F1 score is: {met_total_no_entropy[2][-1]}')

        # plotting ##################################################################################################
        # general on ts
        if use_entropy:
            ts_confusion_visualization(data_test, y_pred_total_entropy, dataset, filename, model_name)
        else:
            ts_confusion_visualization(data_test, y_pred_total_no_entropy, dataset, filename, model_name)

        # model-specific rendering of internal workings
        # use threshold_type 'dynamic' for the entropy threshold or
        # 'static' for the non-entropy threshold
        # model.plot(data_test, threshold_type=threshold_type)


Working with current time series: real_1 in dataset Yahoo_A1Benchmark
Trained SR on real_1 for 0.060999393463134766
F1 score is: [0.99788285 0.        ]
Working with current time series: real_10 in dataset Yahoo_A1Benchmark
Trained SR on real_10 for 0.12199902534484863
F1 score is: [0.98945889 0.        ]
Working with current time series: real_11 in dataset Yahoo_A1Benchmark
Trained SR on real_11 for 0.04000043869018555
F1 score is: [0.98728814 0.18181818]
Working with current time series: real_12 in dataset Yahoo_A1Benchmark
Trained SR on real_12 for 0.03799867630004883
F1 score is: [0.99790649 0.4       ]
Working with current time series: real_13 in dataset Yahoo_A1Benchmark
Trained SR on real_13 for 0.042002201080322266
F1 score is: [0.9943899  0.33333333]
Working with current time series: real_14 in dataset Yahoo_A1Benchmark
Trained SR on real_14 for 0.040026187896728516
F1 score is: [0.99440559 0.        ]
Working with current time series: real_15 in dataset Yahoo_A1Benchmark
Trai

KeyboardInterrupt: 