# Variant 1: Outlier detection per-batch only

In this notebook, the two outlier detection methods selected (Isolation Forest and STUMP/Matrix Profile) are evaluated in a streaming environment where they are only applied in the current batch. This means that they are not given any other information besides the windows assigned in the batch they are given in each step.

## ***Importing Libraries***

In [35]:
import math
import os
import sys
import json
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle as pkl
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler
from pathlib import Path

In [36]:
from TSB_UAD.models.distance import Fourier
from TSB_UAD.models.feature import Window
from TSB_UAD.utils.slidingWindows import find_length, plotFig, printResult

from TSB_UAD.models.iforest import IForest

## ***Data Pre-Processing***

In [37]:
# Load the data for the evaluation.
all_data = []

with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

all_data.extend(data['evaluation']['single_normality'])
all_data.extend(data['evaluation']['double_normality'])
all_data.extend(data['evaluation']['triple_normality'])
all_data.extend(data['evaluation']['quadruple_normality'])

In [38]:
preprocessed_dict = {}

In [53]:
# Set the number of windows to be fit per batch.
windows_per_batch = 150

for timeseries in all_data:
    
    # === Pre-processing steps ===

    # Prepare data for unsupervised method
    name = timeseries['Name']

    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']

    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # Take the series and batch it.
    batched_data = []

    i = 0
    flag = True
    # Keep taking batches until the point at which no new windows can be taken.
    while i < len(data) and flag:
        # The data batches begin at the index indicated. If first batch, then the beginning of the time series.
        batch_samples_begin = i

        # The data batches end at the index where `windows_per_batch` can be *completely* extracted since the batch beginning. 
        # Formula: 
        #   i: current beginning of batch / offset
        #   + slidingWindow: to have enough samples extract one window
        #   + windows_per_batch: to have enough samples to extract the rest of the windows
        #   - 1: because the first window extracted is counted twice
        batch_samples_end = i + windows_per_batch + slidingWindow - 1
        
        # Guard against the ending of the time series where a full batch cannot be formed.
        if batch_samples_end > len(data):
            batch_samples_end = len(data)
            flag = False
 
        # Guard against case where the batch cannot hold even one window.
        if len(data[batch_samples_begin:batch_samples_end]) < slidingWindow:
            break

        batched_data.append(data[batch_samples_begin:batch_samples_end])

        # The next batch starts at the point where a new window be created after the last window of the last batch.
        # So, end of the previous window - length of window = start of the last window.
        #   start of the last window + 1 = start of the first window of the next batch.
        i = batch_samples_end - slidingWindow + 1

    # Take the windows and batch them.
    batched_X_data = []
    i = 0
    while i < len(X_data):
        begin = i
        end = i + windows_per_batch
        if end > len(X_data):
            end = len(X_data)

        batched_X_data.append(X_data[begin:end])
        i += windows_per_batch

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()
    
    # Store the pre-processed variables in the new dictionary
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

Time-Series name: ECG1
Estimated Subsequence length:  100

Time-Series name: ECG1_20k
Estimated Subsequence length:  100

Time-Series name: IOPS1
Estimated Subsequence length:  288

Time-Series name: SMD1
Estimated Subsequence length:  125

Time-Series name: Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1
Estimated Subsequence length:  100

Time-Series name: SMD1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+Occupancy1
Estimated Subsequence length:  100

Time-Series name: SMD1+ECG1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+SMD1+Occupancy1
Estimated Subsequence length:  100



## ***Anomaly Detection***

### ***Isolation Forest***

In [8]:
from tqdm.notebook import tqdm_notebook as tqdm

modelName = 'IForest'
clf = IForest(n_jobs=10)

In [9]:
results = []

for name in (p := tqdm(preprocessed_dict.keys())):
    p.set_description(name)
    ts = preprocessed_dict[name]
    x = ts['X_data']

    score = []
    t0 = time()
    for batch in tqdm(ts['batched_X_data'], desc='Processing Batch'):
        # Sometimes the last batch has only one window. That is a problem because with only a window, the problem of finding if that window is an outlier or not can be undefined. 
        # In that case, skip evaluating that window and assign the same score as the last window from the previous batch.
        if len(batch) == 1:
            score.append(score[-1])
        else:
            clf.fit(batch)
            score.extend(clf.decision_scores_)
    t1 = time()
    
    score = np.array(score)
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
    score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))
    
    L = printResult(ts['data'], ts['label'], score, ts['slidingWindow'], ts['name'], modelName)
    results.append([name] + L + [t1-t0, len(x)])

  0%|          | 0/10 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/1533 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/133 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/57 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/190 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/17 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/192 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/207 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/209 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/341 [00:00<?, ?it/s]

Processing Batch:   0%|          | 0/399 [00:00<?, ?it/s]

In [10]:
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
df = pd.DataFrame(results, columns=columns)

In [11]:
df['Number of anomalies'] = df['Name'].apply(lambda x: np.sum(preprocessed_dict[x]['label']))
df[['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Number of anomalies,Time,Number of Windows
0,ECG1,0.853721,0.134944,21105.0,264.860036,229801
1,ECG1_20k,0.860271,0.291852,675.0,24.1784,19901
2,IOPS1,0.502774,0.004854,206.0,11.123778,8497
3,SMD1,0.368101,0.002598,2694.0,33.422409,28355
4,Occupancy1,0.74131,0.065844,972.0,3.136758,2541
5,ECG1+IOPS1,0.669168,0.0,881.0,33.746802,28685
6,SMD1+Occupancy1,0.508252,0.032733,3666.0,36.66109,31020
7,ECG1+IOPS1+Occupancy1,0.750467,0.033999,1853.0,37.075293,31350
8,SMD1+ECG1+Occupancy1,0.646462,0.03179,4341.0,59.090078,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.600558,0.016494,4547.0,68.622697,59829


In [12]:
df.to_csv('Results/Variant_1/IsolationForest-results.csv', index=False)

### ***STUMP***

In [13]:
import stumpy
modelName = 'STUMP'

In [54]:
results = []

for name in (p := tqdm(preprocessed_dict.keys())):
    p.set_description(name)
    ts = preprocessed_dict[name]
    window_size = ts['slidingWindow']
    
    k = 1
    score = []
    t0 = time()
    for batch in tqdm(ts['batched_data'], desc='Processing Batch'):
        score_ = stumpy.stump(T_A=batch, m=window_size, k=k, ignore_trivial=True, normalize=True)
        score.extend(score_.T[k-1])
    t1 = time()

    # In some combinations of batch size and window size, windows overlap with all closest-distance candidates and cannot be scored.
    # In this case, inf is returned. To fix this, any instances of infinite distances are replaced with zero distance.
    score = [s if s != np.inf else 0 for s in score]
    score = np.array(score)
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
    score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))
    
    L = printResult(ts['data'], ts['label'], score, ts['slidingWindow'], ts['name'], modelName)
    results.append([name] + L + [t1-t0, len(x)])

Processing Batch: 100%|██████████| 1533/1533 [00:08<00:00, 175.40it/s]
Processing Batch: 100%|██████████| 133/133 [00:00<00:00, 199.03it/s]
Processing Batch: 100%|██████████| 57/57 [00:00<00:00, 147.36it/s]
Processing Batch: 100%|██████████| 190/190 [00:01<00:00, 159.91it/s]
Processing Batch: 100%|██████████| 17/17 [00:00<00:00, 135.93it/s]
Processing Batch: 100%|██████████| 192/192 [00:01<00:00, 180.14it/s]
Processing Batch: 100%|██████████| 207/207 [00:01<00:00, 155.14it/s]
Processing Batch: 100%|██████████| 209/209 [00:01<00:00, 179.69it/s]]
Processing Batch: 100%|██████████| 341/341 [00:01<00:00, 172.30it/s] 
Processing Batch: 100%|██████████| 399/399 [00:02<00:00, 188.20it/s]5s/it]
ECG1+IOPS1+SMD1+Occupancy1: 100%|██████████| 10/10 [00:22<00:00,  2.29s/it]


In [55]:
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
df = pd.DataFrame(results, columns=columns)

In [56]:
df['Number of anomalies'] = df['Name'].apply(lambda x: np.sum(preprocessed_dict[x]['label']))
df[['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Number of anomalies,Time,Number of Windows
0,ECG1,0.696418,0.0,21105.0,8.74584,59829
1,ECG1_20k,0.689407,0.0,675.0,0.672401,59829
2,IOPS1,0.596923,0.0,206.0,0.389822,59829
3,SMD1,0.556622,0.0,2694.0,1.192813,59829
4,Occupancy1,0.186049,0.0,972.0,0.129206,59829
5,ECG1+IOPS1,0.708663,0.0,881.0,1.068098,59829
6,SMD1+Occupancy1,0.441063,0.0,3666.0,1.336303,59829
7,ECG1+IOPS1+Occupancy1,0.499768,0.0,1853.0,1.167147,59829
8,SMD1+ECG1+Occupancy1,0.542541,0.0,4341.0,1.983109,59829
9,ECG1+IOPS1+SMD1+Occupancy1,0.525118,0.0,4547.0,2.123133,59829


In [57]:
df.to_csv('Results/Variant_1/STUMP-results.csv', index=False)