# ***Matrix Profile*** (STUMP)

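This notebook evaluates the Matrix Profile, computed with the `stump` function of the `stumpy` package, in three settings: offline on the full series, online on independent batches (Variant 1), and online with access to the previous batch (Variant 2).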

## Imports

In [2]:
import math
import os
import sys
import json
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle as pkl
from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler
from pathlib import Path

In [3]:
from TSB_UAD.models.distance import Fourier
from TSB_UAD.models.feature import Window
from TSB_UAD.utils.slidingWindows import find_length, plotFig, printResult

from TSB_UAD.models.iforest import IForest

## Data Pre-Processing

In [4]:
# Load the evaluation split and flatten its normality groups into a single list.
all_data = []

# NOTE: unpickling can execute arbitrary code; only load a trusted dataset.pkl.
with open('dataset.pkl', 'rb') as dataset_file:
    data = pkl.load(dataset_file)

evaluation_sets = data['evaluation']
for group in ('single_normality', 'double_normality', 'triple_normality', 'quadruple_normality'):
    # Groups are appended in this fixed order, which fixes the order of later results.
    all_data.extend(evaluation_sets[group])

In [5]:
# Maps each time-series name to its pre-processed fields; filled by the pre-processing loop below.
preprocessed_dict = {}

In [39]:
# Import present in the original cell; kept so that any use of `Any` elsewhere still resolves.
from typing import Any


# Number of sliding windows that every full batch must be able to hold.
windows_per_batch = 150


def _batch_bounds(n_samples, window, n_windows):
    """Yield ``(begin, end)`` sample bounds of consecutive batches of a series.

    A full batch spans ``n_windows + window - 1`` samples, which is exactly
    enough to extract ``n_windows`` complete sliding windows of length
    ``window``. Consecutive batches overlap by ``window - 1`` samples, so the
    first window of a batch starts one sample after the last window of the
    previous batch. The final batch is truncated at ``n_samples`` and is
    dropped entirely when it cannot hold even one complete window.

    Args:
        n_samples: Number of samples in the time series.
        window: Length of one sliding window, in samples.
        n_windows: Number of windows per full batch.

    Yields:
        ``(begin, end)`` tuples usable as ``series[begin:end]``.
    """
    begin = 0
    while begin < n_samples:
        # window samples for the first window, plus one sample per further window.
        end = begin + n_windows + window - 1

        # Truncate the last batch at the end of the series.
        is_last = end > n_samples
        if is_last:
            end = n_samples

        # Stop when the remaining samples cannot hold a single window.
        if end - begin < window:
            return

        yield begin, end

        if is_last:
            return

        # Start of the last window of this batch, plus one sample.
        begin = end - window + 1


for timeseries in all_data:

    # === Pre-processing steps ===

    # Prepare data for the unsupervised method.
    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']

    # Estimated subsequence length, and the series converted with TSB_UAD's Window
    # helper (presumably one row per sliding window -- confirm against TSB_UAD).
    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # Sample bounds shared by both batching schemes.
    bounds = list(_batch_bounds(len(data), slidingWindow, windows_per_batch))

    # Plain batching: each batch holds only its own samples.
    batched_data = [data[begin:end] for begin, end in bounds]

    # History batching: each batch additionally starts at the beginning of the
    # previous batch (the first batch has no predecessor and starts at sample 0).
    history_begins = [0] + [begin for begin, _ in bounds[:-1]]
    batched_data_previous_access = [
        data[history_begin:end]
        for history_begin, (_, end) in zip(history_begins, bounds)
    ]

    # Batch the pre-extracted windows, `windows_per_batch` rows at a time
    # (the last batch may be shorter).
    batched_X_data = [
        X_data[start:start + windows_per_batch]
        for start in range(0, len(X_data), windows_per_batch)
    ]

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()

    # Store the pre-processed variables in the new dictionary.
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
        # A series shorter than one window yields no batch at all; report 0 instead of failing.
        'points_per_batch': len(batched_data[0]) if batched_data else 0,
        'history_batched_data': batched_data_previous_access,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

Time-Series name: ECG1
Estimated Subsequence length:  100

Time-Series name: ECG1_20k
Estimated Subsequence length:  100

Time-Series name: IOPS1
Estimated Subsequence length:  288

Time-Series name: SMD1
Estimated Subsequence length:  125

Time-Series name: Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1
Estimated Subsequence length:  100

Time-Series name: SMD1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+Occupancy1
Estimated Subsequence length:  100

Time-Series name: SMD1+ECG1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+SMD1+Occupancy1
Estimated Subsequence length:  100



## 1. Offline Running
The STUMP algorithm is run offline, with access to all subsequences.

In [8]:
import stumpy
# Model label passed to printResult for every run in this notebook.
modelName = 'STUMP'

In [16]:
# One row per time series: [name, <printResult metrics...>, elapsed seconds, len(series)].
results = []

progress = tqdm(preprocessed_dict.keys())
for name in progress:
    progress.set_description(name)
    entry = preprocessed_dict[name]
    series = entry['data']
    m = entry['slidingWindow']

    # Only the nearest neighbour (k = 1) is used; only the stump call is timed.
    k = 1
    started = time()
    profile = stumpy.stump(T_A=series, m=m, k=k, ignore_trivial=True, normalize=True)
    elapsed = time() - started

    # Row k-1 of the transposed result is taken as the matrix-profile distances.
    distances = np.array(profile.T[k - 1])
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(distances.reshape(-1, 1)).ravel()

    # Pad with the edge values so the score has one entry per sample of the series:
    # ceil((m-1)/2) copies in front, floor((m-1)/2) copies at the back.
    head = [scaled[0]] * math.ceil((m - 1) / 2)
    tail = [scaled[-1]] * ((m - 1) // 2)
    score = np.array(head + list(scaled) + tail)

    metrics = printResult(series, entry['label'], score, m, entry['name'], modelName)
    results.append([name] + metrics + [elapsed, len(series)])

ECG1:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Column layout of one `results` row: name, the evaluation metrics, then the two trailing values.
metric_columns = [
    'AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward',
    'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC',
]
columns = ['Name'] + metric_columns + ['Time', 'Number of Windows']
df = pd.DataFrame(results, columns=columns)

In [None]:
# Attach the sum of each series' label vector, then display the headline columns.
df['Number of anomalies'] = df['Name'].map(
    lambda series_name: np.sum(preprocessed_dict[series_name]['label'])
)
summary_columns = ['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']
df[summary_columns]

Unnamed: 0,Name,AUC,Precision@k,Number of anomalies,Time,Number of Windows
0,ECG1,0.635414,0.084577,21105.0,61.204756,229900
1,ECG1_20k,0.857905,0.133333,675.0,0.2898,20000
2,IOPS1,0.719323,0.18932,206.0,0.278842,8784
3,SMD1,0.472878,0.0,2694.0,0.924781,28479
4,Occupancy1,0.169576,0.0,972.0,0.073836,2665
5,ECG1+IOPS1,0.759207,0.098751,881.0,0.963221,28784
6,SMD1+Occupancy1,0.377355,0.0,3666.0,1.070978,31144
7,ECG1+IOPS1+Occupancy1,0.765244,0.058284,1853.0,1.07439,31449
8,SMD1+ECG1+Occupancy1,0.576887,0.0,4341.0,2.025657,51144
9,ECG1+IOPS1+SMD1+Occupancy1,0.613329,0.0,4547.0,2.503679,59928


In [None]:
# Persist the offline results. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories.
static_results_path = Path('Results/Static/STUMP-results.csv')
static_results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(static_results_path, index=False)

## 2. Online Running: Variant 1
The STUMP algorithm is run online, with access to limited subsequences. In this variant, the algorithm only has access to the subsequences of a given (current) batch.

In [None]:
# One row per time series: [name, <printResult metrics...>, elapsed seconds, len(series)].
results = []

progress = tqdm(preprocessed_dict.keys())
for name in progress:
    progress.set_description(name)
    entry = preprocessed_dict[name]
    series = entry['data']
    m = entry['slidingWindow']

    # Only the nearest neighbour (k = 1) is used. Each batch is profiled on its own
    # and the per-batch distances are concatenated in batch order.
    k = 1
    raw_distances = []
    started = time()
    for batch in tqdm(entry['batched_data'], desc='Processing Batch'):
        profile = stumpy.stump(T_A=batch, m=m, k=k, ignore_trivial=True, normalize=True)
        raw_distances.extend(profile.T[k - 1])
    elapsed = time() - started

    # For some batch-size / window-size combinations a window overlaps with every
    # candidate neighbour and cannot be scored; inf is returned in that case.
    # Such infinite distances are replaced with a zero distance.
    distances = np.array([0 if d == np.inf else d for d in raw_distances])
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(distances.reshape(-1, 1)).ravel()

    # Pad with the edge values: ceil((m-1)/2) copies in front, floor((m-1)/2) at the back.
    head = [scaled[0]] * math.ceil((m - 1) / 2)
    tail = [scaled[-1]] * ((m - 1) // 2)
    score = np.array(head + list(scaled) + tail)

    metrics = printResult(series, entry['label'], score, m, entry['name'], modelName)
    results.append([name] + metrics + [elapsed, len(series)])

Processing Batch: 100%|██████████| 1533/1533 [00:11<00:00, 138.71it/s]
Processing Batch: 100%|██████████| 133/133 [00:00<00:00, 138.24it/s]
Processing Batch: 100%|██████████| 57/57 [00:00<00:00, 137.60it/s]
Processing Batch: 100%|██████████| 190/190 [00:01<00:00, 142.74it/s]
Processing Batch: 100%|██████████| 17/17 [00:00<00:00, 138.87it/s]
Processing Batch: 100%|██████████| 192/192 [00:01<00:00, 140.66it/s]
Processing Batch: 100%|██████████| 207/207 [00:00<00:00, 220.19it/s]
Processing Batch: 100%|██████████| 209/209 [00:00<00:00, 224.33it/s]]
Processing Batch: 100%|██████████| 341/341 [00:01<00:00, 211.12it/s] 
Processing Batch: 100%|██████████| 399/399 [00:01<00:00, 215.92it/s]2s/it]
ECG1+IOPS1+SMD1+Occupancy1: 100%|██████████| 10/10 [00:24<00:00,  2.47s/it]


In [None]:
# Column layout of one `results` row: name, the evaluation metrics, then the two trailing values.
metric_columns = [
    'AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward',
    'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC',
]
columns = ['Name'] + metric_columns + ['Time', 'Number of Windows']
df = pd.DataFrame(results, columns=columns)

In [None]:
# Attach the sum of each series' label vector, then display the headline columns.
df['Number of anomalies'] = df['Name'].map(
    lambda series_name: np.sum(preprocessed_dict[series_name]['label'])
)
summary_columns = ['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']
df[summary_columns]

Unnamed: 0,Name,AUC,Precision@k,Number of anomalies,Time,Number of Windows
0,ECG1,0.696418,0.0,21105.0,11.055027,59928
1,ECG1_20k,0.689407,0.0,675.0,0.964218,59928
2,IOPS1,0.596923,0.0,206.0,0.417511,59928
3,SMD1,0.556622,0.0,2694.0,1.334138,59928
4,Occupancy1,0.186049,0.0,972.0,0.126538,59928
5,ECG1+IOPS1,0.708663,0.0,881.0,1.368074,59928
6,SMD1+Occupancy1,0.441063,0.0,3666.0,0.943233,59928
7,ECG1+IOPS1+Occupancy1,0.499768,0.0,1853.0,0.934797,59928
8,SMD1+ECG1+Occupancy1,0.542541,0.0,4341.0,1.61933,59928
9,ECG1+IOPS1+SMD1+Occupancy1,0.525118,0.0,4547.0,1.851899,59928


In [None]:
# Persist the Variant 1 results. The target directory is created first because
# DataFrame.to_csv does not create missing parent directories.
variant1_results_path = Path('Results/Variant_1/STUMP-results.csv')
variant1_results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(variant1_results_path, index=False)

## 3. Online Running: Variant 2: History Batching
The STUMP algorithm is run online, with access to limited subsequences. In this variant, the algorithm only has access to the subsequences of a given (current) batch and the previous batch.

In [50]:
# One row per time series: [name, <printResult metrics...>, elapsed seconds, len(series)].
results = []

progress = tqdm(preprocessed_dict.keys())
for name in progress:
    progress.set_description(name)
    entry = preprocessed_dict[name]
    series = entry['data']
    m = entry['slidingWindow']

    # Only the nearest neighbour (k = 1) is used.
    k = 1
    raw_distances = []
    started = time()
    for batch_index, batch in enumerate(tqdm(entry['history_batched_data'], desc='Processing Batch')):
        profile = stumpy.stump(T_A=batch, m=m, k=k, ignore_trivial=True, normalize=True)
        batch_distances = list(profile.T[k - 1])

        # Every history batch after the first starts at the beginning of the previous
        # batch, so its first `windows_per_batch` entries belong to windows that were
        # already scored; only the entries after them are kept.
        already_scored = 0 if batch_index == 0 else windows_per_batch
        raw_distances.extend(batch_distances[already_scored:])
    elapsed = time() - started

    # For some batch-size / window-size combinations a window overlaps with every
    # candidate neighbour and cannot be scored; inf is returned in that case.
    # Such infinite distances are replaced with a zero distance.
    distances = np.array([0 if d == np.inf else d for d in raw_distances])
    scaled = MinMaxScaler(feature_range=(0, 1)).fit_transform(distances.reshape(-1, 1)).ravel()

    # Pad with the edge values: ceil((m-1)/2) copies in front, floor((m-1)/2) at the back.
    head = [scaled[0]] * math.ceil((m - 1) / 2)
    tail = [scaled[-1]] * ((m - 1) // 2)
    score = np.array(head + list(scaled) + tail)

    metrics = printResult(series, entry['label'], score, m, entry['name'], modelName)
    results.append([name] + metrics + [elapsed, len(series)])

Processing Batch: 100%|██████████| 1533/1533 [00:10<00:00, 142.49it/s]
Processing Batch: 100%|██████████| 133/133 [00:00<00:00, 166.85it/s]
Processing Batch: 100%|██████████| 57/57 [00:00<00:00, 167.53it/s]
Processing Batch: 100%|██████████| 190/190 [00:01<00:00, 141.45it/s]
Processing Batch: 100%|██████████| 17/17 [00:00<00:00, 126.73it/s]
Processing Batch: 100%|██████████| 192/192 [00:01<00:00, 158.15it/s]
Processing Batch: 100%|██████████| 207/207 [00:01<00:00, 147.31it/s]
Processing Batch: 100%|██████████| 209/209 [00:01<00:00, 119.34it/s]]
Processing Batch: 100%|██████████| 341/341 [00:02<00:00, 163.95it/s] 
Processing Batch: 100%|██████████| 399/399 [00:03<00:00, 120.80it/s]9s/it]
ECG1+IOPS1+SMD1+Occupancy1: 100%|██████████| 10/10 [00:27<00:00,  2.75s/it]


In [51]:
# Column layout of one `results` row: name, the evaluation metrics, then the two trailing values.
metric_columns = [
    'AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward',
    'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC',
]
columns = ['Name'] + metric_columns + ['Time', 'Number of Windows']
df = pd.DataFrame(results, columns=columns)

In [52]:
# Attach the sum of each series' label vector, then display the headline columns.
df['Number of anomalies'] = df['Name'].map(
    lambda series_name: np.sum(preprocessed_dict[series_name]['label'])
)
summary_columns = ['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']
df[summary_columns]

Unnamed: 0,Name,AUC,Precision@k,Number of anomalies,Time,Number of Windows
0,ECG1,0.766365,0.1285,21105.0,10.761828,229900
1,ECG1_20k,0.818412,0.543704,675.0,0.80023,20000
2,IOPS1,0.377768,0.0,206.0,0.343244,8784
3,SMD1,0.452451,0.0,2694.0,1.349273,28479
4,Occupancy1,0.245754,0.0,972.0,0.138234,2665
5,ECG1+IOPS1,0.782539,0.322361,881.0,1.218021,28784
6,SMD1+Occupancy1,0.367379,0.0,3666.0,1.409188,31144
7,ECG1+IOPS1+Occupancy1,0.647434,0.148408,1853.0,1.755493,31449
8,SMD1+ECG1+Occupancy1,0.575929,0.0,4341.0,2.083867,51144
9,ECG1+IOPS1+SMD1+Occupancy1,0.592921,0.0,4547.0,3.306046,59928


In [55]:
# Render the headline columns as a LaTeX table; print() keeps the raw markup copy-pasteable.
summary_columns = ['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']
latex_table = df[summary_columns].to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrrr}
\toprule
                      Name &      AUC &  Precision@k &  Number of anomalies &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.766365 &     0.128500 &              21105.0 & 10.761828 &             229900 \\
                  ECG1\_20k & 0.818412 &     0.543704 &                675.0 &  0.800230 &              20000 \\
                     IOPS1 & 0.377768 &     0.000000 &                206.0 &  0.343244 &               8784 \\
                      SMD1 & 0.452451 &     0.000000 &               2694.0 &  1.349273 &              28479 \\
                Occupancy1 & 0.245754 &     0.000000 &                972.0 &  0.138234 &               2665 \\
                ECG1+IOPS1 & 0.782539 &     0.322361 &                881.0 &  1.218021 &              28784 \\
           SMD1+Occupancy1 & 0.367379 &     0.000000 &               3666.0 &  1.409188 &              31144 \\
     ECG1+IOPS1+Occupancy1 & 0.647434 &     0.148408 &       

In [56]:
# Persist the Variant 2 results. The path follows the layout used by the other
# sections of this notebook (Results/<variant>/STUMP-results.csv); the previous
# target had no .csv extension and used a different directory.
# NOTE(review): confirm nothing downstream still reads 'Results/Matrix Profile/STUMP-variant2'.
# The target directory is created first because DataFrame.to_csv does not
# create missing parent directories.
variant2_results_path = Path('Results/Variant_2/STUMP-results.csv')
variant2_results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(variant2_results_path, index=False)