# ***Libraries***

In [1]:
import math
import os
import sys
import json
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle as pkl
from time import time

from sklearn.preprocessing import MinMaxScaler

In [2]:
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [3]:
sys.path.append(parent_dir)

In [4]:
from TSB_UAD.models.distance import Fourier
from TSB_UAD.models.feature import Window
from TSB_UAD.utils.slidingWindows import find_length, plotFig, printResult

from TSB_UAD.models.iforest import IForest

# ***Isolation Forest***


## ***Data Pre-Processing***

We utilize the following time-series:

- ECG1
- ECG1_20k
- IOPS1
- SMD1
- Occupancy1
- ECG1+IOPS1
- SMD1+Occupancy1
- ECG1+IOPS1+Occupancy1
- SMD1+ECG1+Occupancy1
- ECG1+IOPS1+SMD1+Occupancy1

In [5]:
# Load the data for the evaluation.
all_data = []

with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

all_data.extend(data['evaluation']['single_normality'])
all_data.extend(data['evaluation']['double_normality'])
all_data.extend(data['evaluation']['triple_normality'])
all_data.extend(data['evaluation']['quadruple_normality'])

In [38]:
preprocessed_dict = {}

### ***Pre-processing for non-streaming***
Simple data pre-processing based on TSB-UAD. This pre-processing serves as the pre-processing baseline

In [7]:
#for filename, info in loaded_dict.items():
for timeseries in all_data:
    #ts_filepath = f"TS-Data-Files/{filename}"
    #ts = pd.read_csv(ts_filepath, header=None).dropna().to_numpy()

    #name = ts_filepath.split('/')[-1]
    #max_length = ts.shape[0]
    #data = ts[:max_length, 0].astype(float)
    #label = ts[:max_length, 1]

    name = timeseries['Name']
    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']

    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()
    
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

Time-Series name: ECG1
Estimated Subsequence length:  100

Time-Series name: ECG1_20k
Estimated Subsequence length:  100

Time-Series name: IOPS1
Estimated Subsequence length:  288

Time-Series name: SMD1
Estimated Subsequence length:  125

Time-Series name: Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1
Estimated Subsequence length:  100

Time-Series name: SMD1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+Occupancy1
Estimated Subsequence length:  100

Time-Series name: SMD1+ECG1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+SMD1+Occupancy1
Estimated Subsequence length:  100



### ***Pre-processing for both naive streaming variant and streaming variant with batch history***

In [21]:
# Set the number of windows to be fit per batch.
windows_per_batch = 150

for timeseries in all_data:
    name = timeseries['Name']

    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']

    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # Take the series and batch it.
    batched_data = []

    i = 0
    flag = True
    # Keep taking batches until the point at which no new windows can be taken.
    while i < len(data) and flag:
        # The data batches begin at the index indicated. If first batch, then the beginning of the time series.
        batch_samples_begin = i

        # The data batches end at the index where `windows_per_batch` can be *completely* extracted since the batch beginning. 
        # Formula: 
        #   i: current beginning of batch / offset
        #   + slidingWindow: to have enough samples extract one window
        #   + windows_per_batch: to have enough samples to extract the rest of the windows
        #   - 1: because the first window extracted is counted twice
        batch_samples_end = i + windows_per_batch + slidingWindow - 1
        
        # Guard against the ending of the time series where a full batch cannot be formed.
        if batch_samples_end > len(data):
            batch_samples_end = len(data)
            flag = False
 
        # Guard against case where the batch cannot hold even one window.
        if len(data[batch_samples_begin:batch_samples_end]) < slidingWindow:
            break

        batched_data.append(data[batch_samples_begin:batch_samples_end])

        # The next batch starts at the point where a new window be created after the last window of the last batch.
        # So, end of the previous window - length of window = start of the last window.
        #   start of the last window + 1 = start of the first window of the next batch.
        i = batch_samples_end - slidingWindow + 1

    # Take the windows and batch them.
    batched_X_data = []
    i = 0
    while i < len(X_data):
        begin = i
        end = i + windows_per_batch
        if end > len(X_data):
            end = len(X_data)

        batched_X_data.append(X_data[begin:end])
        i += windows_per_batch

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()
    
    # Store the pre-processed variables in the new dictionary
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

Time-Series name: ECG1
Estimated Subsequence length:  100

Time-Series name: ECG1_20k
Estimated Subsequence length:  100

Time-Series name: IOPS1
Estimated Subsequence length:  288

Time-Series name: SMD1
Estimated Subsequence length:  125

Time-Series name: Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1
Estimated Subsequence length:  100

Time-Series name: SMD1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+Occupancy1
Estimated Subsequence length:  100

Time-Series name: SMD1+ECG1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+SMD1+Occupancy1
Estimated Subsequence length:  100



### ***Pre-processing for streaming variant with dynamic partitioning (change point detection)***
Naively partitioning the data is not a reliable solution. We want to partition the data as soon as an abrupt change occurs. For that, we can use:
- 1. MinMax range partitioning
- 2. Percentile Partitioning

#### ***MinMax range partitioning***

In [28]:
#for filename, info in loaded_dict.items():
for timeseries in all_data:
    #ts_filepath = f"TS-Data-Files/{filename}"
    #ts = pd.read_csv(ts_filepath, header=None).dropna().to_numpy()

    #name = ts_filepath.split('/')[-1]
    #max_length = ts.shape[0]
    #data = ts[:max_length, 0].astype(float)
    #label = ts[:max_length, 1]

    name = timeseries['Name']
    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']
    global_sw = find_length(data)

    initial_partition_length = 500
    initial_partition = data[:initial_partition_length]

    max = np.max(initial_partition)
    min = np.min(initial_partition)

    data_partitions = [initial_partition]
    current_partition = []
    change_detected = False

    p = 500
    change_point_threshold = 0.8
    exceed_threshold = 0.5
    post_change_points = []

    for point in data[len(initial_partition):]:
        
        # Check for significant change
        if (point > max * (1 + change_point_threshold)) or (point < min * (1 - change_point_threshold)):
            change_detected = True
     
        current_partition.append(point)


        # After change, collect additional points
        if change_detected:
            post_change_points.append(point)
            if len(post_change_points) == p:
                exceeds_threshold_points = [(pt > max * (1 + change_point_threshold) or pt < min * (1 - change_point_threshold)) for pt in post_change_points]
                if sum(exceeds_threshold_points) >= exceed_threshold * p:
                    max = np.mean([max] + [pt for pt in post_change_points if pt > max])
                    min = np.mean([min] + [pt for pt in post_change_points if pt < min])

                post_change_points = []

                # Add the current partition to data partitions
                data_partitions.append(np.array(current_partition))
                current_partition = []
                change_detected = False
                
        
    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

Let's see the number of partitions created for each time-series

In [29]:
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]

    print(f"Number of partitions: {len(ts['data partitions'])} for file: {ts['name']}")

Number of partitions: 420 for file: ECG1
Number of partitions: 38 for file: ECG1_20k
Number of partitions: 3 for file: IOPS1
Number of partitions: 24 for file: SMD1
Number of partitions: 2 for file: Occupancy1
Number of partitions: 39 for file: ECG1+IOPS1
Number of partitions: 25 for file: SMD1+Occupancy1
Number of partitions: 40 for file: ECG1+IOPS1+Occupancy1
Number of partitions: 64 for file: SMD1+ECG1+Occupancy1
Number of partitions: 40 for file: ECG1+IOPS1+SMD1+Occupancy1


Are the size of the partitions consistent with the initial size of the time-series?

In [30]:
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    par_size = 0
    for partition in ts['data partitions']:
        par_size += len(partition)
    
    print(f"Total size of partitions: {par_size} for file: {ts['name']}. Original data size: {len(ts['data'])}")

Total size of partitions: 229900 for file: ECG1. Original data size: 229900
Total size of partitions: 20000 for file: ECG1_20k. Original data size: 20000
Total size of partitions: 8784 for file: IOPS1. Original data size: 8784
Total size of partitions: 28479 for file: SMD1. Original data size: 28479
Total size of partitions: 2665 for file: Occupancy1. Original data size: 2665
Total size of partitions: 28784 for file: ECG1+IOPS1. Original data size: 28784
Total size of partitions: 31144 for file: SMD1+Occupancy1. Original data size: 31144
Total size of partitions: 31449 for file: ECG1+IOPS1+Occupancy1. Original data size: 31449
Total size of partitions: 51144 for file: SMD1+ECG1+Occupancy1. Original data size: 51144
Total size of partitions: 59928 for file: ECG1+IOPS1+SMD1+Occupancy1. Original data size: 59928


In [None]:
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]

    if len(ts['data partitions']) < 10:

        fig, axes = plt.subplots(1, len(ts['data partitions']), figsize=(20, 5))

        for i, array in enumerate(ts['data partitions']):
            axes[i].plot(array)
            axes[i].set_title(f"Partition {i+1} of {ts['name']}")

        plt.tight_layout()
        plt.show()

#### ***Percentile Partitioning***

In [39]:
#for filename, info in loaded_dict.items():
for timeseries in all_data:
    #ts_filepath = f"TS-Data-Files/{filename}"
    #ts = pd.read_csv(ts_filepath, header=None).dropna().to_numpy()

    #name = ts_filepath.split('/')[-1]
    #max_length = ts.shape[0]
    #data = ts[:max_length, 0].astype(float)
    #label = ts[:max_length, 1]

    name = timeseries['Name']
    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']
    global_sw = find_length(data)


    # Filter the normal points (label == 0)
    #normal_indices = [i for i, lbl in enumerate(label) if lbl == 0]
    #normal_data = data[normal_indices]

    #normal_data_par_length = int(len(normal_data) * 0.10)
    #normal_data = normal_data[:normal_data_par_length]

    initial_partition_length = 500
    initial_partition = data[:initial_partition_length]

    # Compute initial percentiles
    percentile_5 = np.percentile(initial_partition, 5)
    percentile_95 = np.percentile(initial_partition, 95)

    data_partitions = [initial_partition]
    current_partition = []
    change_detected = False
    p = 500
    exceed_threshold = 0.5
    post_change_points = []

    for point in data[initial_partition_length:]:
        
        # Check for significant change
        if (point < percentile_5) or (point > percentile_95):
            change_detected = True
     
        current_partition.append(point)


        # After change, collect additional points
        if change_detected:
            post_change_points.append(point)
            if len(post_change_points) == p:
                exceeds_threshold_points = [(pt < percentile_5 or pt > percentile_95) for pt in post_change_points]
                if sum(exceeds_threshold_points) / p >= exceed_threshold:
                    # Update percentiles
                    percentile_5 = np.percentile(post_change_points, 5)
                    percentile_95 = np.percentile(post_change_points, 95)

                post_change_points = []
                # Add the current partition to data partitions
                data_partitions.append(np.array(current_partition))
                current_partition = []
                change_detected = False
                
        
    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

In [40]:
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]

    print(f"Number of partitions: {len(ts['data partitions'])} for file: {ts['name']}")

Number of partitions: 433 for file: ECG1
Number of partitions: 39 for file: ECG1_20k
Number of partitions: 17 for file: IOPS1
Number of partitions: 46 for file: SMD1
Number of partitions: 4 for file: Occupancy1
Number of partitions: 55 for file: ECG1+IOPS1
Number of partitions: 51 for file: SMD1+Occupancy1
Number of partitions: 58 for file: ECG1+IOPS1+Occupancy1
Number of partitions: 86 for file: SMD1+ECG1+Occupancy1
Number of partitions: 107 for file: ECG1+IOPS1+SMD1+Occupancy1


In [41]:
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    par_size = 0
    for partition in ts['data partitions']:
        par_size += len(partition)
    
    print(f"Total size of partitions: {par_size} for file: {ts['name']}. Original data size: {len(ts['data'])}")

Total size of partitions: 229900 for file: ECG1. Original data size: 229900
Total size of partitions: 20000 for file: ECG1_20k. Original data size: 20000
Total size of partitions: 8784 for file: IOPS1. Original data size: 8784
Total size of partitions: 28479 for file: SMD1. Original data size: 28479
Total size of partitions: 2665 for file: Occupancy1. Original data size: 2665
Total size of partitions: 28784 for file: ECG1+IOPS1. Original data size: 28784
Total size of partitions: 31144 for file: SMD1+Occupancy1. Original data size: 31144
Total size of partitions: 31449 for file: ECG1+IOPS1+Occupancy1. Original data size: 31449
Total size of partitions: 51144 for file: SMD1+ECG1+Occupancy1. Original data size: 51144
Total size of partitions: 59928 for file: ECG1+IOPS1+SMD1+Occupancy1. Original data size: 59928


### ***Plot TS length and number of abnormal points***

In [None]:
# Get filenames, time series lengths, and number of abnormal points
filenames = list(preprocessed_dict.keys())
time_series_lengths = [data['Time series length'] for data in preprocessed_dict.values()]
number_of_abnormal_points = [data['Number of abnormal points'] for data in preprocessed_dict.values()]

# Plot 'Time series length' and 'Number of abnormal points' for each filename
plt.figure(figsize=(10, 5))
plt.plot(filenames, time_series_lengths, marker='o', linestyle='-', color='skyblue')
plt.xlabel('Filename')
plt.ylabel('Time series length')
plt.title('Time Series Length for Each Filename')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
plt.plot(filenames, number_of_abnormal_points, marker='o', linestyle='-', color='lightgreen')
plt.xlabel('Filename')
plt.ylabel('Number of abnormal points')
plt.title('Number of Abnormal Points for Each Filename')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## ***Anomaly Detection***


Classification Information:
- TN: The point is normal and we predicted it is normal
- TP: The point is abnormal and we predicted it is abnormal
- FP: The point is normal and we predicted it is abnormal
- FN: The point is abnormal and we predicted it is normal

Define evaluation metrics

In [8]:
eval_metrics = ['AUC', 
                'Precision', 
                'Recall', 
                'F', 
                'Rrecall', 
                'ExistenceReward',
                'OverlapReward',
                'Rprecision',
                'Rf',
                'Precision@k',
                'R_AUC']

Define a function to colorize the cells of the dataframe results 

In [9]:
def highlight_diff(val):
    color = ''
    if val > 0:
        color = 'background-color: lightgreen'
    elif val < 0:
        color = 'background-color: lightcoral'
    return color

### ***Isolation Forest***(Original)
Non-Streaming Variant

In [8]:
modelName = 'IForest'

In [9]:
results = []

for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    x = ts['X_data']
    clf = IForest(n_jobs=7, random_state=42)

    t0 = time()
    clf.fit(x)
    
    score = clf.decision_scores_
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
    score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))

    t1 = time()
    
    # Plot figure
    #plotFig(ts['data'], ts['label'], score, ts['slidingWindow'], fileName=ts['name'] + ' ' + loaded_dict[ts['name']][0], modelName=modelName)

    # Calculate the results
    L = printResult(ts['data'], ts['label'], score, ts['slidingWindow'], ts['name'], modelName)
    #L = [ '%.2f' % elem for elem in L]
    #results.append([filename] + L)
    results.append([filename] + L + [t1-t0, len(x)])

In [10]:
#columns = ['Filename'] + eval_metrics
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
iforest_res = pd.DataFrame(results, columns=columns)

In [11]:
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(lambda x: np.sum(preprocessed_dict[x]['label']))
iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.963406,0.208339,29.021253,229801
1,ECG1_20k,0.973288,0.66963,2.212659,19901
2,IOPS1,0.53424,0.0,2.4131,8497
3,SMD1,0.845381,0.306236,3.977942,28355
4,Occupancy1,0.871266,0.0,0.335934,2541
5,ECG1+IOPS1,0.80913,0.533485,3.239372,28685
6,SMD1+Occupancy1,0.833035,0.223404,4.365139,31020
7,ECG1+IOPS1+Occupancy1,0.882892,0.462493,3.524515,31350
8,SMD1+ECG1+Occupancy1,0.68862,0.223912,7.099129,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.651722,0.213767,6.819192,59829


In [12]:
print(iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.963406 &     0.208339 & 29.021253 &             229801 \\
                  ECG1\_20k & 0.973288 &     0.669630 &  2.212659 &              19901 \\
                     IOPS1 & 0.534240 &     0.000000 &  2.413100 &               8497 \\
                      SMD1 & 0.845381 &     0.306236 &  3.977942 &              28355 \\
                Occupancy1 & 0.871266 &     0.000000 &  0.335934 &               2541 \\
                ECG1+IOPS1 & 0.809130 &     0.533485 &  3.239372 &              28685 \\
           SMD1+Occupancy1 & 0.833035 &     0.223404 &  4.365139 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.882892 &     0.462493 &  3.524515 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.688620 &     0.223912 &  7.099129 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.651722 &     0.213767 &  6.819192 &  

In [None]:
iforest_res.to_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv', index=False)

### ***Isolation Forest***(Variant 1)
Naive Streaming Variant

In [15]:
modelName = 'IForest'

In [16]:
results = []

for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    clf = IForest(n_jobs=7, random_state=42)
    x = ts['X_data']
    total_time = 0

    score = []
    #for par in range(n):
    for batch in ts['batched_X_data']:

        t0 = time()
        if len(batch) == 1:
            score.append(score[-1])
        else:
            clf.fit(batch)
            score.extend(clf.decision_scores_)
            t1 = time()

            total_time += t1 - t0
      
    score = np.array(score)
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
    score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))

    # Plot figure
    #plotFig(ts['data'], ts['label'], scores, ts['global_sliding_window'], fileName=ts['name'] + ' ' + loaded_dict[ts['name']][0], modelName=modelName)

    # Calculate the results
    L = printResult(ts['data'], ts['label'], score, ts['slidingWindow'], ts['name'], modelName)
    #L = [ '%.2f' % elem for elem in L]
    #results.append([filename] + L)
    results.append([filename] + L + [total_time, len(x)])

In [17]:
#columns = ['Filename'] + eval_metrics
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
iforest_res = pd.DataFrame(results, columns=columns)

In [18]:
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(lambda x: np.sum(preprocessed_dict[x]['label']))
iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.855992,0.142573,229.720105,229801
1,ECG1_20k,0.858151,0.314074,19.923992,19901
2,IOPS1,0.506936,0.004854,9.60157,8497
3,SMD1,0.367254,0.001856,29.829531,28355
4,Occupancy1,0.711537,0.064815,2.68393,2541
5,ECG1+IOPS1,0.67554,0.0,29.455487,28685
6,SMD1+Occupancy1,0.503673,0.031642,31.824658,31020
7,ECG1+IOPS1+Occupancy1,0.758577,0.03184,31.726269,31350
8,SMD1+ECG1+Occupancy1,0.645449,0.03202,52.913753,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.598401,0.016494,61.405559,59829


In [19]:
print(iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &       Time &  Number of Windows \\
\midrule
                      ECG1 & 0.855992 &     0.142573 & 229.720105 &             229801 \\
                  ECG1\_20k & 0.858151 &     0.314074 &  19.923992 &              19901 \\
                     IOPS1 & 0.506936 &     0.004854 &   9.601570 &               8497 \\
                      SMD1 & 0.367254 &     0.001856 &  29.829531 &              28355 \\
                Occupancy1 & 0.711537 &     0.064815 &   2.683930 &               2541 \\
                ECG1+IOPS1 & 0.675540 &     0.000000 &  29.455487 &              28685 \\
           SMD1+Occupancy1 & 0.503673 &     0.031642 &  31.824658 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.758577 &     0.031840 &  31.726269 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.645449 &     0.032020 &  52.913753 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.598401 &     0.016494 &  61

### ***Isolation Forest***(Variant 2)
Streaming variant with batch history

In [22]:
modelName = 'IForest'

In [23]:
results = []

for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    scores = []
    previous_scores = None
    x = ts['X_data']
    clf = IForest(n_jobs=7, random_state=42)
    total_time = 0
    
    for i, _ in enumerate(ts['batched_X_data']):
        
        if i == 0:
            X_train = ts['batched_X_data'][i]
        else:
            X_train = np.concatenate((ts['batched_X_data'][i-1], ts['batched_X_data'][i]))
        
        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        if i > 0:
            previous_partition_length = len(ts['batched_X_data'][i-1])
            new_previous_scores = score[:previous_partition_length]
            mean_previous_scores = (previous_scores + new_previous_scores) / 2
            scores[-previous_partition_length:] = mean_previous_scores.tolist()

        current_partition_length = len(ts['batched_X_data'][i])
        current_scores = score[-current_partition_length:]
        scores.extend(current_scores)

        previous_scores = current_scores

        t1 = time()

        total_time += t1 - t0
    
    
    scores = np.array(scores)
    scores = MinMaxScaler(feature_range=(0, 1)).fit_transform(scores.reshape(-1, 1)).ravel()
    scores = np.array([scores[0]] * math.ceil((ts['slidingWindow']-1)/2) +
                         list(scores) +
                         [scores[-1]] * ((ts['slidingWindow']-1)//2))
    

    # Plot figure
    #plotFig(ts['data'], ts['label'], scores, ts['global_sliding_window'], fileName=ts['name'] + ' ' + loaded_dict[ts['name']][0], modelName=modelName)

    # Calculate the results
    L = printResult(ts['data'], ts['label'], scores, ts['slidingWindow'], ts['name'], modelName)
    #L = [ '%.2f' % elem for elem in L]
    #results.append([filename] + L)
    results.append([filename] + L + [total_time, len(x)])

In [24]:
#columns = ['Filename'] + eval_metrics
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
iforest_res = pd.DataFrame(results, columns=columns)

In [25]:
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(lambda x: np.sum(preprocessed_dict[x]['label']))
iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.83255,0.155556,246.065436,229801
1,ECG1_20k,0.831822,0.371852,21.628578,19901
2,IOPS1,0.48198,0.0,10.946975,8497
3,SMD1,0.3207,0.0,31.679663,28355
4,Occupancy1,0.763412,0.065844,2.83571,2541
5,ECG1+IOPS1,0.668095,0.0,31.380143,28685
6,SMD1+Occupancy1,0.478901,0.039825,34.411816,31020
7,ECG1+IOPS1+Occupancy1,0.761818,0.031301,33.864426,31350
8,SMD1+ECG1+Occupancy1,0.623556,0.041465,56.555439,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.574619,0.017594,64.74837,59829


In [26]:
print(iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &       Time &  Number of Windows \\
\midrule
                      ECG1 & 0.832550 &     0.155556 & 246.065436 &             229801 \\
                  ECG1\_20k & 0.831822 &     0.371852 &  21.628578 &              19901 \\
                     IOPS1 & 0.481980 &     0.000000 &  10.946975 &               8497 \\
                      SMD1 & 0.320700 &     0.000000 &  31.679663 &              28355 \\
                Occupancy1 & 0.763412 &     0.065844 &   2.835710 &               2541 \\
                ECG1+IOPS1 & 0.668095 &     0.000000 &  31.380143 &              28685 \\
           SMD1+Occupancy1 & 0.478901 &     0.039825 &  34.411816 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.761818 &     0.031301 &  33.864426 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.623556 &     0.041465 &  56.555439 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.574619 &     0.017594 &  64

### ***Isolation Forest***(Variant 3)
Dynamic partitioning and classification based on ensemblers

In [8]:
modelName = 'IForest'

Evaluates the last p points of the previous partition with both classifiers and replaces scores if they disagree.

In [43]:
results = []
clf = IForest(n_jobs=7, random_state=42)
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    window_size = ts['global_sliding_window']
    x = ts['data']
    
    k = 1
    score = []
    t0 = time()
    for i, batch in enumerate(ts['data partitions']):
        
        # If there are not enough points to do at least a window, pad with the last score.
        if len(batch) < window_size:
            score_ = [score[-1]] * len(batch)
        else:
            X_train = Window(window=window_size).convert(batch).to_numpy()
            clf.fit(X_train)
            score_ = clf.decision_scores_

            # Because batches are being split in a way that doesn't allow windows to cross batch boundaries, pad predictions to account for lost
            # windows.
            score_ = np.array([score_[0]]*math.ceil((ts['global_sliding_window']-1)/2) + list(score_) + [score_[-1]]*((ts['global_sliding_window']-1)//2))

        score.extend(score_)
    t1 = time()

    # In some combinations of batch size and window size, windows overlap with all closest-distance candidates and cannot be scored.
    # In this case, inf is returned. To fix this, any instances of infinite distances are replaced with zero distance.
    score = [s if s != np.inf else 0 for s in score]
    score = np.array(score)
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
    # score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))
    
    L = printResult(ts['data'], ts['label'], score, ts['global_sliding_window'], ts['name'], modelName)
    results.append([filename] + L + [t1-t0, len(x)])

In [44]:
#columns = ['Filename'] + eval_metrics
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
iforest_res = pd.DataFrame(results, columns=columns)

In [45]:
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(lambda x: np.sum(preprocessed_dict[x]['label']))
iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.814735,0.135513,74.870748,229900
1,ECG1_20k,0.776898,0.32,6.752509,20000
2,IOPS1,0.53003,0.0,3.23299,8784
3,SMD1,0.492151,0.024128,8.748321,28479
4,Occupancy1,0.780974,0.0,0.767657,2665
5,ECG1+IOPS1,0.652917,0.0,9.719672,28784
6,SMD1+Occupancy1,0.595549,0.069831,9.573527,31144
7,ECG1+IOPS1+Occupancy1,0.767134,0.081489,10.324798,31449
8,SMD1+ECG1+Occupancy1,0.688721,0.058051,16.063449,51144
9,ECG1+IOPS1+SMD1+Occupancy1,0.624106,0.022872,19.405434,59928


In [46]:
print(iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.814735 &     0.135513 & 74.870748 &             229900 \\
                  ECG1\_20k & 0.776898 &     0.320000 &  6.752509 &              20000 \\
                     IOPS1 & 0.530030 &     0.000000 &  3.232990 &               8784 \\
                      SMD1 & 0.492151 &     0.024128 &  8.748321 &              28479 \\
                Occupancy1 & 0.780974 &     0.000000 &  0.767657 &               2665 \\
                ECG1+IOPS1 & 0.652917 &     0.000000 &  9.719672 &              28784 \\
           SMD1+Occupancy1 & 0.595549 &     0.069831 &  9.573527 &              31144 \\
     ECG1+IOPS1+Occupancy1 & 0.767134 &     0.081489 & 10.324798 &              31449 \\
      SMD1+ECG1+Occupancy1 & 0.688721 &     0.058051 & 16.063449 &              51144 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.624106 &     0.022872 & 19.405434 &  

### ***Isolation Forest***(Tuning)

In [10]:
# Load the data for the evaluation.
all_data = []

with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

all_data.extend(data['evaluation']['single_normality'])
all_data.extend(data['evaluation']['double_normality'])
all_data.extend(data['evaluation']['triple_normality'])
all_data.extend(data['evaluation']['quadruple_normality'])
name_to_eval_series = {ts['Name']:ts for ts in all_data}

tuning_data = []
tuning_data.extend(data['tuning']['single_normality'])
tuning_data.extend(data['tuning']['double_normality'])
tuning_data.extend(data['tuning']['triple_normality'])
tuning_data.extend(data['tuning']['quadruple_normality'])
name_to_tune_series = {ts['Name']:ts for ts in tuning_data}

# Set the number of windows to be fit per batch.
windows_per_batch = 150

def preprocess_series(series, slidingWindow=None, verbose=True):
    name = timeseries['Name']

    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']

    if slidingWindow is None:
        slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # Take the series and batch it.
    batched_data = []

    i = 0
    flag = True
    # Keep taking batches until the point at which no new windows can be taken.
    while i < len(data) and flag:
        # The data batches begin at the index indicated. If first batch, then the beginning of the time series.
        batch_samples_begin = i

        # The data batches end at the index where `windows_per_batch` can be *completely* extracted since the batch beginning. 
        # Formula: 
        #   i: current beginning of batch / offset
        #   + slidingWindow: to have enough samples extract one window
        #   + windows_per_batch: to have enough samples to extract the rest of the windows
        #   - 1: because the first window extracted is counted twice
        batch_samples_end = i + windows_per_batch + slidingWindow - 1
        
        # Guard against the ending of the time series where a full batch cannot be formed.
        if batch_samples_end > len(data):
            batch_samples_end = len(data)
            flag = False
 
        # Guard against case where the batch cannot hold even one window.
        if len(data[batch_samples_begin:batch_samples_end]) < slidingWindow:
            break

        batched_data.append(data[batch_samples_begin:batch_samples_end])

        # The next batch starts at the point where a new window be created after the last window of the last batch.
        # So, end of the previous window - length of window = start of the last window.
        #   start of the last window + 1 = start of the first window of the next batch.
        i = batch_samples_end - slidingWindow + 1


    # Take the windows and batch them.
    batched_X_data = []
    i = 0
    while i < len(X_data):
        begin = i
        end = i + windows_per_batch
        if end > len(X_data):
            end = len(X_data)

        batched_X_data.append(X_data[begin:end])
        i += windows_per_batch


    if verbose:
        print(f'Time-Series name: {name}')
        print("Estimated Subsequence length: ", slidingWindow)
        print()

    return {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

In [12]:
from collections import defaultdict

results = []

# Parameters for tuning.
param_grid = {
    # Using the estimated window length from the autocorrelation, define alternate window sized as fractions/multiples of that.
    #'window_length_modifier': [0.5, 1.0, 2.0], 
    'window_length_modifier': [1.0],
    # Number of trees in the isolation forest.
    #'n_estimators': [50, 100, 200],
    'n_estimators': [100]
}

params_to_AUC = defaultdict(dict)

total = np.product([len(pl) for pl in param_grid.values()])

for timeseries in tuning_data:
    name = timeseries['Name']

    default_sliding_window = find_length(timeseries['data'])
    
    c = 0
    best_AUC = 0
    # Initial Best parameters are the defaults.
    best_params = (1, 100)
    for window_length_modifier in param_grid['window_length_modifier']:
        for n_estimators in param_grid['n_estimators']:
            # Prevent too small windows.
            window_size = max(10, int(window_length_modifier * default_sliding_window))

            ts = preprocess_series(series=timeseries, verbose=False)

            x = ts['X_data']
            clf = IForest(n_jobs=7, random_state=42, n_estimators=n_estimators)
            total_time = 0
            scores = []

            for i, _ in enumerate(ts['batched_X_data']):
                
                if i == 0:
                    X_train = ts['batched_X_data'][i]
                else:
                    X_train = np.concatenate((ts['batched_X_data'][i-1], ts['batched_X_data'][i]))
                
                t0 = time()
                clf.fit(X_train)
                score = clf.decision_scores_

                if i > 0:
                    previous_partition_length = len(ts['batched_X_data'][i-1])
                    new_previous_scores = score[:previous_partition_length]
                    mean_previous_scores = (previous_scores + new_previous_scores) / 2
                    scores[-previous_partition_length:] = mean_previous_scores.tolist()

                current_partition_length = len(ts['batched_X_data'][i])
                current_scores = score[-current_partition_length:]
                scores.extend(current_scores)

                previous_scores = current_scores

                t1 = time()

                total_time += t1 - t0

            score = np.array(scores)
            score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
            score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))
            
            AUC = printResult(ts['data'], ts['label'], score, window_size, ts['name'], modelName)[0]

            params_to_AUC[name][(window_length_modifier, n_estimators)] = AUC

            if AUC > best_AUC:
                best_AUC = AUC
                best_params = (window_length_modifier, n_estimators)

            c+=1
            print(f"\r[{c}/{total}]{name}  --  Best AUC = {best_AUC} for: {best_params}", end='')
    print()
    print(f"{name}  --  Best AUC = {best_AUC} for: {best_params}")

    # Evaluate evaluation time series with selected parameters.    
    window_size = max(10, int(ts['slidingWindow'] * best_params[0]))
    n_estimators = best_params[1]

    eval_series_name = ''.join([n if n!='2' else '1' for n in name]).replace('10k', '20k')  # Replace 2s with 1s and fix 20k becoming 10k accidentally.
    ts = preprocess_series(series=timeseries, slidingWindow=window_size, verbose=False)

    x = ts['X_data']
    clf = IForest(n_jobs=10, random_state=42, n_estimators=n_estimators)
    total_time = 0
    scores = []

    for i, _ in enumerate(ts['batched_X_data']):
        
        if i == 0:
            X_train = ts['batched_X_data'][i]
        else:
            X_train = np.concatenate((ts['batched_X_data'][i-1], ts['batched_X_data'][i]))
        
        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        if i > 0:
            previous_partition_length = len(ts['batched_X_data'][i-1])
            new_previous_scores = score[:previous_partition_length]
            mean_previous_scores = (previous_scores + new_previous_scores) / 2
            scores[-previous_partition_length:] = mean_previous_scores.tolist()

        current_partition_length = len(ts['batched_X_data'][i])
        current_scores = score[-current_partition_length:]
        scores.extend(current_scores)

        previous_scores = current_scores

        t1 = time()

        total_time += t1 - t0

    score = np.array(scores)
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
    score = np.array([score[0]]*math.ceil((ts['slidingWindow']-1)/2) + list(score) + [score[-1]]*((ts['slidingWindow']-1)//2))
    
    L = printResult(ts['data'], ts['label'], score, window_size, ts['name'], modelName)
    print(f"{eval_series_name}  --  Eval AUC = {L[0]}")
    results.append([name] + L + [t1-t0, len(x)])

    print()
    print('----------------------------------------------------------------')


[1/1]ECG2  --  Best AUC = 0.7419731328320758 for: (1.0, 100)
ECG2  --  Best AUC = 0.7419731328320758 for: (1.0, 100)
ECG1  --  Eval AUC = 0.7419731328320758

----------------------------------------------------------------
[1/1]ECG2_20k  --  Best AUC = 0.7202198089139109 for: (1.0, 100)
ECG2_20k  --  Best AUC = 0.7202198089139109 for: (1.0, 100)
ECG1_20k  --  Eval AUC = 0.7202198089139109

----------------------------------------------------------------
[1/1]IOPS2  --  Best AUC = 0.3884730569214269 for: (1.0, 100)
IOPS2  --  Best AUC = 0.3884730569214269 for: (1.0, 100)
IOPS1  --  Eval AUC = 0.3884730569214269

----------------------------------------------------------------
[1/1]SMD2  --  Best AUC = 0.22131571980567422 for: (1.0, 100)
SMD2  --  Best AUC = 0.22131571980567422 for: (1.0, 100)
SMD1  --  Eval AUC = 0.22131571980567422

----------------------------------------------------------------
[1/1]Occupancy2  --  Best AUC = 0.7555554340190423 for: (1.0, 100)
Occupancy2  --  Best AU

In [19]:
#columns = ['Filename'] + eval_metrics
columns = ['Name'] + ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC', 'Time', 'Number of Windows']
iforest_res = pd.DataFrame(results, columns=columns)

In [20]:
iforest_res

Unnamed: 0,Name,AUC,Precision,Recall,F-score,Range-recall,ExistenceReward,OverlapReward,Range-precision,Range-Fscore,Precision@k,RangeAUC,Time,Number of Windows
0,ECG2,0.741973,0.827669,0.094139,0.16905,0.074426,0.268456,0.025919,0.768221,0.135705,0.094139,0.822504,0.150624,229812
1,ECG2_20k,0.72022,0.693431,0.136006,0.227409,0.141547,0.444444,0.065822,0.504284,0.221048,0.136006,0.794067,0.165808,19912
2,IOPS2,0.388473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.732412,0.185493,8497
3,SMD2,0.221316,0.103448,0.001114,0.002203,0.025542,0.125,0.000677,0.066667,0.036933,0.001114,0.217048,0.159643,28355
4,Occupancy2,0.755555,1.0,0.065844,0.123552,0.03304,0.071429,0.023443,1.0,0.063967,0.065844,0.826117,0.169454,2541
5,ECG2+IOPS2,0.545867,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.810079,0.159625,28696
6,SMD2+Occupancy2,0.410253,0.805195,0.033824,0.064921,0.118653,0.318182,0.068771,0.684028,0.202228,0.033824,0.403234,0.161463,31020
7,ECG2+IOPS2+Occupancy2,0.658285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.809885,0.157182,31361
8,SMD2+ECG2+Occupancy2,0.500344,0.7,0.038712,0.073367,0.092183,0.225,0.058979,0.634734,0.160986,0.038712,0.526714,0.161667,51020
9,ECG2+IOPS2+SMD2+Occupancy2,0.476591,0.588235,0.001932,0.003851,0.012341,0.046154,0.003888,0.380952,0.023908,0.001932,0.629179,0.170277,59840


In [None]:
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(lambda x: np.sum(name_to_eval_series[x]['labels']))
iforest_res[['Name', 'AUC', 'Precision@k', 'Number of anomalies', 'Time', 'Number of Windows']]

In [None]:
print(iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False))