# ***Libraries***

In [1]:
import math
import os
import sys
import json
from pathlib import Path
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle as pkl
from tqdm import tqdm


from sklearn.preprocessing import MinMaxScaler
from CUSUM import CUSUM
from tqdm.notebook import tqdm_notebook as tqdm

In [2]:
# Absolute path of the directory one level above the current working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [3]:
# Add the parent directory to the module search path so packages located there can be imported.
sys.path.append(parent_dir)

In [4]:
from TSB_UAD.models.distance import Fourier
from TSB_UAD.models.feature import Window
from TSB_UAD.utils.slidingWindows import find_length, plotFig, printResult

from TSB_UAD.models.iforest import IForest

# ***Non-Streaming Methods***
We pick non-streaming methods of our choice as a baseline. We experiment on the generated dataset of time-series. In the non-streaming setting we use the entire generated files as input. We later modify the methods to operate in a streaming setting.


## ***Data Pre-Processing***

### ***Dataset A***

The next two code cells are used to load the dataset with time-series from the following domains:
- Occupancy
- SensorScope
- NAB
- NASA-MSL
- SMD
- YAHOO

In [87]:
# Read the Time-Series dictionary (Dataset A) from its JSON file
dictionary_path = 'Time-Series_Data_Dictionaries/Time-Series-Random-Data-of-Interest-Dictionary.json'
with open(dictionary_path, 'r') as json_file:
    loaded_dict = json.load(json_file)

Let's see some info about the generated files

In [88]:
# Print one "<file>: <info>" line per entry of the loaded dictionary
for filename, info in loaded_dict.items():
    print(filename, info, sep=': ')

ts1: ['Normality_1', 'SensorScope']
ts3: ['Normality_1', 'NASA-MSL']
ts4: ['Normality_1', 'YAHOO']
ts5: ['Normality_1', 'SMD']
ts8: ['Normality_1', 'SMD']
ts2: ['Normality_2', 'SensorScope', 'NAB']
ts9: ['Normality_2', 'Occupancy', 'NASA-MSL']
ts6: ['Normality_3', 'SensorScope', 'YAHOO', 'NASA-MSL']
ts7: ['Normality_3', 'YAHOO', 'NASA-MSL', 'SMD']


### ***Dataset B***

Use the next cell to load data from other domains

In [5]:
# Load the data for the evaluation (Dataset B).
# NOTE: unpickling can execute arbitrary code; only load a 'dataset.pkl' you trust.
all_data = []

with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

# Flatten the evaluation series of every normality level into a single list,
# keeping the single -> double -> triple -> quadruple order.
for normality_group in ('single_normality',
                        'double_normality',
                        'triple_normality',
                        'quadruple_normality'):
    all_data.extend(data['evaluation'][normality_group])

In [48]:
# Start from an empty mapping; the pre-processing cells below add one record per time-series.
preprocessed_dict = {}

### ***Pre-processing for non-streaming methods***
Simple data pre-processing based on TSB-UAD. This pre-processing serves as the pre-processing baseline

Preprocess first dataset

In [9]:
# Baseline pre-processing: one record per series, windowed with a single
# subsequence length estimated on the whole series.
# Dataset A alternative: iterate `loaded_dict.items()` and read each series with
# pd.read_csv(f"TS-Data-Files/{filename}", header=None).dropna().to_numpy(),
# taking column 0 as the data and column 1 as the labels.
for timeseries in all_data:

    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']
    max_length = data.shape[0]

    # Estimate the subsequence length and cut the series into windows of that length
    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()

    # Assemble the record field by field (same keys, same order as before)
    record = {}
    record['name'] = name
    record['data'] = data
    record['label'] = label
    record['slidingWindow'] = slidingWindow
    record['X_data'] = X_data
    record['Time series length'] = len(data)
    record['Number of abnormal points'] = list(label).count(1)

    preprocessed_dict[name] = record

### ***Pre-processing for naive streaming variants of non-streaming methods***
In a streaming setting we are unaware of the data size. Thus, we have to define batches/partitions of some points. In this simple naive variant, we assume we know the data size in order to properly define the number of points each partition will have. In other words, we naively say that each batch/partition will have a specific number of points.

In [41]:
n = 5 # Number of partitions each time-series is split into by the naive streaming cells

In [None]:
# Naive streaming pre-processing: split every series into `n` contiguous
# partitions and window each partition with its own estimated subsequence length.
# Dataset A alternative: iterate `loaded_dict.items()` and read each series with
# pd.read_csv(f"TS-Data-Files/{filename}", header=None).dropna().to_numpy(),
# taking column 0 as the data and column 1 as the labels.
for timeseries in all_data:

    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']
    global_sw = find_length(data) # The sliding window calculated for all data

    # Divide data based on n and add the remaining points to the last partition
    partition_size = len(data) // n
    data_partitions = []
    for par in range(n):
        start_idx = par * partition_size
        # The last partition runs to the end of the series so no point is dropped
        end_idx = len(data) if par == n - 1 else start_idx + partition_size
        data_partitions.append(data[start_idx:end_idx])

    # For each partition created, calculate the sliding window and its windows
    sliding_windows = []
    x_data_partitions = []
    x_data_partitions_len = 0
    for partition in data_partitions:
        slidingWindow = find_length(partition)
        sliding_windows.append(slidingWindow)

        X_data = Window(window=slidingWindow).convert(partition).to_numpy()
        x_data_partitions.append(X_data)
        x_data_partitions_len += len(X_data)

    # Key the record by the series name. The previous key, `filename`, is not
    # bound by this loop, so it either raised NameError or reused a stale value
    # from another cell and made every series overwrite the same entry.
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'global_sliding_window': global_sw,
        'slidingWindow': sliding_windows,
        'X_data': x_data_partitions,
        'X_data Length': x_data_partitions_len,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

### ***Pre-processing for variations of naive streaming variants of non-streaming methods***
Here, we partition data naively as before

In [10]:
# Naive partitioning only: split every series into `n` contiguous partitions and
# store them; windowing is left to the detection cell.
# Dataset A alternative: iterate `loaded_dict.items()` and read each series with
# pd.read_csv(f"TS-Data-Files/{filename}", header=None).dropna().to_numpy(),
# taking column 0 as the data and column 1 as the labels.
for timeseries in all_data:

    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']
    global_sw = find_length(data)  # Subsequence length estimated on the whole series

    # Equal-size partitions; the last one also takes the remainder of the division
    partition_size = len(data) // n
    data_partitions = []
    for par in range(n):
        start_idx = par * partition_size
        end_idx = len(data) if par == n - 1 else start_idx + partition_size
        data_partitions.append(data[start_idx:end_idx])

    # Key the record by the series name. The previous key, `filename`, is not
    # bound by this loop, so it either raised NameError or reused a stale value
    # from another cell and made every series overwrite the same entry.
    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

### ***Pre-processing for variations of naive streaming variants of non-streaming methods***(Dynamic partitioning)
Naively partitioning the data is not a reliable solution. We want to partition the data as soon as an abrupt change occurs. For that, we can use:
- 1. MinMax range partitioning

#### ***Dynamic Partitioning***(Variant 1)

In [49]:
# Dynamic partitioning (variant 1): close a partition `p` points after a value
# leaves the running [min, max] range by more than a relative margin.
# Dataset A alternative: iterate `loaded_dict.items()` and read each series with
# pd.read_csv(f"TS-Data-Files/{filename}", header=None).dropna().to_numpy(),
# taking column 0 as the data and column 1 as the labels.

# Tunables, defined once (they do not change per series). `p` is also read by
# the dynamic-partitioning detection cell further down.
p = 500                       # points collected after a detected change before the partition is closed
change_point_threshold = 0.5  # relative margin beyond the running max/min that counts as a change
exceed_threshold = 0.65       # share of the p post-change points that must exceed the margin to update the range

for timeseries in all_data:

    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']
    global_sw = find_length(data)

    # The first 25% of the series is the initial partition and seeds the value range
    initial_partition_length = int(len(data) * 0.25)
    initial_partition = data[:initial_partition_length]

    # Named range_max/range_min so the built-in max()/min() are not rebound
    # at kernel scope for every later cell.
    range_max = np.max(initial_partition)
    range_min = np.min(initial_partition)

    data_partitions = [initial_partition]
    current_partition = []
    post_change_points = []
    change_detected = False

    for point in data[initial_partition_length:]:

        # NOTE(review): the margins scale max/min multiplicatively, so for a
        # negative min the lower bound lies above min — confirm this is intended
        # for series with negative values.
        upper_bound = range_max * (1 + change_point_threshold)
        lower_bound = range_min * (1 - change_point_threshold)

        # Check for significant change
        if point > upper_bound or point < lower_bound:
            change_detected = True

        current_partition.append(point)

        # After change, collect additional points
        if change_detected:
            post_change_points.append(point)
            if len(post_change_points) == p:
                exceeding = sum(1 for pt in post_change_points
                                if pt > upper_bound or pt < lower_bound)

                # Enough points outside the margin: move each bound to the mean of
                # the old bound and the post-change points that lie beyond it
                if exceeding >= exceed_threshold * p:
                    range_max = np.mean([range_max] + [pt for pt in post_change_points if pt > range_max])
                    range_min = np.mean([range_min] + [pt for pt in post_change_points if pt < range_min])

                post_change_points = []

                # Add the current partition to data partitions
                data_partitions.append(np.array(current_partition))
                current_partition = []
                change_detected = False

    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

Let's see the number of partitions created for each time-series

In [39]:
# Report how many partitions were created for each series
for filename, ts in preprocessed_dict.items():
    partition_count = len(ts['data partitions'])
    print(f"Number of partitions: {partition_count} for file: {ts['name']}")

Number of partitions: 310 for file: ECG1
Number of partitions: 29 for file: ECG1_20k
Number of partitions: 2 for file: IOPS1
Number of partitions: 5 for file: SMD1
Number of partitions: 2 for file: Occupancy1
Number of partitions: 26 for file: ECG1+IOPS1
Number of partitions: 6 for file: SMD1+Occupancy1
Number of partitions: 26 for file: ECG1+IOPS1+Occupancy1
Number of partitions: 45 for file: SMD1+ECG1+Occupancy1
Number of partitions: 13 for file: ECG1+IOPS1+SMD1+Occupancy1


Is the total size of the partitions consistent with the original size of each time-series?

In [40]:
# Sanity check: the partition lengths of each series should add up to the series length
for filename, ts in preprocessed_dict.items():
    par_size = sum(len(partition) for partition in ts['data partitions'])
    original_size = len(ts['data'])

    print(f"Total size of partitions: {par_size} for file: {ts['name']}. Original data size: {original_size}")

Total size of partitions: 229900 for file: ECG1. Original data size: 229900
Total size of partitions: 20000 for file: ECG1_20k. Original data size: 20000
Total size of partitions: 8784 for file: IOPS1. Original data size: 8784
Total size of partitions: 28479 for file: SMD1. Original data size: 28479
Total size of partitions: 2665 for file: Occupancy1. Original data size: 2665
Total size of partitions: 28784 for file: ECG1+IOPS1. Original data size: 28784
Total size of partitions: 31144 for file: SMD1+Occupancy1. Original data size: 31144
Total size of partitions: 31449 for file: ECG1+IOPS1+Occupancy1. Original data size: 31449
Total size of partitions: 51144 for file: SMD1+ECG1+Occupancy1. Original data size: 51144
Total size of partitions: 59928 for file: ECG1+IOPS1+SMD1+Occupancy1. Original data size: 59928


In [None]:
# Plot the partitions of each series side by side. Series with 10 or more
# partitions are skipped to keep the figure readable.
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    partition_count = len(ts['data partitions'])

    if partition_count < 10:

        # squeeze=False always returns a 2-D axes array. Without it a series
        # with a single partition yields a bare Axes object and axes[i] fails.
        fig, axes = plt.subplots(1, partition_count, figsize=(20, 5), squeeze=False)

        for i, array in enumerate(ts['data partitions']):
            axes[0, i].plot(array)
            axes[0, i].set_title(f"Partition {i+1} of {ts['name']}")

        plt.tight_layout()
        plt.show()

### ***Plot TS length and number of abnormal points***

In [None]:
# Collect, per series, the key, the series length and the number of abnormal points
filenames = list(preprocessed_dict.keys())
time_series_lengths = [entry['Time series length'] for entry in preprocessed_dict.values()]
number_of_abnormal_points = [entry['Number of abnormal points'] for entry in preprocessed_dict.values()]

# Line plot of the series length for each filename
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(filenames, time_series_lengths, marker='o', linestyle='-', color='skyblue')
ax.set_xlabel('Filename')
ax.set_ylabel('Time series length')
ax.set_title('Time Series Length for Each Filename')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Line plot of the number of abnormal points for each filename
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(filenames, number_of_abnormal_points, marker='o', linestyle='-', color='lightgreen')
ax.set_xlabel('Filename')
ax.set_ylabel('Number of abnormal points')
ax.set_title('Number of Abnormal Points for Each Filename')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## ***Anomaly Detection***


Classification Information:
- TN: The point is normal and we predicted it is normal
- TP: The point is abnormal and we predicted it is abnormal
- FP: The point is normal and we predicted it is abnormal
- FN: The point is abnormal and we predicted it is normal

Define evaluation metrics

In [8]:
# Metric names, in the order used for the columns of the Dataset A result tables
eval_metrics = [
    'AUC', 'Precision', 'Recall', 'F',
    'Rrecall', 'ExistenceReward', 'OverlapReward',
    'Rprecision', 'Rf',
    'Precision@k', 'R_AUC',
]

Define a function to colorize the cells of the dataframe results 

In [9]:
def highlight_diff(val):
    """Return the CSS background rule for one cell of a difference table.

    Positive values are shaded light green, negative values light coral, and
    anything else (zero, NaN) gets an empty string, i.e. no styling.
    """
    if val > 0:
        return 'background-color: lightgreen'
    if val < 0:
        return 'background-color: lightcoral'
    return ''

### ***Isolation Forest***(Original)
Non-Streaming Variant

In [10]:
# Model label handed to printResult by the detection cell of this section.
modelName = 'IForest'

In [36]:
# Non-streaming baseline: fit one Isolation Forest on all windows of each series.
results = []

for filename, ts in preprocessed_dict.items():
    x = ts['X_data']
    clf = IForest(n_jobs=7, random_state=42)

    t0 = time()
    clf.fit(x)

    # Scale the window scores to [0, 1], then repeat the first/last score so the
    # score vector is (slidingWindow - 1) entries longer than the window count.
    window = ts['slidingWindow']
    pad_left = math.ceil((window - 1) / 2)
    pad_right = (window - 1) // 2

    raw_score = clf.decision_scores_
    scaled_score = MinMaxScaler(feature_range=(0,1)).fit_transform(raw_score.reshape(-1,1)).ravel()
    score = np.array([scaled_score[0]] * pad_left + list(scaled_score) + [scaled_score[-1]] * pad_right)

    t1 = time()

    # Evaluate; each result row is [key] + metrics + [fit/scoring time, number of windows].
    # (Dataset A rows carry only [key] + metrics.)
    L = printResult(ts['data'], ts['label'], score, ts['slidingWindow'], ts['name'], modelName)
    elapsed = t1 - t0
    results.append([filename] + L + [elapsed, len(x)])

In [22]:
# Use for Dataset A
# A result row is [key] + metrics, optionally followed by extra fields (run time,
# number of windows). Only the leading fields that match the declared columns
# are kept, so the frame builds for either row layout instead of raising on a
# column-count mismatch.
columns = ['Filename'] + eval_metrics
iforest_res = pd.DataFrame([row[:len(columns)] for row in results], columns=columns)

In [37]:
# Result table for Dataset B: key, the metrics, then run time and window count
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward',
    'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
df = pd.DataFrame(results, columns=columns)

In [38]:
# Add the label sum of every series as a column, then show the headline columns
df['Number of anomalies'] = df['Name'].map(lambda key: np.sum(preprocessed_dict[key]['label']))
df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.963406,0.208339,28.327885,229801
1,ECG1_20k,0.973288,0.66963,2.175788,19901
2,IOPS1,0.53424,0.0,2.343871,8497
3,SMD1,0.845381,0.306236,3.86206,28355
4,Occupancy1,0.871266,0.0,0.331349,2541
5,ECG1+IOPS1,0.80913,0.533485,3.188019,28685
6,SMD1+Occupancy1,0.833035,0.223404,4.172662,31020
7,ECG1+IOPS1+Occupancy1,0.882892,0.462493,3.424442,31350
8,SMD1+ECG1+Occupancy1,0.68862,0.223912,6.872895,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.651722,0.213767,6.74973,59829


In [39]:
# Print the headline columns as a LaTeX table
summary_table = df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]
print(summary_table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.963406 &     0.208339 & 28.327885 &             229801 \\
                  ECG1\_20k & 0.973288 &     0.669630 &  2.175788 &              19901 \\
                     IOPS1 & 0.534240 &     0.000000 &  2.343871 &               8497 \\
                      SMD1 & 0.845381 &     0.306236 &  3.862060 &              28355 \\
                Occupancy1 & 0.871266 &     0.000000 &  0.331349 &               2541 \\
                ECG1+IOPS1 & 0.809130 &     0.533485 &  3.188019 &              28685 \\
           SMD1+Occupancy1 & 0.833035 &     0.223404 &  4.172662 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.882892 &     0.462493 &  3.424442 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.688620 &     0.223912 &  6.872895 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.651722 &     0.213767 &  6.749730 &  

In [None]:
# Use for Dataset A
# Display the result table built with the `eval_metrics` columns.
iforest_res

In [27]:
# Write the non-streaming result table to CSV, leaving out the DataFrame index.
df.to_csv('Results/Isolation-Forest/Second_Dataset/IForest_Non-Streaming.csv', index=False)

### ***Isolation Forest***(Variant 1)
Naive Streaming Variant

In [22]:
# Model label handed to printResult by the detection cell of this section.
modelName = 'IForest'

In [43]:
# Naive streaming variant: refit the forest on each partition independently and
# concatenate the per-partition scores.
results = []

for filename, ts in preprocessed_dict.items():
    clf = IForest(n_jobs=7, random_state=42)
    total_time = 0

    scores = []
    # Walk the partitions stored for this series instead of range(n): the global
    # `n` may have changed since pre-processing, which would either skip
    # partitions or index past the end.
    for X_partition, window in zip(ts['X_data'], ts['slidingWindow']):

        t0 = time()
        clf.fit(X_partition)

        score = clf.decision_scores_

        # Scale to [0, 1] and repeat the edge scores so the partition's score
        # vector is (window - 1) entries longer than its window count
        score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()
        score = np.array([score[0]]*math.ceil((window-1)/2) + list(score) + [score[-1]]*((window-1)//2))

        scores.extend(score)
        t1 = time()

        total_time += t1 - t0

    scores = np.array(scores)

    # Evaluate; each result row is [key] + metrics + [total time, total number of windows].
    # (Dataset A rows carry only [key] + metrics.)
    L = printResult(ts['data'], ts['label'], scores, ts['global_sliding_window'], ts['name'], modelName)
    results.append([filename] + L + [total_time, ts['X_data Length']])

In [27]:
# Use for Dataset A
# A result row is [key] + metrics, optionally followed by extra fields (run time,
# number of windows). Only the leading fields that match the declared columns
# are kept, so the frame builds for either row layout instead of raising on a
# column-count mismatch.
columns = ['Filename'] + eval_metrics
iforest_res = pd.DataFrame([row[:len(columns)] for row in results], columns=columns)

In [45]:
# Result table for Dataset B: key, the metrics, then run time and window count
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward',
    'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
df = pd.DataFrame(results, columns=columns)

In [46]:
# Add the label sum of every series as a column, then show the headline columns
df['Number of anomalies'] = df['Name'].map(lambda key: np.sum(preprocessed_dict[key]['label']))
df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.940874,0.23023,29.694501,229331
1,ECG1_20k,0.910233,0.265185,2.523816,19309
2,IOPS1,0.537252,0.0,1.843353,7349
3,SMD1,0.520366,0.063474,1.894012,28328
4,Occupancy1,0.645294,0.0,0.875701,1756
5,ECG1+IOPS1,0.779123,0.114642,4.226586,28002
6,SMD1+Occupancy1,0.621276,0.038734,2.884676,30761
7,ECG1+IOPS1+Occupancy1,0.789963,0.082029,4.902659,30629
8,SMD1+ECG1+Occupancy1,0.776839,0.076019,6.240047,50572
9,ECG1+IOPS1+SMD1+Occupancy1,0.731821,0.056961,8.948484,59218


In [47]:
# Print the headline columns as a LaTeX table
summary_table = df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]
print(summary_table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.940874 &     0.230230 & 29.694501 &             229331 \\
                  ECG1\_20k & 0.910233 &     0.265185 &  2.523816 &              19309 \\
                     IOPS1 & 0.537252 &     0.000000 &  1.843353 &               7349 \\
                      SMD1 & 0.520366 &     0.063474 &  1.894012 &              28328 \\
                Occupancy1 & 0.645294 &     0.000000 &  0.875701 &               1756 \\
                ECG1+IOPS1 & 0.779123 &     0.114642 &  4.226586 &              28002 \\
           SMD1+Occupancy1 & 0.621276 &     0.038734 &  2.884676 &              30761 \\
     ECG1+IOPS1+Occupancy1 & 0.789963 &     0.082029 &  4.902659 &              30629 \\
      SMD1+ECG1+Occupancy1 & 0.776839 &     0.076019 &  6.240047 &              50572 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.731821 &     0.056961 &  8.948484 &  

#### ***Only for Dataset A***

In [None]:
# Display the result table built with the `eval_metrics` columns.
iforest_res

In [29]:
# Write the naive streaming variant's result table to CSV, leaving out the DataFrame index.
iforest_res.to_csv('Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv', index=False)

In [35]:
# Reload the saved result tables of the non-streaming baseline and the naive variant
orig_table = pd.read_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv')
var1_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv')

# Keep the first column (file names) aside; the remaining columns are what gets subtracted
filenames_col = orig_table.iloc[:, 0]

iforest_orig_res = orig_table.iloc[:, 1:]
iforest_stream_var1_res = var1_table.iloc[:, 1:]

In [None]:
# Naive streaming variant minus non-streaming baseline, per metric and file
metric_delta = iforest_stream_var1_res - iforest_orig_res
metric_delta.insert(0, 'Filename', filenames_col)

# Colour every metric column (all but 'Filename') by the sign of the difference
metric_columns = metric_delta.columns[1:]
res_diff = metric_delta.style.applymap(highlight_diff, subset=pd.IndexSlice[:, metric_columns])

res_diff

### ***Isolation Forest***(Variant 2)
Streaming variant with batch history

In [29]:
# Model label handed to printResult by the detection cell of this section.
modelName = 'IForest'

In [30]:
# Streaming variant with batch history: every partition after the first is
# scored together with the partition before it, and the previous partition's
# scores are replaced by the mean of its two scorings.
results = []

for filename, ts in preprocessed_dict.items():
    partitions = ts['data partitions']
    scores = []
    previous_scores = None
    x_data_partitions_len = 0
    clf = IForest(n_jobs=7, random_state=42)
    total_time = 0

    # Walk the partitions stored for this series instead of range(n): the global
    # `n` need not match the number of partitions (e.g. after dynamic
    # partitioning), which would leave partitions unscored or index past the end.
    for par in range(len(partitions)):

        # First partition is scored alone; later ones are prefixed with their predecessor
        if par == 0:
            train_points = partitions[par]
        else:
            train_points = np.concatenate((partitions[par-1], partitions[par]))

        slidingWindow = find_length(train_points)
        X_train = Window(window=slidingWindow).convert(train_points).to_numpy()

        x_data_partitions_len += len(X_train)

        t0 = time()
        clf.fit(X_train)

        score = clf.decision_scores_

        # Scale to [0, 1] and repeat the edge scores so the score vector is
        # (slidingWindow - 1) entries longer than the window count
        score = MinMaxScaler(feature_range=(0, 1)).fit_transform(score.reshape(-1, 1)).ravel()
        score = np.array([score[0]] * math.ceil((slidingWindow-1)/2) +
                         list(score) +
                         [score[-1]] * ((slidingWindow-1)//2))

        if par > 0:
            # The leading scores belong to the previous partition: average them
            # with that partition's earlier scores and overwrite the tail of `scores`
            previous_partition_length = len(partitions[par-1])
            new_previous_scores = score[:previous_partition_length]
            mean_previous_scores = (previous_scores + new_previous_scores) / 2
            scores[-previous_partition_length:] = mean_previous_scores.tolist()

        # The trailing scores belong to the current partition
        current_partition_length = len(partitions[par])
        current_scores = score[-current_partition_length:]
        scores.extend(current_scores)

        previous_scores = current_scores

        t1 = time()

        total_time += t1 - t0

    scores = np.array(scores)

    # Evaluate; each result row is [key] + metrics + [total time, total number of windows].
    # (Dataset A rows carry only [key] + metrics.)
    L = printResult(ts['data'], ts['label'], scores, ts['global_sliding_window'], ts['name'], modelName)
    results.append([filename] + L + [total_time, x_data_partitions_len])

In [31]:
# Use for Dataset A
# A result row is [key] + metrics, optionally followed by extra fields (run time,
# number of windows). Only the leading fields that match the declared columns
# are kept, so the frame builds for either row layout instead of raising on a
# column-count mismatch.
columns = ['Filename'] + eval_metrics
iforest_res = pd.DataFrame([row[:len(columns)] for row in results], columns=columns)

In [31]:
# Result table for Dataset B: key, the metrics, then run time and window count
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward',
    'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
df = pd.DataFrame(results, columns=columns)

In [32]:
# Add the label sum of every series as a column, then show the headline columns
df['Number of anomalies'] = df['Name'].map(lambda key: np.sum(preprocessed_dict[key]['label']))
df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.960342,0.223407,57.275025,413249
1,ECG1_20k,0.982224,0.515556,4.255884,35406
2,IOPS1,0.569987,0.009709,3.729669,14373
3,SMD1,0.713479,0.093541,4.671571,50872
4,Occupancy1,0.92374,0.0,1.030245,4177
5,ECG1+IOPS1,0.796599,0.447219,7.624744,51116
6,SMD1+Occupancy1,0.770184,0.04719,6.253056,55555
7,ECG1+IOPS1+Occupancy1,0.888407,0.308689,8.716757,55888
8,SMD1+ECG1+Occupancy1,0.813415,0.218613,11.502622,91483
9,ECG1+IOPS1+SMD1+Occupancy1,0.781739,0.217506,18.793891,107065


In [33]:
# Print the headline columns as a LaTeX table
summary_table = df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]
print(summary_table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.960342 &     0.223407 & 57.275025 &             413249 \\
                  ECG1\_20k & 0.982224 &     0.515556 &  4.255884 &              35406 \\
                     IOPS1 & 0.569987 &     0.009709 &  3.729669 &              14373 \\
                      SMD1 & 0.713479 &     0.093541 &  4.671571 &              50872 \\
                Occupancy1 & 0.923740 &     0.000000 &  1.030245 &               4177 \\
                ECG1+IOPS1 & 0.796599 &     0.447219 &  7.624744 &              51116 \\
           SMD1+Occupancy1 & 0.770184 &     0.047190 &  6.253056 &              55555 \\
     ECG1+IOPS1+Occupancy1 & 0.888407 &     0.308689 &  8.716757 &              55888 \\
      SMD1+ECG1+Occupancy1 & 0.813415 &     0.218613 & 11.502622 &              91483 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.781739 &     0.217506 & 18.793891 &  

#### ***Only for Dataset A***

In [None]:
# Display the result table built with the `eval_metrics` columns.
iforest_res

In [33]:
# Write the batch-history variant's result table to CSV, leaving out the DataFrame index.
iforest_res.to_csv('Results/Isolation-Forest/IForest_Streaming_Batch_History_Variant.csv', index=False)

In [34]:
# Reload the saved result tables of the baseline and the first two streaming variants
orig_table = pd.read_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv')
var1_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv')
var2_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Batch_History_Variant.csv')

# Keep the first column (file names) aside; the remaining columns are what gets subtracted
filenames_col = orig_table.iloc[:, 0]

iforest_orig_res = orig_table.iloc[:, 1:]
iforest_stream_var1_res = var1_table.iloc[:, 1:]
iforest_stream_var2_res = var2_table.iloc[:, 1:]

In [None]:
# Batch-history variant minus naive streaming variant, per metric and file
metric_delta = iforest_stream_var2_res - iforest_stream_var1_res
metric_delta.insert(0, 'Filename', filenames_col)

# Colour every metric column (all but 'Filename') by the sign of the difference
metric_columns = metric_delta.columns[1:]
res_diff = metric_delta.style.applymap(highlight_diff, subset=pd.IndexSlice[:, metric_columns])

res_diff

### ***Isolation Forest***(Variant 3)
Dynamic partitioning and classification based on ensemblers

In [47]:
# Model label handed to printResult by the detection cell of this section.
modelName = 'IForest'

Evaluates the last p points of the previous partition with both classifiers and replaces scores if they disagree.

In [50]:
# Dynamic-partitioning variant: from the third partition on, each partition is
# scored together with the last `p` points of the previous one. Those history
# points get their old scores replaced by the new ones when the two scorings
# disagree on more than half of them. `p` comes from the dynamic-partitioning
# pre-processing cell.
results = []
disagreement_threshold = 0.5

for filename, ts in preprocessed_dict.items():
    partitions = ts['data partitions']
    scores = []
    previous_scores = None
    x_data_partitions_len = 0
    clf = IForest(n_jobs=7, random_state=42)
    total_time = 0

    for par in range(len(partitions)):
        partition = partitions[par]

        if par == 0 or par == 1:
            # The first two partitions are scored on their own
            history_len = 0
            train_points = partition
        else:
            last_p_points = partitions[par-1][-p:]
            # Use the number of history points actually available: a previous
            # partition shorter than p yields fewer than p points, and slicing
            # the scores with p would then mix in (and later drop) scores of
            # the current partition.
            history_len = len(last_p_points)
            train_points = np.concatenate((last_p_points, partition))

        slidingWindow = find_length(train_points)
        X_train = Window(window=slidingWindow).convert(train_points).to_numpy()

        x_data_partitions_len += len(X_train)

        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        # Scale to [0, 1] and repeat the edge scores so the score vector is
        # (slidingWindow - 1) entries longer than the window count
        score = MinMaxScaler(feature_range=(0, 1)).fit_transform(score.reshape(-1, 1)).ravel()
        score = np.array([score[0]] * math.ceil((slidingWindow-1)/2) +
                        list(score) +
                        [score[-1]] * ((slidingWindow-1)//2))

        if history_len > 0:
            # Compare the old and new scores of the history points
            new_previous_scores = score[:history_len]
            prev_scores_to_compare = previous_scores[-history_len:]

            disagreement_indices = np.where(np.abs(prev_scores_to_compare - new_previous_scores) > disagreement_threshold)[0]
            if len(disagreement_indices) > history_len * 0.5:
                previous_scores[-history_len:] = new_previous_scores

            scores[-history_len:] = previous_scores[-history_len:]
            current_scores = score[history_len:]
        else:
            current_scores = score

        scores.extend(current_scores)
        previous_scores = current_scores

        t1 = time()
        total_time += t1 - t0

    scores = np.array(scores)

    # Evaluate; each result row is [key] + metrics + [total time, total number of windows].
    # (Dataset A rows carry only [key] + metrics.)
    L = printResult(ts['data'], ts['label'], scores, ts['global_sliding_window'], ts['name'], modelName)
    results.append([filename] + L + [total_time, x_data_partitions_len])

In [96]:
# Use for Dataset A
# A result row is [key] + metrics, optionally followed by extra fields (run time,
# number of windows). Only the leading fields that match the declared columns
# are kept, so the frame builds for either row layout instead of raising on a
# column-count mismatch.
columns = ['Filename'] + eval_metrics
iforest_res = pd.DataFrame([row[:len(columns)] for row in results], columns=columns)

In [51]:
# Result table for Dataset B: key, the metrics, then run time and window count
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward',
    'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
df = pd.DataFrame(results, columns=columns)

In [52]:
# Add the label sum of every series as a column, then show the headline columns
df['Number of anomalies'] = df['Name'].map(lambda key: np.sum(preprocessed_dict[key]['label']))
df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.726683,0.007629,74.442923,342494
1,ECG1_20k,0.736483,0.007407,6.702948,30106
2,IOPS1,0.527711,0.0,2.221043,8210
3,SMD1,0.599773,0.122123,2.909586,29715
4,Occupancy1,0.861513,0.0,0.485179,2417
5,ECG1+IOPS1,0.764026,0.044268,6.173537,37882
6,SMD1+Occupancy1,0.696195,0.125477,4.302852,32638
7,ECG1+IOPS1+Occupancy1,0.796491,0.093902,7.489961,40404
8,SMD1+ECG1+Occupancy1,0.626209,0.029256,12.268638,67581
9,ECG1+IOPS1+SMD1+Occupancy1,0.735903,0.096767,15.090683,63384


In [53]:
# Print the headline columns as a LaTeX table
summary_table = df[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]
print(summary_table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.726683 &     0.007629 & 74.442923 &             342494 \\
                  ECG1\_20k & 0.736483 &     0.007407 &  6.702948 &              30106 \\
                     IOPS1 & 0.527711 &     0.000000 &  2.221043 &               8210 \\
                      SMD1 & 0.599773 &     0.122123 &  2.909586 &              29715 \\
                Occupancy1 & 0.861513 &     0.000000 &  0.485179 &               2417 \\
                ECG1+IOPS1 & 0.764026 &     0.044268 &  6.173537 &              37882 \\
           SMD1+Occupancy1 & 0.696195 &     0.125477 &  4.302852 &              32638 \\
     ECG1+IOPS1+Occupancy1 & 0.796491 &     0.093902 &  7.489961 &              40404 \\
      SMD1+ECG1+Occupancy1 & 0.626209 &     0.029256 & 12.268638 &              67581 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.735903 &     0.096767 & 15.090683 &  

#### ***Only for Dataset A***

In [None]:
# Display the result table built with the `eval_metrics` columns.
iforest_res

In [98]:
# Write the dynamic-partitioning variant's result table to CSV, leaving out the DataFrame index.
iforest_res.to_csv('Results/Isolation-Forest/IForest_Streaming_Dynamic_Partitioning_Variant.csv', index=False)

In [99]:
# Reload the saved result tables of the baseline and the three streaming variants
orig_table = pd.read_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv')
var1_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv')
var2_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Batch_History_Variant.csv')
var3_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Dynamic_Partitioning_Variant.csv')

# Keep the first column (file names) aside; the remaining columns are what gets subtracted
filenames_col = orig_table.iloc[:, 0]

iforest_orig_res = orig_table.iloc[:, 1:]
iforest_stream_var1_res = var1_table.iloc[:, 1:]
iforest_stream_var2_res = var2_table.iloc[:, 1:]
iforest_stream_var3_res = var3_table.iloc[:, 1:]

In [None]:
# Dynamic-partitioning variant minus batch-history variant, per metric and file
metric_delta = iforest_stream_var3_res - iforest_stream_var2_res
metric_delta.insert(0, 'Filename', filenames_col)

# Colour every metric column (all but 'Filename') by the sign of the difference
metric_columns = metric_delta.columns[1:]
res_diff = metric_delta.style.applymap(highlight_diff, subset=pd.IndexSlice[:, metric_columns])

res_diff