# ***Libraries***

In [1]:
import math
import os
import sys
import json
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle as pkl


from sklearn.preprocessing import MinMaxScaler

In [2]:
# Absolute path of the directory one level above the current working directory.
parent_dir = os.path.abspath(os.pardir)

In [3]:
# Put the parent directory on the import path (presumably so the TSB_UAD imports below resolve).
# Guarded so that re-running this cell does not append the same entry again.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [4]:
from TSB_UAD.models.distance import Fourier
from TSB_UAD.models.feature import Window
from TSB_UAD.utils.slidingWindows import find_length, plotFig, printResult

from TSB_UAD.models.iforest import IForest

# ***Isolation Forest***


## ***Data Pre-Processing***

### ***Dataset A***
Dataset A utilizes time-series from the following domains:
- Occupancy
- SensorScope
- NAB
- NASA-MSL
- SMD
- YAHOO

In [87]:
# Read the JSON dictionary that describes the Dataset A time series.
ts_dictionary_path = 'Time-Series_Data_Dictionaries/Time-Series-Random-Data-of-Interest-Dictionary.json'
with open(ts_dictionary_path, 'r') as json_file:
    loaded_dict = json.load(json_file)

Let's see some info about the generated files

In [88]:
# Print every entry of the loaded dictionary as "<key>: <value>", one per line.
for filename, info in loaded_dict.items():
    print(f'{filename}: {info}')

ts1: ['Normality_1', 'SensorScope']
ts3: ['Normality_1', 'NASA-MSL']
ts4: ['Normality_1', 'YAHOO']
ts5: ['Normality_1', 'SMD']
ts8: ['Normality_1', 'SMD']
ts2: ['Normality_2', 'SensorScope', 'NAB']
ts9: ['Normality_2', 'Occupancy', 'NASA-MSL']
ts6: ['Normality_3', 'SensorScope', 'YAHOO', 'NASA-MSL']
ts7: ['Normality_3', 'YAHOO', 'NASA-MSL', 'SMD']


### ***Dataset B (Selected)***
Dataset B utilizes the following time-series:
- ECG1
- ECG1_20k
- IOPS1
- SMD1
- Occupancy1
- ECG1+IOPS1
- SMD1+Occupancy1 
- ECG1+IOPS1+Occupancy1
- SMD1+ECG1+Occupancy1
- ECG1+IOPS1+SMD1+Occupancy1

In [5]:
# Load the data for the evaluation.
all_data = []

# NOTE(review): unpickling can execute arbitrary code -- only load dataset.pkl from a trusted source.
with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

# Flatten the evaluation groups into one list, from single up to quadruple normality.
for normality_group in ('single_normality',
                        'double_normality',
                        'triple_normality',
                        'quadruple_normality'):
    all_data.extend(data['evaluation'][normality_group])

In [33]:
# Filled by the pre-processing cells below, keyed by time-series name.
preprocessed_dict = dict()

### ***Pre-processing for non-streaming***
Simple data pre-processing based on TSB-UAD. This pre-processing serves as the pre-processing baseline

In [9]:
# Baseline pre-processing: store one entry per time series, keyed by its name.
for timeseries in all_data:
    name = timeseries['Name']
    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']

    # Estimate the subsequence length and cut the series into sliding windows of that length.
    slidingWindow = find_length(data)
    windowed = Window(window=slidingWindow).convert(data)
    X_data = windowed.to_numpy()

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()

    entry = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }
    preprocessed_dict[name] = entry

### ***Pre-processing for both naive streaming variant and streaming variant with batch history***

In [20]:
# Set the number of windows to be fit per batch.
windows_per_batch = 150

for timeseries in all_data:
    name = timeseries['Name']

    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']

    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # --- Batch the raw samples ---
    # A full batch holds `windows_per_batch` complete windows: the first window needs
    # `slidingWindow` samples and every further window needs one more sample.
    n_samples = len(data)
    samples_per_full_batch = windows_per_batch + slidingWindow - 1

    batched_data = []
    batch_start = 0
    while batch_start < n_samples:
        batch_stop = batch_start + samples_per_full_batch

        # The tail of the series may be too short for a full batch: clip it and stop afterwards.
        is_last_batch = batch_stop > n_samples
        if is_last_batch:
            batch_stop = n_samples

        # A batch that cannot hold even one window is dropped.
        if batch_stop - batch_start < slidingWindow:
            break

        batched_data.append(data[batch_start:batch_stop])
        if is_last_batch:
            break

        # The next batch begins where the window following the last window of this batch
        # begins, i.e. one sample after the start of that last window.
        batch_start = batch_stop - slidingWindow + 1

    # --- Batch the windows: consecutive chunks of at most `windows_per_batch` windows ---
    batched_X_data = [
        X_data[first:first + windows_per_batch]
        for first in range(0, len(X_data), windows_per_batch)
    ]

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()

    # Store the pre-processed variables in the new dictionary
    entry = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }
    preprocessed_dict[name] = entry

Time-Series name: ECG1
Estimated Subsequence length:  100

Time-Series name: ECG1_20k
Estimated Subsequence length:  100

Time-Series name: IOPS1
Estimated Subsequence length:  288

Time-Series name: SMD1
Estimated Subsequence length:  125

Time-Series name: Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1
Estimated Subsequence length:  100

Time-Series name: SMD1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+Occupancy1
Estimated Subsequence length:  100

Time-Series name: SMD1+ECG1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+SMD1+Occupancy1
Estimated Subsequence length:  100



### ***Pre-processing for streaming variant with dynamic partitioning (change point detection)***
Naively partitioning the data is not a reliable solution. We want to partition the data as soon as an abrupt change occurs. For that, we can use:
- 1. MinMax range partitioning
- 2. Percentile Partitioning

#### ***MinMax range partitioning***

In [28]:
# MinMax range partitioning.
# The value range of the first 25% of each series is the reference. When a point leaves that
# range by more than the tolerance, the next p points (including it) are collected and the
# partition is closed. If enough of those p points are out of range, the range is updated.
for timeseries in all_data:
    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']
    global_sw = find_length(data)

    initial_partition_length = int(len(data) * 0.25)
    initial_partition = data[:initial_partition_length]

    # Named range_max / range_min rather than max / min: rebinding the builtins at notebook
    # scope would break every later call to max() / min().
    range_max = np.max(initial_partition)
    range_min = np.min(initial_partition)

    data_partitions = [initial_partition]
    current_partition = []
    change_detected = False

    p = 500                       # points collected after a detected change before closing the partition
    change_point_threshold = 0.8  # relative tolerance around the current range
    exceed_threshold = 0.5        # fraction of the p points that must be out of range to update the range
    post_change_points = []

    # The tolerance is added using the magnitude of each bound. For positive bounds this equals
    # the former max * (1 + t) and min * (1 - t); for negative bounds it still widens the range,
    # where the multiplicative form narrowed or inverted it.
    upper_limit = range_max + change_point_threshold * abs(range_max)
    lower_limit = range_min - change_point_threshold * abs(range_min)

    for point in data[initial_partition_length:]:

        # Check for significant change
        if point > upper_limit or point < lower_limit:
            change_detected = True

        current_partition.append(point)

        # After a change, keep collecting until p points have been gathered.
        if not change_detected:
            continue
        post_change_points.append(point)
        if len(post_change_points) < p:
            continue

        out_of_range = sum((pt > upper_limit or pt < lower_limit) for pt in post_change_points)
        if out_of_range >= exceed_threshold * p:
            # Move each bound to the mean of itself and the collected points beyond it.
            new_max = np.mean([range_max] + [pt for pt in post_change_points if pt > range_max])
            new_min = np.mean([range_min] + [pt for pt in post_change_points if pt < range_min])
            range_max, range_min = new_max, new_min
            upper_limit = range_max + change_point_threshold * abs(range_max)
            lower_limit = range_min - change_point_threshold * abs(range_min)

        post_change_points = []

        # Add the current partition to data partitions
        data_partitions.append(np.array(current_partition))
        current_partition = []
        change_detected = False

    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

Let's see the number of partitions created for each time-series

In [29]:
# Report how many partitions each time series was split into.
for ts in preprocessed_dict.values():
    print(f"Number of partitions: {len(ts['data partitions'])} for file: {ts['name']}")

Number of partitions: 414 for file: ECG1
Number of partitions: 32 for file: ECG1_20k
Number of partitions: 3 for file: IOPS1
Number of partitions: 2 for file: SMD1
Number of partitions: 2 for file: Occupancy1
Number of partitions: 33 for file: ECG1+IOPS1
Number of partitions: 3 for file: SMD1+Occupancy1
Number of partitions: 34 for file: ECG1+IOPS1+Occupancy1
Number of partitions: 42 for file: SMD1+ECG1+Occupancy1
Number of partitions: 34 for file: ECG1+IOPS1+SMD1+Occupancy1


Is the total size of the partitions consistent with the original size of the time-series?

In [30]:
# The partition lengths must add up to the length of the original series.
for ts in preprocessed_dict.values():
    par_size = sum(len(partition) for partition in ts['data partitions'])

    print(f"Total size of partitions: {par_size} for file: {ts['name']}. Original data size: {len(ts['data'])}")

Total size of partitions: 229900 for file: ECG1. Original data size: 229900
Total size of partitions: 20000 for file: ECG1_20k. Original data size: 20000
Total size of partitions: 8784 for file: IOPS1. Original data size: 8784
Total size of partitions: 28479 for file: SMD1. Original data size: 28479
Total size of partitions: 2665 for file: Occupancy1. Original data size: 2665
Total size of partitions: 28784 for file: ECG1+IOPS1. Original data size: 28784
Total size of partitions: 31144 for file: SMD1+Occupancy1. Original data size: 31144
Total size of partitions: 31449 for file: ECG1+IOPS1+Occupancy1. Original data size: 31449
Total size of partitions: 51144 for file: SMD1+ECG1+Occupancy1. Original data size: 51144
Total size of partitions: 59928 for file: ECG1+IOPS1+SMD1+Occupancy1. Original data size: 59928


In [None]:
# Plot the partitions side by side for every series that has fewer than 10 of them.
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    partitions = ts['data partitions']

    # Nothing to draw without partitions; ten or more panels would be unreadable.
    if not 0 < len(partitions) < 10:
        continue

    # squeeze=False always returns a 2-D array of axes, so a series with a single
    # partition can be indexed the same way as one with several.
    fig, axes = plt.subplots(1, len(partitions), figsize=(20, 5), squeeze=False)

    for i, array in enumerate(partitions):
        axes[0, i].plot(array)
        axes[0, i].set_title(f"Partition {i+1} of {ts['name']}")

    plt.tight_layout()
    plt.show()

#### ***Percentile Partitioning***

In [34]:
# Percentile partitioning: the 5th/95th percentiles of the leading normal points are the
# reference band; a point outside the band starts the collection of p points that closes the
# current partition.
for timeseries in all_data:
    name = timeseries['Name']
    data = timeseries['data']
    max_length = data.shape[0]
    label = timeseries['labels']
    global_sw = find_length(data)

    # Keep the normal points (label == 0) and use their first 10% as the reference sample.
    normal_indices = [idx for idx, lbl in enumerate(label) if lbl == 0]
    normal_data = data[normal_indices]
    normal_data_par_length = int(len(normal_data) * 0.10)
    normal_data = normal_data[:normal_data_par_length]

    # Compute initial percentiles
    percentile_5, percentile_95 = np.percentile(normal_data, [5, 95])

    data_partitions = []
    current_partition = []
    change_detected = False
    p = 500
    exceed_threshold = 0.5
    post_change_points = []

    for point in data:

        # Check for significant change
        if not (percentile_5 <= point <= percentile_95):
            change_detected = True

        current_partition.append(point)

        # After a change, keep collecting until p points have been gathered.
        if not change_detected:
            continue
        post_change_points.append(point)
        if len(post_change_points) != p:
            continue

        out_of_band = sum((pt < percentile_5 or pt > percentile_95) for pt in post_change_points)
        if out_of_band / p >= exceed_threshold:
            # Update percentiles
            percentile_5, percentile_95 = np.percentile(post_change_points, [5, 95])

        post_change_points = []
        # Add the current partition to data partitions
        data_partitions.append(np.array(current_partition))
        current_partition = []
        change_detected = False

    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

In [35]:
# Report how many partitions each time series was split into.
for ts in preprocessed_dict.values():
    print(f"Number of partitions: {len(ts['data partitions'])} for file: {ts['name']}")

Number of partitions: 431 for file: ECG1
Number of partitions: 39 for file: ECG1_20k
Number of partitions: 16 for file: IOPS1
Number of partitions: 46 for file: SMD1
Number of partitions: 4 for file: Occupancy1
Number of partitions: 55 for file: ECG1+IOPS1
Number of partitions: 51 for file: SMD1+Occupancy1
Number of partitions: 58 for file: ECG1+IOPS1+Occupancy1
Number of partitions: 89 for file: SMD1+ECG1+Occupancy1
Number of partitions: 107 for file: ECG1+IOPS1+SMD1+Occupancy1


In [36]:
# The partition lengths must add up to the length of the original series.
for ts in preprocessed_dict.values():
    par_size = sum(len(partition) for partition in ts['data partitions'])

    print(f"Total size of partitions: {par_size} for file: {ts['name']}. Original data size: {len(ts['data'])}")

Total size of partitions: 229900 for file: ECG1. Original data size: 229900
Total size of partitions: 20000 for file: ECG1_20k. Original data size: 20000
Total size of partitions: 8784 for file: IOPS1. Original data size: 8784
Total size of partitions: 28479 for file: SMD1. Original data size: 28479
Total size of partitions: 2665 for file: Occupancy1. Original data size: 2665
Total size of partitions: 28784 for file: ECG1+IOPS1. Original data size: 28784
Total size of partitions: 31144 for file: SMD1+Occupancy1. Original data size: 31144
Total size of partitions: 31449 for file: ECG1+IOPS1+Occupancy1. Original data size: 31449
Total size of partitions: 51144 for file: SMD1+ECG1+Occupancy1. Original data size: 51144
Total size of partitions: 59928 for file: ECG1+IOPS1+SMD1+Occupancy1. Original data size: 59928


### ***Plot TS length and number of abnormal points***

In [None]:
# Get filenames, time series lengths, and number of abnormal points.
# The statistics are derived from the 'data' and 'label' entries, which every pre-processing
# cell stores. The 'Time series length' / 'Number of abnormal points' keys are not written by
# the partitioning cells, so reading them raised a KeyError once those cells had run.
filenames = list(preprocessed_dict.keys())
time_series_lengths = [len(entry['data']) for entry in preprocessed_dict.values()]
number_of_abnormal_points = [list(entry['label']).count(1) for entry in preprocessed_dict.values()]

# Plot 'Time series length' for each filename
plt.figure(figsize=(10, 5))
plt.plot(filenames, time_series_lengths, marker='o', linestyle='-', color='skyblue')
plt.xlabel('Filename')
plt.ylabel('Time series length')
plt.title('Time Series Length for Each Filename')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

In [None]:
# Line plot of the number of abnormal points per time series.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(filenames, number_of_abnormal_points, marker='o', linestyle='-', color='lightgreen')
ax.set_xlabel('Filename')
ax.set_ylabel('Number of abnormal points')
ax.set_title('Number of Abnormal Points for Each Filename')
plt.xticks(rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

## ***Anomaly Detection***


Classification Information:
- TN: The point is normal and we predicted it is normal
- TP: The point is abnormal and we predicted it is abnormal
- FP: The point is normal and we predicted it is abnormal
- FN: The point is abnormal and we predicted it is normal

Define evaluation metrics

In [8]:
# Names of the evaluation metrics, grouped as point-wise, range-based and ranking/AUC.
eval_metrics = [
    'AUC', 'Precision', 'Recall', 'F',
    'Rrecall', 'ExistenceReward', 'OverlapReward', 'Rprecision', 'Rf',
    'Precision@k', 'R_AUC',
]

Define a function to colorize the cells of the dataframe results 

In [9]:
def highlight_diff(val):
    """Return the CSS background for one cell of a difference table.

    Positive values are shaded light green, negative values light coral, and
    anything else (zero, NaN) gets no styling (empty string).
    """
    if val > 0:
        return 'background-color: lightgreen'
    if val < 0:
        return 'background-color: lightcoral'
    return ''

### ***Isolation Forest***(Original)
Non-Streaming Variant

In [10]:
# Model label passed to printResult.
modelName = "IForest"

In [36]:
results = []

# Non-streaming baseline: fit one Isolation Forest on all windows of each series.
for filename, ts in preprocessed_dict.items():
    x = ts['X_data']
    window = ts['slidingWindow']
    clf = IForest(n_jobs=7, random_state=42)

    t0 = time()
    clf.fit(x)

    # Scale the per-window scores to [0, 1].
    scaler = MinMaxScaler(feature_range=(0,1))
    score = scaler.fit_transform(clf.decision_scores_.reshape(-1,1)).ravel()

    # Repeat the first and last score so the padded array gains window - 1 entries in total.
    pad_front = math.ceil((window - 1) / 2)
    pad_back = (window - 1) // 2
    score = np.array([score[0]] * pad_front + list(score) + [score[-1]] * pad_back)

    t1 = time()

    # Calculate the results
    metrics = printResult(ts['data'], ts['label'], score, window, ts['name'], modelName)
    results.append([filename] + metrics + [t1 - t0, len(x)])

In [37]:
# Column layout of a results row: name, the values returned by printResult, then time and window count.
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
iforest_res = pd.DataFrame(results, columns=columns)

In [38]:
# Add the sum of the labels of each series, then show the summary columns.
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(
    lambda ts_name: np.sum(preprocessed_dict[ts_name]['label'])
)
summary_columns = ['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']
iforest_res[summary_columns]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.963406,0.208339,28.327885,229801
1,ECG1_20k,0.973288,0.66963,2.175788,19901
2,IOPS1,0.53424,0.0,2.343871,8497
3,SMD1,0.845381,0.306236,3.86206,28355
4,Occupancy1,0.871266,0.0,0.331349,2541
5,ECG1+IOPS1,0.80913,0.533485,3.188019,28685
6,SMD1+Occupancy1,0.833035,0.223404,4.172662,31020
7,ECG1+IOPS1+Occupancy1,0.882892,0.462493,3.424442,31350
8,SMD1+ECG1+Occupancy1,0.68862,0.223912,6.872895,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.651722,0.213767,6.74973,59829


In [39]:
# Render the summary columns as a LaTeX table.
latex_table = iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.963406 &     0.208339 & 28.327885 &             229801 \\
                  ECG1\_20k & 0.973288 &     0.669630 &  2.175788 &              19901 \\
                     IOPS1 & 0.534240 &     0.000000 &  2.343871 &               8497 \\
                      SMD1 & 0.845381 &     0.306236 &  3.862060 &              28355 \\
                Occupancy1 & 0.871266 &     0.000000 &  0.331349 &               2541 \\
                ECG1+IOPS1 & 0.809130 &     0.533485 &  3.188019 &              28685 \\
           SMD1+Occupancy1 & 0.833035 &     0.223404 &  4.172662 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.882892 &     0.462493 &  3.424442 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.688620 &     0.223912 &  6.872895 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.651722 &     0.213767 &  6.749730 &  

In [None]:
# Save the non-streaming results. The target directory is created first because to_csv
# does not create missing parent directories.
results_path = 'Results/Isolation-Forest/IForest_Non-Streaming.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
iforest_res.to_csv(results_path, index=False)

### ***Isolation Forest***(Variant 1)
Naive Streaming Variant

In [8]:
# Model label passed to printResult.
modelName = "IForest"

In [12]:
results = []

# Naive streaming variant: the forest is re-fitted on each batch of windows independently.
for filename, ts in preprocessed_dict.items():
    clf = IForest(n_jobs=7, random_state=42)
    x = ts['X_data']
    window = ts['slidingWindow']
    total_time = 0

    score = []
    for batch in ts['batched_X_data']:
        t0 = time()

        # A batch holding a single window is not fitted; it repeats the latest score.
        if len(batch) == 1:
            score.append(score[-1])
            continue

        clf.fit(batch)
        score.extend(clf.decision_scores_)
        total_time += time() - t0

    # Scale the concatenated scores to [0, 1].
    score = np.array(score)
    score = MinMaxScaler(feature_range=(0,1)).fit_transform(score.reshape(-1,1)).ravel()

    # Repeat the first and last score so the padded array gains window - 1 entries in total.
    pad_front = math.ceil((window - 1) / 2)
    pad_back = (window - 1) // 2
    score = np.array([score[0]] * pad_front + list(score) + [score[-1]] * pad_back)

    # Calculate the results
    metrics = printResult(ts['data'], ts['label'], score, window, ts['name'], modelName)
    results.append([filename] + metrics + [total_time, len(x)])

In [13]:
# Column layout of a results row: name, the values returned by printResult, then time and window count.
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
iforest_res = pd.DataFrame(results, columns=columns)

In [14]:
# Add the sum of the labels of each series, then show the summary columns.
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(
    lambda ts_name: np.sum(preprocessed_dict[ts_name]['label'])
)
summary_columns = ['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']
iforest_res[summary_columns]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.855992,0.142573,214.901253,229801
1,ECG1_20k,0.858151,0.314074,18.81087,19901
2,IOPS1,0.506936,0.004854,9.491467,8497
3,SMD1,0.367254,0.001856,29.04586,28355
4,Occupancy1,0.711537,0.064815,2.619673,2541
5,ECG1+IOPS1,0.67554,0.0,29.097246,28685
6,SMD1+Occupancy1,0.503673,0.031642,31.288422,31020
7,ECG1+IOPS1+Occupancy1,0.758577,0.03184,31.077644,31350
8,SMD1+ECG1+Occupancy1,0.645449,0.03202,51.184335,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.598401,0.016494,60.564411,59829


In [15]:
# Render the summary columns as a LaTeX table.
latex_table = iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &       Time &  Number of Windows \\
\midrule
                      ECG1 & 0.855992 &     0.142573 & 214.901253 &             229801 \\
                  ECG1\_20k & 0.858151 &     0.314074 &  18.810870 &              19901 \\
                     IOPS1 & 0.506936 &     0.004854 &   9.491467 &               8497 \\
                      SMD1 & 0.367254 &     0.001856 &  29.045860 &              28355 \\
                Occupancy1 & 0.711537 &     0.064815 &   2.619673 &               2541 \\
                ECG1+IOPS1 & 0.675540 &     0.000000 &  29.097246 &              28685 \\
           SMD1+Occupancy1 & 0.503673 &     0.031642 &  31.288422 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.758577 &     0.031840 &  31.077644 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.645449 &     0.032020 &  51.184335 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.598401 &     0.016494 &  60

#### ***Use only for Dataset A***

In [29]:
# Save the naive-streaming results. The target directory is created first because to_csv
# does not create missing parent directories.
results_path = 'Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
iforest_res.to_csv(results_path, index=False)

In [35]:
# Reload the saved result tables.
orig_table = pd.read_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv')
var1_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv')

# Keep the first column (the names) aside and drop it from each table so only numbers remain.
filenames_col = orig_table.iloc[:, 0]
iforest_orig_res = orig_table.iloc[:, 1:]
iforest_stream_var1_res = var1_table.iloc[:, 1:]

In [None]:
# Naive streaming variant minus the non-streaming baseline, column by column.
diff_table = iforest_stream_var1_res - iforest_orig_res
diff_table.insert(0, 'Filename', filenames_col)

# Colour every column except the leading 'Filename' column by the sign of the difference.
numeric_columns = diff_table.columns[1:]
res_diff = diff_table.style.applymap(highlight_diff, subset=pd.IndexSlice[:, numeric_columns])

res_diff

### ***Isolation Forest***(Variant 2)
Streaming variant with batch history

In [22]:
# Model label passed to printResult.
modelName = "IForest"

In [24]:
results = []

# Streaming variant with batch history: every batch after the first is fitted together with
# the batch before it, and the scores of that earlier batch are averaged with their re-scores.
for filename, ts in preprocessed_dict.items():
    batches = ts['batched_X_data']
    window = ts['slidingWindow']
    x = ts['X_data']
    clf = IForest(n_jobs=7, random_state=42)

    scores = []
    previous_scores = None
    total_time = 0

    for i, batch in enumerate(batches):
        has_history = i > 0
        if has_history:
            X_train = np.concatenate((batches[i-1], batch))
        else:
            X_train = batch

        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        if has_history:
            # The leading scores belong to the previous batch: replace the stored ones by the
            # mean of the old and the new estimate.
            n_previous = len(batches[i-1])
            rescored_previous = score[:n_previous]
            scores[-n_previous:] = ((previous_scores + rescored_previous) / 2).tolist()

        # The trailing scores belong to the current batch.
        current_scores = score[-len(batch):]
        scores.extend(current_scores)
        previous_scores = current_scores

        total_time += time() - t0

    # Scale to [0, 1], then repeat the first and last score so the padded array gains
    # window - 1 entries in total.
    scores = np.array(scores)
    scores = MinMaxScaler(feature_range=(0, 1)).fit_transform(scores.reshape(-1, 1)).ravel()
    pad_front = math.ceil((window - 1) / 2)
    pad_back = (window - 1) // 2
    scores = np.array([scores[0]] * pad_front + list(scores) + [scores[-1]] * pad_back)

    # Calculate the results
    metrics = printResult(ts['data'], ts['label'], scores, window, ts['name'], modelName)
    results.append([filename] + metrics + [total_time, len(x)])

In [25]:
# Column layout of a results row: name, the values returned by printResult, then time and window count.
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore',
    'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
iforest_res = pd.DataFrame(results, columns=columns)

In [26]:
# Add the sum of the labels of each series, then show the summary columns.
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(
    lambda ts_name: np.sum(preprocessed_dict[ts_name]['label'])
)
summary_columns = ['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']
iforest_res[summary_columns]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.83255,0.155556,243.160621,229801
1,ECG1_20k,0.831822,0.371852,21.308003,19901
2,IOPS1,0.48198,0.0,10.723807,8497
3,SMD1,0.3207,0.0,31.594669,28355
4,Occupancy1,0.763412,0.065844,2.820671,2541
5,ECG1+IOPS1,0.668095,0.0,31.753022,28685
6,SMD1+Occupancy1,0.478901,0.039825,34.556695,31020
7,ECG1+IOPS1+Occupancy1,0.761818,0.031301,33.960795,31350
8,SMD1+ECG1+Occupancy1,0.623556,0.041465,56.347982,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.574619,0.017594,64.708527,59829


In [27]:
# Render the summary columns as a LaTeX table.
latex_table = iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']].to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &       Time &  Number of Windows \\
\midrule
                      ECG1 & 0.832550 &     0.155556 & 243.160621 &             229801 \\
                  ECG1\_20k & 0.831822 &     0.371852 &  21.308003 &              19901 \\
                     IOPS1 & 0.481980 &     0.000000 &  10.723807 &               8497 \\
                      SMD1 & 0.320700 &     0.000000 &  31.594669 &              28355 \\
                Occupancy1 & 0.763412 &     0.065844 &   2.820671 &               2541 \\
                ECG1+IOPS1 & 0.668095 &     0.000000 &  31.753022 &              28685 \\
           SMD1+Occupancy1 & 0.478901 &     0.039825 &  34.556695 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.761818 &     0.031301 &  33.960795 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.623556 &     0.041465 &  56.347982 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.574619 &     0.017594 &  64

#### ***Use only for Dataset A***

In [33]:
# Save the batch-history results. The target directory is created first because to_csv
# does not create missing parent directories.
results_path = 'Results/Isolation-Forest/IForest_Streaming_Batch_History_Variant.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
iforest_res.to_csv(results_path, index=False)

In [34]:
# Reload the saved result tables.
orig_table = pd.read_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv')
var1_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv')
var2_table = pd.read_csv('Results/Isolation-Forest/IForest_Streaming_Batch_History_Variant.csv')

# Keep the first column (the names) aside and drop it from each table so only numbers remain.
filenames_col = orig_table.iloc[:, 0]
iforest_orig_res = orig_table.iloc[:, 1:]
iforest_stream_var1_res = var1_table.iloc[:, 1:]
iforest_stream_var2_res = var2_table.iloc[:, 1:]

In [None]:
# Batch-history variant minus the naive streaming variant, column by column.
diff_table = iforest_stream_var2_res - iforest_stream_var1_res
diff_table.insert(0, 'Filename', filenames_col)

# Colour every column except the leading 'Filename' column by the sign of the difference.
numeric_columns = diff_table.columns[1:]
res_diff = diff_table.style.applymap(highlight_diff, subset=pd.IndexSlice[:, numeric_columns])

res_diff

### ***Isolation Forest***(Variant 3)
Dynamic partitioning and classification based on ensemblers

In [37]:
# Model label passed to printResult.
modelName = "IForest"

Evaluates the last p points of the previous partition with both classifiers and replaces scores if they disagree.

In [38]:
# Dynamic-partitioning variant: one Isolation Forest fit per data partition. From the third
# partition on, the last p points of the previous partition are prepended, re-scored, and
# compared with the scores they received before.
# NOTE(review): `p` is not defined in this cell; it relies on the value left in the kernel by
# the partitioning cells (they set p = 500) -- confirm those cells have been run first.
results = []
# Minimum absolute score difference for a point to count as a disagreement.
disagreement_threshold = 0.5

for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    scores = []
    previous_scores = None
    # Running total of windows fitted over all partitions (reported as 'Number of Windows').
    x_data_partitions_len = 0
    clf = IForest(n_jobs=7, random_state=42)
    total_time = 0

    for par in range(len(ts['data partitions'])):

        # The first two partitions are fitted on their own; the sliding-window length is
        # re-estimated for every partition.
        if par == 0 or par == 1:
            partition = ts['data partitions'][par]
            slidingWindow = find_length(partition)
            X_train = Window(window=slidingWindow).convert(partition).to_numpy()
        else:
            # Later partitions are fitted together with the last p points of the previous one.
            previous_partition = ts['data partitions'][par-1]
            partition = ts['data partitions'][par]
            last_p_points = previous_partition[-p:]
            partition_with_history = np.concatenate((last_p_points, partition))
            slidingWindow = find_length(partition_with_history)
            X_train = Window(window=slidingWindow).convert(partition_with_history).to_numpy()
        
        x_data_partitions_len += len(X_train)

        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        # Scale this partition's scores to [0, 1], then repeat the first and last score so
        # the padded array gains slidingWindow - 1 entries in total.
        score = MinMaxScaler(feature_range=(0, 1)).fit_transform(score.reshape(-1, 1)).ravel()
        score = np.array([score[0]] * math.ceil((slidingWindow-1)/2) +
                        list(score) +
                        [score[-1]] * ((slidingWindow-1)//2))
        
        if par > 1:
            # NOTE(review): previous_partition_length is assigned but never used.
            previous_partition_length = len(previous_partition)
            # The first p scores belong to the history points taken from the previous partition.
            new_previous_scores = score[:p]
            prev_scores_to_compare = previous_scores[-p:]
            
            # If more than half of the p history points differ by more than the threshold,
            # the new scores replace the old ones.
            disagreement_indices = np.where(np.abs(prev_scores_to_compare - new_previous_scores) > disagreement_threshold)[0]
            if len(disagreement_indices) > p * 0.5:
                previous_scores[-p:] = new_previous_scores
            
            # Write the (possibly replaced) tail back into the global score list.
            scores[-p:] = previous_scores[-p:]
            current_scores = score[p:]
        else:
            current_scores = score

        scores.extend(current_scores)
        previous_scores = current_scores

        t1 = time()
        total_time += t1 - t0

    
    scores = np.array(scores)
    # Plot figure
    #plotFig(ts['data'], ts['label'], scores, ts['global_sliding_window'], fileName=ts['name'] + ' ' + loaded_dict[ts['name']][0], modelName=modelName)

    # Calculate the results (evaluated with the window length estimated on the whole series)
    L = printResult(ts['data'], ts['label'], scores, ts['global_sliding_window'], ts['name'], modelName)
    #L = [ '%.2f' % elem for elem in L]
    #results.append([filename] + L)
    results.append([filename] + L + [total_time, x_data_partitions_len])

In [39]:
# Layout of one result row as appended by the evaluation loop: the series
# identifier, the metric values returned by printResult, then the accumulated
# fit time and the total number of training windows.
columns = [
    'Name',
    'AUC', 'Precision', 'Recall', 'F-score',
    'Range-recall', 'ExistenceReward', 'OverlapReward',
    'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC',
    'Time', 'Number of Windows',
]
iforest_res = pd.DataFrame(data=results, columns=columns)

In [40]:
# For every series, sum its label vector (looked up by name in
# preprocessed_dict) and store the total next to the metrics.
# NOTE(review): this equals the anomaly count only if labels are 0/1 — confirm.
iforest_res['Number of anomalies'] = iforest_res['Name'].apply(
    lambda series_name: np.sum(preprocessed_dict[series_name]['label'])
)

# Display just the headline columns of the result table.
iforest_res.loc[:, ['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.643103,0.0,92.059422,390369
1,ECG1_20k,0.680255,0.001481,8.076657,33876
2,IOPS1,0.474765,0.0,3.985141,11219
3,SMD1,0.498094,0.024499,8.856705,48238
4,Occupancy1,0.804776,0.0,0.813956,3169
5,ECG1+IOPS1,0.590247,0.0,12.332971,46367
6,SMD1+Occupancy1,0.595825,0.06874,9.912819,52664
7,ECG1+IOPS1+Occupancy1,0.689718,0.0,12.827962,50322
8,SMD1+ECG1+Occupancy1,0.598437,0.0,18.232967,86849
9,ECG1+IOPS1+SMD1+Occupancy1,0.54907,0.0,22.739442,100196


In [53]:
# Render the headline columns as a LaTeX table, omitting the row index.
print(
    iforest_res
    .loc[:, ['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]
    .to_latex(index=False)
)

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &      Time &  Number of Windows \\
\midrule
                      ECG1 & 0.726683 &     0.007629 & 74.442923 &             342494 \\
                  ECG1\_20k & 0.736483 &     0.007407 &  6.702948 &              30106 \\
                     IOPS1 & 0.527711 &     0.000000 &  2.221043 &               8210 \\
                      SMD1 & 0.599773 &     0.122123 &  2.909586 &              29715 \\
                Occupancy1 & 0.861513 &     0.000000 &  0.485179 &               2417 \\
                ECG1+IOPS1 & 0.764026 &     0.044268 &  6.173537 &              37882 \\
           SMD1+Occupancy1 & 0.696195 &     0.125477 &  4.302852 &              32638 \\
     ECG1+IOPS1+Occupancy1 & 0.796491 &     0.093902 &  7.489961 &              40404 \\
      SMD1+ECG1+Occupancy1 & 0.626209 &     0.029256 & 12.268638 &              67581 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.735903 &     0.096767 & 15.090683 &  

#### ***Use only for Dataset A***

In [98]:
# Write the current result table to disk without the row index. This replaces
# any existing file at that path.
iforest_res.to_csv(
    path_or_buf='Results/Isolation-Forest/IForest_Streaming_Dynamic_Partitioning_Variant.csv',
    index=False,
)

In [99]:
# Non-streaming baseline: set its first column aside as the row identifier,
# then keep only the remaining columns.
iforest_orig_res = pd.read_csv('Results/Isolation-Forest/IForest_Non-Streaming.csv')
filenames_col = iforest_orig_res.iloc[:, 0]
iforest_orig_res = iforest_orig_res.iloc[:, 1:]

# Streaming variants: load each table and drop its first column so that all
# four frames hold only the remaining columns.
iforest_stream_var1_res = pd.read_csv(
    'Results/Isolation-Forest/IForest_Streaming_Naive_Variant.csv'
).iloc[:, 1:]
iforest_stream_var2_res = pd.read_csv(
    'Results/Isolation-Forest/IForest_Streaming_Batch_History_Variant.csv'
).iloc[:, 1:]
iforest_stream_var3_res = pd.read_csv(
    'Results/Isolation-Forest/IForest_Streaming_Dynamic_Partitioning_Variant.csv'
).iloc[:, 1:]

In [None]:
# Element-wise difference between the third and the second streaming variant
# (positive values mean variant 3 scored higher on that column).
res_diff = iforest_stream_var3_res.sub(iforest_stream_var2_res)

# Put the identifier column back in front so each row can be read by name.
res_diff.insert(loc=0, column='Filename', value=filenames_col)

# Apply highlight_diff cell by cell to every column except 'Filename'.
# After this line res_diff is a Styler, not a DataFrame.
res_diff = res_diff.style.applymap(
    highlight_diff,
    subset=pd.IndexSlice[:, res_diff.columns[1:]],
)

res_diff