# ***Libraries***

In [1]:
import math
import os
import sys
import json
from time import time

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle as pkl


from sklearn.preprocessing import MinMaxScaler

In [2]:
# Absolute path of the directory one level above the current working directory.
parent_dir = os.path.join(os.getcwd(), os.pardir)
parent_dir = os.path.abspath(parent_dir)

In [3]:
# Add the parent directory to the module search path so that packages located
# there can be imported by the cells below.
sys.path.append(parent_dir)

In [4]:
from TSB_UAD.models.distance import Fourier
from TSB_UAD.models.feature import Window
from TSB_UAD.utils.slidingWindows import find_length, plotFig, printResult

from TSB_UAD.models.iforest import IForest

# ***Isolation Forest***


## ***Data Pre-Processing***

### ***Dataset A***
Dataset A utilizes time-series from the following domains:
- Occupancy
- SensorScope
- NAB
- NASA-MSL
- SMD
- YAHOO

In [87]:
# Read the JSON dictionary that describes the generated time series.
with open('Time-Series_Data_Dictionaries/Time-Series-Random-Data-of-Interest-Dictionary.json', 'r') as json_file:
    loaded_dict = json.loads(json_file.read())

Let's see some info about the generated files

In [88]:
# Print one "<key>: <value>" line per entry of the loaded dictionary.
for filename in loaded_dict:
    info = loaded_dict[filename]
    print(f'{filename}: {info}')

ts1: ['Normality_1', 'SensorScope']
ts3: ['Normality_1', 'NASA-MSL']
ts4: ['Normality_1', 'YAHOO']
ts5: ['Normality_1', 'SMD']
ts8: ['Normality_1', 'SMD']
ts2: ['Normality_2', 'SensorScope', 'NAB']
ts9: ['Normality_2', 'Occupancy', 'NASA-MSL']
ts6: ['Normality_3', 'SensorScope', 'YAHOO', 'NASA-MSL']
ts7: ['Normality_3', 'YAHOO', 'NASA-MSL', 'SMD']


### ***Dataset B (Selected)***
Dataset B utilizes the following time-series:
- ECG1
- ECG1_20k
- IOPS1
- SMD1
- Occupancy1
- ECG1+IOPS1
- SMD1+Occupancy1 
- ECG1+IOPS1+Occupancy1
- SMD1+ECG1+Occupancy1
- ECG1+IOPS1+SMD1+Occupancy1

In [5]:
# Load the data for the evaluation.
all_data = []

with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

# Concatenate the evaluation series of every normality level, in a fixed order.
for normality_level in ('single_normality', 'double_normality',
                        'triple_normality', 'quadruple_normality'):
    all_data.extend(data['evaluation'][normality_level])

In [33]:
# Maps a time-series name to its pre-processed artefacts; filled by the cells below.
preprocessed_dict = dict()

### ***Pre-processing for non-streaming***
Simple data pre-processing based on TSB-UAD. It serves as the baseline against which the streaming pre-processing variants are compared.

In [9]:
# Baseline (non-streaming) pre-processing: store one entry per series of
# `all_data` in `preprocessed_dict`, keyed by the series name.
for timeseries in all_data:
    name, data, label = (timeseries[key] for key in ('Name', 'data', 'labels'))
    max_length = data.shape[0]

    # Estimated subsequence length, reused as the sliding-window size.
    slidingWindow = find_length(data)

    # Result of Window.convert() as a NumPy array
    # (presumably one row per sliding window -- confirm against TSB_UAD).
    window_frame = Window(window=slidingWindow).convert(data)
    X_data = window_frame.to_numpy()

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()

    entry = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
    }
    entry['Time series length'] = len(data)
    # Number of labels equal to 1.
    entry['Number of abnormal points'] = list(label).count(1)
    preprocessed_dict[name] = entry

### ***Pre-processing for both naive streaming variant and streaming variant with batch history***

In [20]:
# Set the number of windows to be fit per batch.
windows_per_batch = 150

for timeseries in all_data:
    name, data, label = (timeseries[key] for key in ('Name', 'data', 'labels'))
    max_length = data.shape[0]

    slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # --- Batch the raw samples -------------------------------------------
    # A full batch spans `windows_per_batch + slidingWindow - 1` samples:
    # exactly the samples needed to extract `windows_per_batch` complete
    # windows starting at the batch offset.
    batched_data = []
    n_samples = len(data)
    start = 0
    while start < n_samples:
        stop = start + windows_per_batch + slidingWindow - 1

        # The tail of the series may not fill a whole batch: clip it and
        # remember that this is the final batch.
        is_last_batch = stop > n_samples
        if is_last_batch:
            stop = n_samples

        chunk = data[start:stop]
        # A chunk shorter than one window cannot yield any window: stop here.
        if len(chunk) < slidingWindow:
            break
        batched_data.append(chunk)

        if is_last_batch:
            break

        # The next batch begins one sample after the start of the last window
        # of this batch, so no window is skipped or duplicated.
        start = stop - slidingWindow + 1

    # --- Batch the windows -------------------------------------------------
    # Consecutive groups of `windows_per_batch` windows; the last group may be
    # shorter (slicing past the end is clipped automatically).
    batched_X_data = [
        X_data[offset:offset + windows_per_batch]
        for offset in range(0, len(X_data), windows_per_batch)
    ]

    print(f'Time-Series name: {name}')
    print("Estimated Subsequence length: ", slidingWindow)
    print()

    # Store the pre-processed variables in the new dictionary
    entry = {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
    }
    entry['Time series length'] = len(data)
    entry['Number of abnormal points'] = list(label).count(1)
    preprocessed_dict[name] = entry

Time-Series name: ECG1
Estimated Subsequence length:  100

Time-Series name: ECG1_20k
Estimated Subsequence length:  100

Time-Series name: IOPS1
Estimated Subsequence length:  288

Time-Series name: SMD1
Estimated Subsequence length:  125

Time-Series name: Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1
Estimated Subsequence length:  100

Time-Series name: SMD1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+Occupancy1
Estimated Subsequence length:  100

Time-Series name: SMD1+ECG1+Occupancy1
Estimated Subsequence length:  125

Time-Series name: ECG1+IOPS1+SMD1+Occupancy1
Estimated Subsequence length:  100



### ***Pre-processing for streaming variant with dynamic partitioning (change point detection)***
Naively partitioning the data is not a reliable solution. We want to partition the data as soon as an abrupt change occurs. For that, we can use:
- 1. MinMax range partitioning
- 2. Percentile Partitioning

#### ***MinMax range partitioning***

In [28]:
# MinMax range partitioning.
#
# The first 25% of each series forms the initial partition and provides the
# reference range [min_v, max_v]. While scanning the remaining points, a point
# outside the range widened by `change_point_threshold` flags a change. From
# that point on, `p` points are collected; the running partition is then closed,
# and the reference range is updated if at least `exceed_threshold * p` of the
# collected points were outside the widened range.
for timeseries in all_data:
    name = timeseries['Name']
    data = timeseries['data']
    label = timeseries['labels']
    global_sw = find_length(data)

    initial_partition_length = int(len(data) * 0.25)
    initial_partition = data[:initial_partition_length]

    # Named max_v / min_v (not max / min) so that the builtins max() and min()
    # are not rebound at notebook scope.
    max_v = np.max(initial_partition)
    min_v = np.min(initial_partition)

    data_partitions = [initial_partition]
    current_partition = []
    change_detected = False

    p = 500                       # points collected after a detected change
    change_point_threshold = 0.8  # relative widening of the reference range
    exceed_threshold = 0.5        # fraction of the p points that must be outside
    post_change_points = []

    for point in data[initial_partition_length:]:
        # Bounds are fixed for the whole iteration; the reference range is only
        # updated after the collected points have been checked against them.
        upper_bound = max_v * (1 + change_point_threshold)
        lower_bound = min_v * (1 - change_point_threshold)

        # Check for significant change
        if (point > upper_bound) or (point < lower_bound):
            change_detected = True

        current_partition.append(point)

        # After change, collect additional points
        if change_detected:
            post_change_points.append(point)
            if len(post_change_points) == p:
                exceeds_threshold_points = [
                    (pt > upper_bound or pt < lower_bound)
                    for pt in post_change_points
                ]
                if sum(exceeds_threshold_points) >= exceed_threshold * p:
                    # Move each end of the range towards the mean of the points
                    # that went beyond it.
                    max_v = np.mean([max_v] + [pt for pt in post_change_points if pt > max_v])
                    min_v = np.mean([min_v] + [pt for pt in post_change_points if pt < min_v])

                post_change_points = []

                # Add the current partition to data partitions
                data_partitions.append(np.array(current_partition))
                current_partition = []
                change_detected = False

    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    preprocessed_dict[name] = {
        'name': name,
        'data': data,
        'label': label,
        'data partitions': data_partitions,
        'global_sliding_window': global_sw,
    }

Let's see the number of partitions created for each time-series

In [29]:
# Report how many partitions were created for every pre-processed series.
for filename, ts in preprocessed_dict.items():
    partition_count = len(ts['data partitions'])

    print(f"Number of partitions: {partition_count} for file: {ts['name']}")

Number of partitions: 414 for file: ECG1
Number of partitions: 32 for file: ECG1_20k
Number of partitions: 3 for file: IOPS1
Number of partitions: 2 for file: SMD1
Number of partitions: 2 for file: Occupancy1
Number of partitions: 33 for file: ECG1+IOPS1
Number of partitions: 3 for file: SMD1+Occupancy1
Number of partitions: 34 for file: ECG1+IOPS1+Occupancy1
Number of partitions: 42 for file: SMD1+ECG1+Occupancy1
Number of partitions: 34 for file: ECG1+IOPS1+SMD1+Occupancy1


Are the sizes of the partitions consistent with the original size of the time-series?

In [30]:
# Sanity check: the partition lengths should add up to the series length.
for filename, ts in preprocessed_dict.items():
    par_size = sum(len(partition) for partition in ts['data partitions'])

    print(f"Total size of partitions: {par_size} for file: {ts['name']}. Original data size: {len(ts['data'])}")

Total size of partitions: 229900 for file: ECG1. Original data size: 229900
Total size of partitions: 20000 for file: ECG1_20k. Original data size: 20000
Total size of partitions: 8784 for file: IOPS1. Original data size: 8784
Total size of partitions: 28479 for file: SMD1. Original data size: 28479
Total size of partitions: 2665 for file: Occupancy1. Original data size: 2665
Total size of partitions: 28784 for file: ECG1+IOPS1. Original data size: 28784
Total size of partitions: 31144 for file: SMD1+Occupancy1. Original data size: 31144
Total size of partitions: 31449 for file: ECG1+IOPS1+Occupancy1. Original data size: 31449
Total size of partitions: 51144 for file: SMD1+ECG1+Occupancy1. Original data size: 51144
Total size of partitions: 59928 for file: ECG1+IOPS1+SMD1+Occupancy1. Original data size: 59928


In [None]:
# Plot the partitions of every series that has fewer than 10 of them,
# one subplot per partition.
for filename in preprocessed_dict.keys():
    ts = preprocessed_dict[filename]
    partitions = ts['data partitions']

    # Skip series with too many partitions to display side by side, and
    # series without any partition (plt.subplots needs at least one column).
    if not 0 < len(partitions) < 10:
        continue

    # squeeze=False makes `axes` a 2-D array even for a single subplot, so the
    # indexing below also works when a series has exactly one partition.
    fig, axes = plt.subplots(1, len(partitions), figsize=(20, 5), squeeze=False)

    for i, array in enumerate(partitions):
        ax = axes[0, i]
        ax.plot(array)
        ax.set_title(f"Partition {i+1} of {ts['name']}")

    plt.tight_layout()
    plt.show()

#### ***Percentile Partitioning***

In [34]:
# Percentile partitioning.
#
# The 5th/95th percentile band is seeded from the first 10% of the points
# labelled normal. The whole series is then scanned: a point outside the band
# flags a change, `p` points are collected from there, the running partition is
# closed, and the band is recomputed from the collected points when at least
# `exceed_threshold` of them were outside it.
for timeseries in all_data:
    name, data, label = (timeseries[key] for key in ('Name', 'data', 'labels'))
    max_length = data.shape[0]
    global_sw = find_length(data)

    # Filter the normal points (label == 0) and keep the first 10% of them.
    normal_indices = [idx for idx, lbl in enumerate(label) if lbl == 0]
    normal_data = data[normal_indices]
    normal_data_par_length = int(len(normal_data) * 0.10)
    normal_data = normal_data[:normal_data_par_length]

    # Compute initial percentiles
    percentile_5 = np.percentile(normal_data, 5)
    percentile_95 = np.percentile(normal_data, 95)

    data_partitions = []
    current_partition = []
    post_change_points = []
    change_detected = False
    p = 500
    exceed_threshold = 0.5

    for point in data:
        current_partition.append(point)

        # Check for significant change
        if point < percentile_5 or point > percentile_95:
            change_detected = True

        # Nothing more to do until a change has been flagged.
        if not change_detected:
            continue

        # After change, collect additional points
        post_change_points.append(point)
        if len(post_change_points) != p:
            continue

        outside_count = sum(
            1 for pt in post_change_points
            if pt < percentile_5 or pt > percentile_95
        )
        if outside_count / p >= exceed_threshold:
            # Update percentiles
            new_percentile_5 = np.percentile(post_change_points, 5)
            new_percentile_95 = np.percentile(post_change_points, 95)
            percentile_5, percentile_95 = new_percentile_5, new_percentile_95

        # Add the current partition to data partitions and start a new one.
        data_partitions.append(np.array(current_partition))
        current_partition, post_change_points = [], []
        change_detected = False

    # Add any remaining points in current_partition to data_partitions
    if current_partition:
        data_partitions.append(np.array(current_partition))

    preprocessed_dict[name] = dict([
        ('name', name),
        ('data', data),
        ('label', label),
        ('data partitions', data_partitions),
        ('global_sliding_window', global_sw),
    ])

## ***Anomaly Detection***


### ***Isolation Forest***(Variant 2)
Streaming variant with batch history

In [22]:
# Model label passed to printResult() in the cells below.
modelName = "IForest"

In [24]:
results = []

# Isolation Forest with batch history: each batch of windows is fitted together
# with the batch before it, and the scores of the previous batch are replaced by
# the mean of their old and newly computed values.
for filename, ts in preprocessed_dict.items():
    x = ts['X_data']
    batches = ts['batched_X_data']
    clf = IForest(n_jobs=7, random_state=42)
    scores = []
    previous_scores = None
    total_time = 0

    for i, batch in enumerate(batches):
        # First batch is fitted alone; later ones together with their predecessor.
        X_train = batch if i == 0 else np.concatenate((batches[i-1], batch))

        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        if i > 0:
            # The leading part of `score` belongs to the previous batch:
            # average it with the scores that batch received one step earlier.
            previous_partition_length = len(batches[i-1])
            new_previous_scores = score[:previous_partition_length]
            mean_previous_scores = (previous_scores + new_previous_scores) / 2
            scores[-previous_partition_length:] = mean_previous_scores.tolist()

        # The trailing part of `score` belongs to the current batch.
        current_partition_length = len(batch)
        current_scores = score[-current_partition_length:]
        scores.extend(current_scores)
        previous_scores = current_scores

        t1 = time()
        total_time += t1 - t0

    # Rescale the scores to [0, 1].
    scores = np.array(scores).reshape(-1, 1)
    scores = MinMaxScaler(feature_range=(0, 1)).fit_transform(scores).ravel()

    # Repeat the first and last score so that slidingWindow - 1 values are
    # added in total (ceil half in front, floor half at the end).
    leading_pad = math.ceil((ts['slidingWindow']-1)/2)
    trailing_pad = (ts['slidingWindow']-1)//2
    scores = np.array([scores[0]] * leading_pad +
                      scores.tolist() +
                      [scores[-1]] * trailing_pad)

    # Calculate the results
    L = printResult(ts['data'], ts['label'], scores, ts['slidingWindow'], ts['name'], modelName)
    results.append([filename] + L + [total_time, len(x)])

In [25]:
# Column layout of `results`: series name, the metrics returned by
# printResult(), then the fit time and the number of windows.
metric_columns = ['AUC', 'Precision', 'Recall', 'F-score', 'Range-recall', 'ExistenceReward', 'OverlapReward', 'Range-precision', 'Range-Fscore', 'Precision@k', 'RangeAUC']
columns = ['Name', *metric_columns, 'Time', 'Number of Windows']
iforest_res = pd.DataFrame(data=results, columns=columns)

In [26]:
def _count_anomalies(series_name):
    # Sum of the label array stored for the given series.
    return np.sum(preprocessed_dict[series_name]['label'])


iforest_res['Number of anomalies'] = iforest_res['Name'].apply(_count_anomalies)
# Display the summary columns.
iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]

Unnamed: 0,Name,AUC,Precision@k,Time,Number of Windows
0,ECG1,0.83255,0.155556,243.160621,229801
1,ECG1_20k,0.831822,0.371852,21.308003,19901
2,IOPS1,0.48198,0.0,10.723807,8497
3,SMD1,0.3207,0.0,31.594669,28355
4,Occupancy1,0.763412,0.065844,2.820671,2541
5,ECG1+IOPS1,0.668095,0.0,31.753022,28685
6,SMD1+Occupancy1,0.478901,0.039825,34.556695,31020
7,ECG1+IOPS1+Occupancy1,0.761818,0.031301,33.960795,31350
8,SMD1+ECG1+Occupancy1,0.623556,0.041465,56.347982,51020
9,ECG1+IOPS1+SMD1+Occupancy1,0.574619,0.017594,64.708527,59829


In [27]:
# Export the summary columns as a LaTeX table (row index omitted).
summary_table = iforest_res[['Name', 'AUC', 'Precision@k', 'Time', 'Number of Windows']]
print(summary_table.to_latex(index=False))

\begin{tabular}{lrrrr}
\toprule
                      Name &      AUC &  Precision@k &       Time &  Number of Windows \\
\midrule
                      ECG1 & 0.832550 &     0.155556 & 243.160621 &             229801 \\
                  ECG1\_20k & 0.831822 &     0.371852 &  21.308003 &              19901 \\
                     IOPS1 & 0.481980 &     0.000000 &  10.723807 &               8497 \\
                      SMD1 & 0.320700 &     0.000000 &  31.594669 &              28355 \\
                Occupancy1 & 0.763412 &     0.065844 &   2.820671 &               2541 \\
                ECG1+IOPS1 & 0.668095 &     0.000000 &  31.753022 &              28685 \\
           SMD1+Occupancy1 & 0.478901 &     0.039825 &  34.556695 &              31020 \\
     ECG1+IOPS1+Occupancy1 & 0.761818 &     0.031301 &  33.960795 &              31350 \\
      SMD1+ECG1+Occupancy1 & 0.623556 &     0.041465 &  56.347982 &              51020 \\
ECG1+IOPS1+SMD1+Occupancy1 & 0.574619 &     0.017594 &  64

## Isolation Forest Tuning

In [None]:
# Load the data for the evaluation.
all_data = []

with open('dataset.pkl', 'rb') as f:
    data = pkl.load(f)

# Normality levels, in the order in which their series are concatenated.
normality_levels = ('single_normality', 'double_normality',
                    'triple_normality', 'quadruple_normality')

for level in normality_levels:
    all_data.extend(data['evaluation'][level])
# Look-up table: evaluation series name -> evaluation series.
name_to_eval_series = dict((entry['Name'], entry) for entry in all_data)

tuning_data = []
for level in normality_levels:
    tuning_data.extend(data['tuning'][level])
# Look-up table: tuning series name -> tuning series.
name_to_tune_series = dict((entry['Name'], entry) for entry in tuning_data)

# Set the number of windows to be fit per batch.
windows_per_batch = 150

def _batch_samples(data, slidingWindow, n_windows):
    """Split raw samples into batches that each hold `n_windows` complete windows.

    Returns a pair `(batches, history_batches)`. `history_batches[k]` covers the
    same samples as `batches[k]` and additionally starts at the beginning of
    batch `k - 1` (for `k == 0` both are identical).
    """
    batches = []
    history_batches = []
    n_samples = len(data)
    start = 0
    previous_start = 0

    while start < n_samples:
        # A full batch needs slidingWindow samples for its first window plus
        # one more sample for each of the remaining n_windows - 1 windows.
        stop = start + n_windows + slidingWindow - 1

        # Guard against the end of the series, where a full batch cannot be formed.
        is_last_batch = stop > n_samples
        if is_last_batch:
            stop = n_samples

        # Guard against a batch that cannot hold even one window.
        if len(data[start:stop]) < slidingWindow:
            break

        batches.append(data[start:stop])
        history_batches.append(data[previous_start:stop])
        previous_start = start

        if is_last_batch:
            break

        # The next batch begins one sample after the start of the last window
        # of this batch.
        start = stop - slidingWindow + 1

    return batches, history_batches


def _minmax_partitions(data, p=500, change_point_threshold=0.5, exceed_threshold=0.65):
    """Partition `data` at changes of its min/max range.

    The first 25% of `data` is the initial partition and defines the reference
    range. A later point outside the range widened by `change_point_threshold`
    flags a change; `p` points are then collected, the running partition is
    closed, and the range is updated when at least `exceed_threshold * p` of the
    collected points were outside the widened range.
    """
    initial_partition_length = int(len(data) * 0.25)
    initial_partition = data[:initial_partition_length]

    max_v = np.max(initial_partition)
    min_v = np.min(initial_partition)

    partitions = [initial_partition]
    current_partition = []
    post_change_points = []
    change_detected = False

    for point in data[initial_partition_length:]:
        # Bounds stay fixed during the iteration; the range is only updated
        # after the collected points have been checked against them.
        upper_bound = max_v * (1 + change_point_threshold)
        lower_bound = min_v * (1 - change_point_threshold)

        if point > upper_bound or point < lower_bound:
            change_detected = True

        current_partition.append(point)

        if change_detected:
            post_change_points.append(point)
            if len(post_change_points) == p:
                outside = [(pt > upper_bound or pt < lower_bound) for pt in post_change_points]
                if sum(outside) >= exceed_threshold * p:
                    max_v = np.mean([max_v] + [pt for pt in post_change_points if pt > max_v])
                    min_v = np.mean([min_v] + [pt for pt in post_change_points if pt < min_v])

                partitions.append(np.array(current_partition))
                current_partition = []
                post_change_points = []
                change_detected = False

    # Keep whatever is left after the last closed partition.
    if current_partition:
        partitions.append(np.array(current_partition))

    return partitions


def _percentile_partitions(data, p=500, exceed_threshold=0.5):
    """Partition `data` at changes of its 5th-95th percentile band.

    The first 25% of `data` is the initial partition and defines the band. A
    later point outside the band flags a change; `p` points are then collected,
    the running partition is closed, and the band is recomputed from the
    collected points when at least `exceed_threshold` of them were outside it.
    """
    initial_partition_length = int(len(data) * 0.25)
    initial_partition = data[:initial_partition_length]

    percentile_5 = np.percentile(initial_partition, 5)
    percentile_95 = np.percentile(initial_partition, 95)

    # Seed with the initial partition (as the min/max variant does) so that the
    # partitions together cover the whole series.
    partitions = [initial_partition]
    current_partition = []
    post_change_points = []
    change_detected = False

    for point in data[initial_partition_length:]:
        if point < percentile_5 or point > percentile_95:
            change_detected = True

        current_partition.append(point)

        if change_detected:
            post_change_points.append(point)
            if len(post_change_points) == p:
                outside = [(pt < percentile_5 or pt > percentile_95) for pt in post_change_points]
                if sum(outside) / p >= exceed_threshold:
                    percentile_5 = np.percentile(post_change_points, 5)
                    percentile_95 = np.percentile(post_change_points, 95)

                partitions.append(np.array(current_partition))
                current_partition = []
                post_change_points = []
                change_detected = False

    # Keep whatever is left after the last closed partition.
    if current_partition:
        partitions.append(np.array(current_partition))

    return partitions


def preprocess_series(series, slidingWindow=None, verbose=True):
    """Build every pre-processed representation of one time series.

    Parameters
    ----------
    series : mapping
        Must provide the keys 'Name', 'data' and 'labels'.
    slidingWindow : int, optional
        Window length to use. When None, it is estimated with `find_length`.
    verbose : bool
        When True, print the series name and the window length.

    Returns
    -------
    dict
        Keys: 'name', 'data', 'label', 'slidingWindow', 'X_data',
        'batched_X_data', 'batched_data', 'points_per_batch' (length of the
        first sample batch, 0 when no batch could be formed),
        'history_batched_data', 'dynamic_partitioning_batches',
        'percentile_dynamic_partitioning_batches', 'Time series length',
        'Number of abnormal points'.

    Notes
    -----
    Reads the module-level `windows_per_batch`.
    """
    # Read the inputs from the `series` argument (not from a notebook global).
    name = series['Name']
    data = series['data']
    label = series['labels']

    if slidingWindow is None:
        slidingWindow = find_length(data)
    X_data = Window(window=slidingWindow).convert(data).to_numpy()

    # Raw samples batched for the naive variant and for the history variant.
    batched_data, batched_data_previous_access = _batch_samples(
        data, slidingWindow, windows_per_batch
    )

    # Windows grouped in consecutive chunks of `windows_per_batch`
    # (the last chunk may be shorter).
    batched_X_data = [
        X_data[offset:offset + windows_per_batch]
        for offset in range(0, len(X_data), windows_per_batch)
    ]

    # Dynamic partitioning, min/max and percentile variants.
    data_partitions = _minmax_partitions(data)
    percentile_data_partitions = _percentile_partitions(data)

    if verbose:
        print(f'Time-Series name: {name}')
        print("Estimated Subsequence length: ", slidingWindow)
        print()

    return {
        'name': name,
        'data': data,
        'label': label,
        'slidingWindow': slidingWindow,
        'X_data': X_data,
        'batched_X_data': batched_X_data,
        'batched_data': batched_data,
        # No batch exists when the series is shorter than one window.
        'points_per_batch': len(batched_data[0]) if batched_data else 0,
        'history_batched_data': batched_data_previous_access,
        'dynamic_partitioning_batches': data_partitions,
        'percentile_dynamic_partitioning_batches': percentile_data_partitions,
        'Time series length': len(data),
        'Number of abnormal points': list(label).count(1)
    }

In [None]:
from collections import defaultdict
from time import sleep

from tqdm import tqdm


def _score_with_batch_history(ts, n_estimators, n_jobs=10):
    """Score one pre-processed series with the batch-history Isolation Forest.

    Each batch of windows is fitted together with the batch before it; the
    scores of the previous batch are replaced by the mean of their old and
    newly computed values. The accumulated scores are rescaled to [0, 1] and
    padded with `slidingWindow - 1` repeated edge values.

    Returns `(scores, total_time)`, where `total_time` is the summed
    wall-clock time of the fit-and-merge steps.
    """
    clf = IForest(n_jobs=n_jobs, random_state=42, n_estimators=n_estimators)
    batches = ts['batched_X_data']
    scores = []
    previous_scores = None
    total_time = 0

    for i, batch in enumerate(batches):
        X_train = batch if i == 0 else np.concatenate((batches[i-1], batch))

        t0 = time()
        clf.fit(X_train)
        score = clf.decision_scores_

        if i > 0:
            previous_partition_length = len(batches[i-1])
            new_previous_scores = score[:previous_partition_length]
            mean_previous_scores = (previous_scores + new_previous_scores) / 2
            scores[-previous_partition_length:] = mean_previous_scores.tolist()

        current_scores = score[-len(batch):]
        scores.extend(current_scores)
        previous_scores = current_scores

        total_time += time() - t0

    # Normalise the accumulated scores of ALL batches (not only the last fit).
    scores = np.array(scores)
    scores = MinMaxScaler(feature_range=(0, 1)).fit_transform(scores.reshape(-1, 1)).ravel()
    scores = np.array([scores[0]] * math.ceil((ts['slidingWindow']-1)/2) +
                      list(scores) +
                      [scores[-1]] * ((ts['slidingWindow']-1)//2))
    return scores, total_time


results = []

# Parameters for tuning.
param_grid = {
    # Using the estimated window length from the autocorrelation, define alternate window sizes as fractions/multiples of that.
    'window_length_modifier': [0.1, 0.5, 1.0, 1.5, 2.0, 5.0],
    # Number of trees in the isolation forest.
    'n_estimators': [10, 20, 50, 100, 200, 500],
}

params_to_AUC = defaultdict(dict)

# Number of parameter combinations (np.product no longer exists in NumPy 2.0).
total = math.prod(len(values) for values in param_grid.values())

for timeseries in (progress := tqdm(tuning_data)):
    name = timeseries['Name']

    progress.set_description(name)

    # Estimate the base window once per series; every modifier is applied to
    # this value rather than to a previously modified window.
    base_window = find_length(timeseries['data'])

    c = 0
    best_AUC = 0
    # Initial Best parameters are the defaults.
    best_params = (1, 100)
    for window_length_modifier in param_grid['window_length_modifier']:
        # Prevent too small windows.
        window_size = max(10, int(window_length_modifier * base_window))

        # Pre-processing depends on the window size only, so it is done once
        # per window size instead of once per parameter combination.
        ts = preprocess_series(series=timeseries, slidingWindow=window_size, verbose=False)

        for n_estimators in param_grid['n_estimators']:
            score, _ = _score_with_batch_history(ts, n_estimators)

            AUC = printResult(ts['data'], ts['label'], score, window_size, ts['name'], modelName)[0]

            params_to_AUC[name][(window_length_modifier, n_estimators)] = AUC

            if AUC > best_AUC:
                best_AUC = AUC
                best_params = (window_length_modifier, n_estimators)

            c += 1
            print(f"\r[{c}/{total}]{name}  --  Best AUC = {best_AUC} for: {best_params}", end='')
    print()
    print(f"{name}  --  Best AUC = {best_AUC} for: {best_params}")

    # Evaluate the matching evaluation time series with the selected parameters.
    eval_series_name = ''.join([n if n!='2' else '1' for n in name]).replace('10k', '20k')  # Replace 2s with 1s and fix 20k becoming 10k accidentally.
    eval_series = name_to_eval_series[eval_series_name]

    # The modifier is relative, so apply it to the evaluation series' own
    # estimated window length.
    window_size = max(10, int(find_length(eval_series['data']) * best_params[0]))
    n_estimators = best_params[1]

    ts = preprocess_series(series=eval_series, slidingWindow=window_size, verbose=False)

    x = ts['X_data']
    score, total_time = _score_with_batch_history(ts, n_estimators)

    L = printResult(ts['data'], ts['label'], score, window_size, ts['name'], modelName)
    print(f"{eval_series_name}  --  Eval AUC = {L[0]}")
    # Label the row with the series that was actually scored and report the
    # accumulated time of all batches.
    results.append([ts['name']] + L + [total_time, len(x)])

    print()
    print('----------------------------------------------------------------')
    sleep(1)