In [1]:
import os
import random
import glob
import pandas as pd
import numpy as np


In [2]:
# Move up to the parent folder so that the relative "data/..." paths used below resolve.
# NOTE: not idempotent - re-running this cell moves up one more level each time.
%cd .. 

c:\Users\maher\Unsupervised-anomaly-detection-on-noisy-time-series-data-for-accurate-load-forecasting


Loading and train/test splitting

In [3]:
# --- Input ---------------------------------------------------------------
feature_name = "conso_global"                  # CSV column read from every file
csv_data_path = "data/inpg_dataset/csv_data/"  # folder scanned for *.csv files

# --- Sliding windows -----------------------------------------------------
step = 24             # offset between the starts of two consecutive windows
window_size = 3 * 24  # number of consecutive values in one window

# --- Split boundaries, as fractions of the total number of windows -------
# Windows are split by position: the first `clean_contam_split` share is kept
# clean, everything from there up to the `train_test_split` share becomes the
# train set, and the remainder becomes the test set.
clean_contam_split, train_test_split = 0.1, 0.7

In [4]:
X = []
# Sort the paths: glob returns them in a filesystem-dependent order, and the
# order of the windows decides which ones land in the clean / train / test
# slices taken by position further down.
csv_paths = sorted(glob.iglob(os.path.join(csv_data_path, "*.csv")))

n_skipped_windows = 0
for csv_path in csv_paths:
    try:
        csv_file = pd.read_csv(csv_path)
    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Still best effort (an unreadable file does not abort the run), but
        # the skip is reported instead of being swallowed silently.
        print(f"Skipping '{csv_path}': {e}")
        continue

    serie = csv_file[feature_name].values
    # Slide a window of `window_size` values over the series, moving `step`
    # values at a time; a series shorter than one window yields nothing.
    for start in range(0, len(serie) - window_size + 1, step):
        sliding_window = serie[start: start + window_size]
        window_std = sliding_window.std()
        if not window_std > 0:
            # Constant window (std == 0) or window containing NaN: the
            # standardisation below would fill it with NaN/inf, so drop it.
            n_skipped_windows += 1
            continue
        # Per-window standardisation (zero mean, unit std) - prevents exploding gradients
        X.append((sliding_window - sliding_window.mean()) / window_std)

if n_skipped_windows:
    print(f"Skipped {n_skipped_windows} window(s) with zero or undefined standard deviation.")
if not X:
    raise ValueError(
        f"No usable window of size {window_size} found in '{csv_data_path}' "
        f"for feature '{feature_name}'."
    )
X = np.stack(X)

# Positional split: [0, N) clean, [N, M) train, [M, end) test.
N = int(clean_contam_split*X.shape[0])
M = int(train_test_split*X.shape[0])

clean_data = X[0:N, :]
train_data = X[N:M, :]
test_data = X[M:, :]

Injecting anomalies into both train and test data

In [5]:
contam_prob = 0.05  # probability that a given window receives injected anomalies
min_nbr_anom = 5    # lower bound (inclusive) on anomalies injected into a contaminated window
max_nbr_anom = 10   # upper bound (inclusive) on anomalies injected into a contaminated window

# The injection cells below draw from the global `random` generator; seed it
# here so that running the notebook top to bottom gives the same contaminated
# dataset every time. Re-run this cell before re-running the injection cells.
random_seed = 42
random.seed(random_seed)

In [6]:
train_contam_data = []
train_anom_idx = []  # for each window, the positions of the injected anomalies ([] if none)
for sample_idx in range(train_data.shape[0]):
    sample = train_data[sample_idx, :].copy()  # copy: train_data itself must stay untouched
    sample_anom_idx = []
    is_contam = random.random()<=contam_prob
    if is_contam:
        # Largest absolute value of the clean window; anomalies are drawn with
        # a magnitude in [peak_value, 2*peak_value).
        peak_value = max(sample.max(), abs(sample.min()))
        nbr_anom = min(random.randint(min_nbr_anom, max_nbr_anom), len(sample))
        # Distinct positions, so the window really gets `nbr_anom` anomalies
        # and the ground truth holds no duplicate index.
        sample_anom_idx = random.sample(range(len(sample)), nbr_anom)
        for idx in sample_anom_idx:
            # Random sign. The previous `-1**random.randint(0, 1)` parsed as
            # -(1**k), i.e. always -1, so every anomaly was a negative spike.
            sign = random.choice((-1, 1))
            sample[idx] = sign*(peak_value+random.random()*peak_value)
    train_contam_data.append(sample)
    train_anom_idx.append(sample_anom_idx)


In [7]:
test_contam_data = []
test_anom_idx = [] # indices of where the anomalies are for each sequence ([] if none)
for sample_idx in range(test_data.shape[0]):
    sample = test_data[sample_idx, :].copy()  # copy: test_data itself must stay untouched
    sample_anom_idx = []
    is_contam = random.random()<=contam_prob
    if is_contam:
        # Largest absolute value of the clean window; anomalies are drawn with
        # a magnitude in [peak_value, 2*peak_value).
        peak_value = max(sample.max(), abs(sample.min()))
        nbr_anom = min(random.randint(min_nbr_anom, max_nbr_anom), len(sample))
        # Distinct positions, so the window really gets `nbr_anom` anomalies
        # and the ground truth holds no duplicate index.
        sample_anom_idx = random.sample(range(len(sample)), nbr_anom)
        for idx in sample_anom_idx:
            # Random sign. The previous `-1**random.randint(0, 1)` parsed as
            # -(1**k), i.e. always -1, so every anomaly was a negative spike.
            sign = random.choice((-1, 1))
            sample[idx] = sign*(peak_value+random.random()*peak_value)
    test_contam_data.append(sample)
    test_anom_idx.append(sample_anom_idx)

Saving dataset

In [8]:
def delete_files_in_directory(directory_path):
    """Remove every file found under ``directory_path``, subfolders included.

    Only files are removed; the folders themselves are left in place. Any
    exception raised along the way stops the sweep and is printed instead of
    being propagated. Returns ``None``.
    """
    try:
        # Lazily enumerate the full path of each file met while walking the tree.
        doomed_files = (
            os.path.join(folder, filename)
            for folder, _, filenames in os.walk(directory_path)
            for filename in filenames
        )
        for target in doomed_files:
            os.remove(target)
        print(f"All files in '{directory_path}' have been deleted.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Empty the output folders before new files are written.
# NOTE(review): this notebook only writes into the inpg folder further down;
# confirm that wiping the IRISE folder from here is intended.
dirs_to_clear = (
    # "data/aemo_dataset/npy_data/",
    "data/inpg_dataset/npy_data/",
    "data/IRISE_dataset/npy_data/",
)
for path in dirs_to_clear:
    delete_files_in_directory(path)

All files in 'data/inpg_dataset/npy_data/' have been deleted.
All files in 'data/IRISE_dataset/npy_data/' have been deleted.


In [9]:
npy_data_path = "data/inpg_dataset/npy_data/"

# np.save does not create missing folders, so make sure the whole output tree
# exists first (a fresh checkout may not have it).
for sub_dir in (
    "clean",
    os.path.join("train", "data"),
    os.path.join("train", "gt"),
    os.path.join("test", "data"),
    os.path.join("test", "gt"),
):
    os.makedirs(os.path.join(npy_data_path, sub_dir), exist_ok=True)

# One file per window, named after its position in the split; np.save appends ".npy".
for i, sample in enumerate(clean_data): # clean train data to later evaluate forecasting performance
    np.save(os.path.join(npy_data_path, "clean", str(i)), sample)

# The ground-truth index lists are saved with an explicit integer dtype:
# an empty list would otherwise be stored as float64, which cannot be used
# as an index array when loaded back.
for i, (sample, anom_idx) in enumerate(zip(train_contam_data, train_anom_idx)): # contaminated train data
    np.save(os.path.join(npy_data_path, "train", "data", str(i)), sample)
    np.save(os.path.join(npy_data_path, "train", "gt", str(i)), np.asarray(anom_idx, dtype=int))

for i, (sample, anom_idx) in enumerate(zip(test_contam_data, test_anom_idx)):   # contaminated test data
    np.save(os.path.join(npy_data_path, "test", "data", str(i)), sample)
    np.save(os.path.join(npy_data_path, "test", "gt", str(i)), np.asarray(anom_idx, dtype=int))
