In [1]:
import os
import random
import glob
import pandas as pd
import numpy as np


In [2]:
# Step up to the parent directory; the relative "data/..." paths used in later cells resolve from there.
# NOTE: this is not idempotent - re-running the cell moves up one more level.
%cd .. 

c:\Users\maher\Unsupervised-anomaly-detection-on-noisy-time-series-data-for-accurate-load-forecasting


Collecting data from AEMO

In [None]:
import subprocess

# Download one monthly price-and-demand CSV per (year, month, region) from AEMO.
# NOTE(review): files land in ./aemo_dataset while the loading cell reads from
# "data/aemo_csv_data/" - confirm which folder is the intended one.
output_dir = "./aemo_dataset"
os.makedirs(output_dir, exist_ok=True)  # curl cannot write into a missing folder

for year in range(2000, 2021+1): 
    for month in ['01','02','03','04','05','06','07','08','09','10','11','12']:
        for location in ["NSW", "QLD", "VIC", "SA", "TAS"]:
            # The URL must vary with month and region; it used to be pinned to
            # "{year}06_NSW1", so every file was a copy of the NSW June data.
            # Region ids in the file name are the state code followed by "1" (e.g. NSW1).
            url = f"https://aemo.com.au/aemo/data/nem/priceanddemand/PRICE_AND_DEMAND_{year}{month}_{location}1.csv"
            output_path = f"{output_dir}/{year}{month}{location}.csv"
            # --fail turns HTTP errors into a non-zero exit code, so a missing month is
            # reported below instead of the server's error page being saved as a .csv.
            # Passing an argument list avoids going through the shell.
            curl_command = ["curl", "--fail", "--silent", "--show-error", "--output", output_path, url]
            result = subprocess.run(curl_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            if result.returncode != 0:
                print("Command failed:")
                print(result.stderr)
            else:
                print(f"Downloaded {year}-{month}-{location}.csv")


Loading and train/test splitting

In [3]:
# Folder scanned for the raw "*.csv" files, plus the sliding-window settings.
data_path = "data/aemo_csv_data/"
train_test_split = 0.7  # fraction of the extracted windows that goes to the train split
window_size = 48*5      # samples per window (presumably 5 days of 48 half-hourly readings - confirm)
step = 48               # stride, in samples, between the starts of consecutive windows

In [4]:
# Cut every TOTALDEMAND series into z-normalised sliding windows, then split them.
X = []
# Sort the paths: glob order is filesystem-dependent, and the train/test split
# below depends on the order in which the windows are collected.
csv_paths = sorted(glob.iglob(os.path.join(data_path, "*.csv")))
for csv_path in csv_paths:
    try:
        csv_file = pd.read_csv(csv_path)
        serie = csv_file["TOTALDEMAND"].values
    except (OSError, ValueError, KeyError) as err:
        # Unreadable/malformed file (pandas parser errors are ValueErrors) or a file
        # without the demand column: skip it, but say so instead of hiding it.
        print(f"Skipping '{csv_path}': {err!r}")
        continue
    # Window starts are 0, step, 2*step, ... while the window still fits in the series.
    for start in range(0, len(serie) - window_size + 1, step):
        sliding_window = serie[start: start + window_size]
        window_std = sliding_window.std()
        if not window_std > 0:
            # Constant (std == 0) or NaN-containing windows cannot be normalised;
            # keeping them would put NaN/inf samples into the dataset.
            continue
        sliding_window = (sliding_window-sliding_window.mean())/window_std # prevents exploding gradients
        X.append(sliding_window)

if not X:
    raise ValueError(f"No usable window of {window_size} samples found in the CSV files under '{data_path}'")
X = np.stack(X)

N = int(train_test_split*X.shape[0])
train_data = X[0:N, :] # clean data
test_data = X[N:, :]   # for evaluating reconstruction of unseen data

Adding anomalies to both train and test data

In [5]:
# Settings for the synthetic anomalies injected into the windows.
contam_prob = 0.1   # probability that a given window gets contaminated
min_nbr_anom = 5    # fewest anomalous points drawn for a contaminated window (inclusive)
max_nbr_anom = 10   # most anomalous points drawn for a contaminated window (inclusive)

In [6]:
# Inject point anomalies into a random subset of the training windows.
train_contam_data = []
train_anom_idx = [] # indices of where the anomalies are for each sequence
for sample_idx in range(train_data.shape[0]):
    sample = train_data[sample_idx, :].copy()  # copy so train_data itself stays clean
    peak_value = max(sample.max(), abs(sample.min()))  # largest magnitude in the clean window
    sample_anom_idx = []
    is_contam = random.random()<=contam_prob
    if is_contam:
        nbr_anom = min(random.randint(min_nbr_anom, max_nbr_anom), len(sample))
        # Distinct positions, so the ground truth holds no duplicates and the
        # window really contains nbr_anom anomalies.
        sample_anom_idx = random.sample(range(len(sample)), nbr_anom)
        for idx in sample_anom_idx:
            # Random sign. The previous `*-1**random.randint(0, 1)` parsed as
            # `*-(1**k)` and therefore always produced negative spikes.
            sign = random.choice((-1, 1))
            # Magnitude lies between 1x and 2x the window's peak.
            sample[idx] = sign*(peak_value+random.random()*peak_value)
    train_contam_data.append(sample)
    train_anom_idx.append(sample_anom_idx)


In [7]:
# Inject point anomalies into a random subset of the test windows.
test_contam_data = []
test_anom_idx = [] # indices of where the anomalies are for each sequence
for sample_idx in range(test_data.shape[0]):
    sample = test_data[sample_idx, :].copy()  # copy so test_data itself stays clean
    peak_value = max(sample.max(), abs(sample.min()))  # largest magnitude in the clean window
    sample_anom_idx = []
    is_contam = random.random()<=contam_prob
    if is_contam:
        nbr_anom = min(random.randint(min_nbr_anom, max_nbr_anom), len(sample))
        # Distinct positions, so the ground truth holds no duplicates and the
        # window really contains nbr_anom anomalies.
        sample_anom_idx = random.sample(range(len(sample)), nbr_anom)
        for idx in sample_anom_idx:
            # Random sign. The previous `*-1**random.randint(0, 1)` parsed as
            # `*-(1**k)` and therefore always produced negative spikes.
            sign = random.choice((-1, 1))
            # Magnitude lies between 1x and 2x the window's peak.
            sample[idx] = sign*(peak_value+random.random()*peak_value)
    test_contam_data.append(sample)
    test_anom_idx.append(sample_anom_idx)

Saving dataset

In [8]:
def delete_files_in_directory(directory_path):
    """Remove every file found under ``directory_path``, recursing into subfolders.

    Only files are removed; the folders themselves are left in place.
    Any exception raised along the way is caught and printed rather than
    propagated, so the function always returns ``None``.
    """
    try:
        # Lazily enumerate the full path of each file while walking the tree.
        doomed_paths = (
            os.path.join(parent, name)
            for parent, _, names in os.walk(directory_path)
            for name in names
        )
        for doomed in doomed_paths:
            os.remove(doomed)
        print(f"All files in '{directory_path}' have been deleted.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Empty both folders (files only, subfolders are kept) before new samples are written.
for path in ("data/aemo_npy_data/", "data/filtered/"):
    delete_files_in_directory(path)

All files in 'data/aemo_npy_data/' have been deleted.
All files in 'data/filtered/' have been deleted.


In [10]:
dataset_root = "data/aemo_npy_data"

def _save_split(split, samples, anomaly_indices):
    """Write one split as ``<dataset_root>/<split>/data/<i>.npy`` and ``.../gt/<i>.npy``.

    split: sub-folder name, e.g. "train" or "test".
    samples: iterable of windows, saved one file per window.
    anomaly_indices: iterable aligned with ``samples``; each item lists the
        anomalous positions of the matching window (empty when it is clean).
    """
    data_dir = os.path.join(dataset_root, split, "data")
    gt_dir = os.path.join(dataset_root, split, "gt")
    # np.save does not create missing folders, so make sure the targets exist.
    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(gt_dir, exist_ok=True)
    for i, (sample, anom_idx) in enumerate(zip(samples, anomaly_indices)):
        np.save(os.path.join(data_dir, str(i)), sample)
        # Force an integer dtype: an empty list would otherwise be stored as a
        # float64 array, which cannot be used for indexing once loaded back.
        np.save(os.path.join(gt_dir, str(i)), np.asarray(anom_idx, dtype=np.int64))

# for i, sample in enumerate(train_data): # clean train data
#     np.save(os.path.join(dataset_root, "train", "data", str(i)), sample)

_save_split("train", train_contam_data, train_anom_idx)  # contaminated train data
_save_split("test", test_contam_data, test_anom_idx)     # contaminated test data
