In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

In [6]:
def prep_data(dataset, config_ts):
    """Extract time indices, readings and anomaly positions from a dataset.

    The first row of ``dataset`` is skipped. Every remaining row is given the
    time index equal to its row position in the frame, so indices start at 1.

    Args:
        dataset: pandas DataFrame. Readings are taken from its first column and
            each value is converted with ``float()``.
        config_ts: mapping whose ``'annomalies'`` entry is an iterable of dicts.
            The ``'position'`` value of every dict that has that key is
            collected, in order.

    Returns:
        Tuple ``(t, t_unit, readings, idx_anomaly)`` where ``t`` is a numpy
        array holding ``1 .. len(dataset) - 1``, ``t_unit`` is the string
        ``'buisness_day'``, ``readings`` is a float numpy array of the same
        length as ``t`` and ``idx_anomaly`` is a list of anomaly positions.
    """
    idx_anomaly = [
        entry['position']
        for entry in config_ts['annomalies']
        if 'position' in entry
    ]
    t_unit = 'buisness_day'

    # Read scalars from the column instead of calling float() on a whole row
    # Series: pandas deprecated float(Series) for single-element Series.
    column = dataset.iloc[1:, 0]
    readings = np.asarray([float(value) for value in column], dtype=float)

    # Row k of the frame (k >= 1) carries time index k.
    t = np.arange(1, len(readings) + 1)

    return t, t_unit, readings, idx_anomaly

In [7]:
def _anomalies_in_test(idx_anomaly, idx_split):
    """Translate anomaly positions into the coordinates of the test segment(s)."""
    positions = np.asarray(idx_anomaly)
    if idx_split[0] == 0:
        shifted = positions - idx_split[1]
        # Non-positive values fall inside the training range.
        return shifted[shifted > 0]
    # Boolean masks work for zero, one or many matches, whereas indexing the
    # result of np.squeeze(np.argwhere(...)) fails for zero or one match.
    before = positions[positions <= idx_split[0]]
    after = positions[positions > idx_split[0]] - idx_split[1]
    return [before[before > 0], after[after > 0]]


def process_and_save_specified_dataset(config_ts, dataset, idx_split, y_scale=5, save_file=False):
    """Normalise a time series, split it into train/test parts and plot it.

    The series is obtained from ``prep_data(dataset, config_ts)`` and is
    normalised with the mean and standard deviation of the training slice
    ``readings[idx_split[0]:idx_split[1]]``.

    Args:
        config_ts: configuration mapping forwarded to ``prep_data``.
        dataset: DataFrame forwarded to ``prep_data``.
        idx_split: two-element sequence ``(start, stop)`` delimiting the
            training slice. If ``start == 0`` the test set is everything from
            ``stop`` onwards; otherwise the test set is a two-element list
            holding the part before ``start`` and the part from ``stop`` on.
        y_scale: half-height of the plotted y-range; also the extent of the
            vertical marker lines.
        save_file: currently not used by this function; nothing is written to
            disk.

    Returns:
        Tuple ``(t, readings_normalised, data)``. ``data`` is a dict with the
        keys ``t``, ``t_unit``, ``readings``, ``idx_anomaly``, ``idx_split``,
        ``training``, ``test``, ``train_m``, ``train_std``, ``t_train``,
        ``t_test`` and ``idx_anomaly_test``. ``idx_anomaly_test`` holds only
        positive indices: a single array when ``idx_split[0] == 0``, otherwise
        a list of two arrays parallel to ``test`` and ``t_test``.
    """
    t, t_unit, readings, idx_anomaly = prep_data(dataset, config_ts)
    start, stop = idx_split[0], idx_split[1]

    # Statistics come from the un-normalised training slice only.
    t_train = t[start:stop]
    train_m = np.mean(readings[start:stop])
    train_std = np.std(readings[start:stop])
    print("\nTraining set mean is {}".format(train_m))
    print("Training set std is {}".format(train_std))
    readings_normalised = (readings - train_m) / train_std

    training = readings_normalised[start:stop]
    if start == 0:
        test = readings_normalised[stop:]
        t_test = t[stop:] - stop
    else:
        test = [readings_normalised[:start], readings_normalised[stop:]]
        t_test = [t[:start], t[stop:] - stop]
    idx_anomaly_test = _anomalies_in_test(idx_anomaly, idx_split)
    print("Anomaly indices in the test set are {}".format(idx_anomaly_test))

    data = {
        't': t,
        't_unit': t_unit,
        'readings': readings,
        'idx_anomaly': idx_anomaly,
        'idx_split': idx_split,
        'training': training,
        'test': test,
        'train_m': train_m,
        'train_std': train_std,
        't_train': t_train,
        't_test': t_test,
        'idx_anomaly_test': idx_anomaly_test,
    }

    # plot the whole normalised sequence
    fig, axs = plt.subplots(1, 1, figsize=(18, 4), edgecolor='k')
    fig.subplots_adjust(hspace=.4, wspace=.4)
    axs.plot(t, readings_normalised, label='data')

    # Labels are attached to the lines themselves (first of each kind only) so
    # the legend stays correct when two split markers are drawn.
    y_span = np.linspace(-y_scale, y_scale, 20)
    split_marks = [stop] if start == 0 else [start, stop]
    for k, position in enumerate(split_marks):
        axs.plot(position * np.ones(20), y_span, 'b--',
                 label='train test set split' if k == 0 else '_nolegend_')
    for k, position in enumerate(idx_anomaly):
        axs.plot(position * np.ones(20), y_span, 'r--',
                 label='anomalies' if k == 0 else '_nolegend_')

    axs.grid(True)
    axs.set_xlim(0, len(t))
    axs.set_ylim(-y_scale, y_scale)
    axs.set_xlabel("timestamp (every {})".format(t_unit))
    axs.set_ylabel("normalised readings")
    # NOTE(review): `dataset` is the DataFrame handed to prep_data, so its
    # repr ends up in the title; confirm whether a dataset name was intended.
    axs.set_title("{} dataset\n(normalised by train mean {:.2f} and std {:.2f})".format(dataset, train_m, train_std))
    axs.legend()

    return t, readings_normalised, data