In [10]:
import requests
import os
from pathlib import Path
import pickle
from shutil import unpack_archive
import numpy as np
import io
import matplotlib.pylab as plt
from matplotlib.pyplot import plot, ion, show, savefig, cla, figure

In [2]:
# Raw files to fetch, grouped by dataset name; each name becomes a sub-folder
# under ``datasets/`` when the files are downloaded.
urls = {
    'ecg': [
        'http://www.cs.ucr.edu/~eamonn/discords/ECG_data.zip',
        'http://www.cs.ucr.edu/~eamonn/discords/mitdbx_mitdbx_108.txt',
        'http://www.cs.ucr.edu/~eamonn/discords/qtdbsele0606.txt',
        'http://www.cs.ucr.edu/~eamonn/discords/chfdbchf15.txt',
        'http://www.cs.ucr.edu/~eamonn/discords/qtdbsel102.txt',
    ],
    'gesture': [
        'http://www.cs.ucr.edu/~eamonn/discords/ann_gun_CentroidA',
    ],
    'space_shuttle': [
        'http://www.cs.ucr.edu/~eamonn/discords/TEK16.txt',
        'http://www.cs.ucr.edu/~eamonn/discords/TEK17.txt',
        'http://www.cs.ucr.edu/~eamonn/discords/TEK14.txt',
    ],
    'respiration': [
        'http://www.cs.ucr.edu/~eamonn/discords/nprs44.txt',
        'http://www.cs.ucr.edu/~eamonn/discords/nprs43.txt',
    ],
    'power_demand': [
        'http://www.cs.ucr.edu/~eamonn/discords/power_data.txt',
    ],
}


In [3]:
def download_dataset(urls):
    """Download every URL in ``urls`` into ``datasets/<dataset name>/raw``.

    Parameters
    ----------
    urls : dict
        Maps a dataset name to a list of URLs. Each file is stored under its
        URL basename; a basename without any suffix is stored with ``.txt``
        appended. ``.zip`` downloads are additionally unpacked into the same
        ``raw`` directory.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, so that an error page is
        never written to disk as if it were data.
    requests.RequestException
        For connection problems or when the request times out.
    """
    for dataname, dataset_urls in urls.items():
        raw_dir = Path('datasets', dataname, 'raw')
        raw_dir.mkdir(parents=True, exist_ok=True)
        for url in dataset_urls:
            target = raw_dir.joinpath(Path(url).name)
            # Decide the final name up front and write to it directly: renaming
            # after the write fails on platforms where rename() refuses to
            # overwrite a file left by an earlier download.
            if target.suffix == '':
                target = target.with_suffix('.txt')
            print('Downloading', url)
            resp = requests.get(url, timeout=60)
            resp.raise_for_status()
            target.write_bytes(resp.content)
            # Report the path that was really written (a .zip stays a .zip).
            print('Saving to', target)
            if target.suffix == '.zip':
                print('Extracting to', raw_dir)
                unpack_archive(str(target), extract_dir=str(raw_dir))


In [4]:
# Uncomment to fetch the raw files into ./datasets (needs network access; only required once).
#download_dataset(urls)

In [5]:
# Line-by-line loader for the raw .txt sequences of one dataset.

# Known anomaly windows per raw file as exclusive (low, high) bounds on the row
# index: row i is labelled anomalous when low < i < high for any listed window.
_ANOMALY_WINDOWS_ORI = {
    'chfdbchf15.txt': [(2250, 2400)],
    'xmitdb_x108_0.txt': [(4020, 4400)],
    'mitdb__100_180.txt': [(1800, 1990)],
    'chfdb_chf01_275.txt': [(2330, 2600)],
    'ltstdb_20221_43.txt': [(650, 780)],
    'ltstdb_20321_240.txt': [(710, 850)],
    'chfdb_chf13_45590.txt': [(2800, 2960)],
    'stdb_308_0.txt': [(2290, 2550)],
    'qtdbsel102.txt': [(4230, 4430)],
    'ann_gun_CentroidA.txt': [(2070, 2810)],
    'TEK16.txt': [(4270, 4370)],
    'TEK17.txt': [(2100, 2145)],
    'TEK14.txt': [(1100, 1200), (1455, 1955)],
    'nprs44.txt': [(16192, 16638), (20457, 20911)],
    'nprs43.txt': [(12929, 13432), (14877, 15086), (15729, 15924)],
    'power_data.txt': [(8254, 8998), (11348, 12143), (33883, 34601)],
}


def load_data_ori(dataset, dataset_folder='datasets'):
    """Parse every ``*.txt`` file of ``dataset`` line by line.

    Parameters
    ----------
    dataset : str
        Dataset name; files are read from ``<dataset_folder>/<dataset>/raw``.
    dataset_folder : str
        Root folder holding the datasets.

    Returns
    -------
    tuple of dict
        ``(t, t_unit, idx_anomaly, readings)``, each keyed by file name:
        ``t`` is ``0..n_rows-1``, ``t_unit`` is the string ``'time_unit'``,
        ``idx_anomaly`` holds the indices of rows inside a known anomaly
        window (empty for files without a window) and ``readings`` is the
        2-D array of parsed values, one row per non-blank line.

    Raises
    ------
    ValueError
        If a token in a file cannot be parsed as a float.
    """
    raw_dir = Path(dataset_folder, dataset, 'raw')
    # For the 'ecg' dataset the first value of every line is discarded
    # (presumably the time-step channel -- confirm against the raw files).
    drop_first_column = raw_dir.parent.name == 'ecg'
    readings = {}
    idx_anomaly = {}
    t = {}
    t_unit = {}
    for filepath in raw_dir.glob('*.txt'):
        name = filepath.name
        windows = _ANOMALY_WINDOWS_ORI.get(name, ())
        rows = []
        anomalous_rows = []
        with open(str(filepath)) as f:
            for line in f:
                tokens = [float(token) for token in line.split()]
                if not tokens:
                    # Blank line (typically a trailing newline): nothing to store.
                    continue
                if drop_first_column:
                    tokens.pop(0)
                # Index by stored row so anomaly indices always address `readings`.
                i = len(rows)
                if any(low < i < high for low, high in windows):
                    anomalous_rows.append(i)
                rows.append(tokens)
        print("readings of", name, "shape:", np.shape(rows))
        readings[name] = np.asarray(rows)
        idx_anomaly[name] = np.asarray(anomalous_rows, dtype=int)
        t_unit[name] = 'time_unit'
        t[name] = np.arange(readings[name].shape[0])
    return t, t_unit, idx_anomaly, readings
#     if dataset == 'ambient_temp':
#         data_file = os.path.join(csv_folder, 'ambient_temperature_system_failure.csv')
#         anomalies = ['2013-12-22 20:00:00', '2014-04-13 09:00:00']
#         t_unit = 'hour'
#     elif dataset == 'cpu_utilization':
#         data_file = os.path.join(csv_folder, 'cpu_utilization_asg_misconfiguration.csv')
#         anomalies = ['2014-07-12 02:04:00', '2014-07-14 21:44:00']
#         t_unit = '5 min'
#     elif dataset == 'ec2_request':
#         data_file = os.path.join(csv_folder, 'ec2_request_latency_system_failure.csv')
#         anomalies = ['2014-03-14 09:06:00', '2014-03-18 22:41:00', '2014-03-21 03:01:00']
#         t_unit = '5 min'
#     elif dataset == 'machine_temp':
#         data_file = os.path.join(csv_folder, 'machine_temperature_system_failure.csv')
#         anomalies = ['2013-12-11 06:00:00', '2013-12-16 17:25:00', '2014-01-28 13:55:00', '2014-02-08 14:30:00']
#         t_unit = '5 min'
#     elif dataset == 'rogue_agent_key_hold':
#         data_file = os.path.join(csv_folder, 'rogue_agent_key_hold.csv')
#         anomalies = ['2014-07-15 08:30:00', '2014-07-17 09:50:00']
#         t_unit = '5 min'
#     elif dataset == 'rogue_agent_key_updown':
#         data_file = os.path.join(csv_folder, 'rogue_agent_key_updown.csv')
#         anomalies = ['2014-07-15 04:00:00', '2014-07-17 08:50:00']
#         t_unit = '5 min'
#     elif dataset == 'nyc_taxi':
#         data_file = os.path.join(csv_folder, 'nyc_taxi.csv')
#         anomalies = ['2014-11-01 19:00:00', '2014-11-27 15:30:00', '2014-12-25 15:00:00', '2015-01-01 01:00:00', 
#                      '2015-01-27 00:00:00']
#         t_unit = '30 min'
    
#     t = []
#     readings = []
#     idx_anomaly = []
#     i = 0
#     with open(data_file) as csvfile:
#         readCSV = csv.reader(csvfile, delimiter=',')
#         print("\n--> Anomalies occur at:")
#         for row in readCSV:
#             if i > 0:
#                 t.append(i)
#                 readings.append(float(row[1]))
#                 for j in range(len(anomalies)):
#                     if row[0] == anomalies[j]:
#                         idx_anomaly.append(i)
#                         print("  timestamp #{}: {}".format(j, row[0]))
#             i = i + 1
#     t = np.asarray(t)
#     readings = np.asarray(readings)
#     print("\nOriginal csv file contains {} timestamps.".format(t.shape))
#     print("Processed time series contain {} readings.".format(readings.shape))
#     print("Anomaly indices are {}".format(idx_anomaly))
    
#     return t, t_unit, readings, idx_anomaly

In [25]:
# Known anomaly windows per raw file as exclusive (low, high) bounds on the row
# index: the anomalous indices of one window are low+1 .. high-1.
_ANOMALY_WINDOWS = {
    'chfdbchf15.txt': [(2250, 2400)],
    'xmitdb_x108_0.txt': [(4020, 4400)],
    'mitdb__100_180.txt': [(1800, 1990)],
    'chfdb_chf01_275.txt': [(2330, 2600)],
    'ltstdb_20221_43.txt': [(650, 780)],
    'ltstdb_20321_240.txt': [(710, 850)],
    'chfdb_chf13_45590.txt': [(2800, 2960)],
    'stdb_308_0.txt': [(2290, 2550)],
    'qtdbsel102.txt': [(4230, 4430)],
    'ann_gun_CentroidA.txt': [(2070, 2810)],
    'TEK16.txt': [(4270, 4370)],
    'TEK17.txt': [(2100, 2145)],
    'TEK14.txt': [(1100, 1200), (1455, 1955)],
    'nprs44.txt': [(16192, 16638), (20457, 20911)],
    'nprs43.txt': [(12929, 13432), (14877, 15086), (15729, 15924)],
    'power_data.txt': [(8254, 8998), (11348, 12143), (33883, 34601)],
}


def load_data(dataset, dataset_folder='datasets'):
    """Load every ``*.txt`` file of ``dataset`` with ``np.loadtxt``.

    Parameters
    ----------
    dataset : str
        Dataset name; files are read from ``<dataset_folder>/<dataset>/raw``.
    dataset_folder : str
        Root folder holding the datasets.

    Returns
    -------
    tuple of dict
        ``(t, t_unit, idx_anomaly, readings)``, each keyed by file name:
        ``t`` is ``0..n_rows-1``, ``t_unit`` is the string ``'time_unit'``,
        ``idx_anomaly`` is the array of known anomalous row indices (empty for
        files without a registered window; not clipped to the file length) and
        ``readings`` is the loaded array.
    """
    raw_dir = Path(dataset_folder, dataset, 'raw')
    readings = {}
    idx_anomaly = {}
    t = {}
    t_unit = {}
    for filepath in raw_dir.glob('*.txt'):
        name = filepath.name
        series = np.loadtxt(str(filepath))
        if raw_dir.parent.name == 'ecg' and series.ndim == 2:
            # Drop the first COLUMN, matching load_data_ori, which discards the
            # first token of every line (presumably the time-step channel --
            # confirm). Slicing rows here would shift every anomaly index by one.
            series = series[:, 1:]
        print(series.shape)

        # Build the index array once per file; files without a registered
        # window get an empty array instead of a missing key.
        windows = _ANOMALY_WINDOWS.get(name, [])
        if windows:
            anomalies = np.concatenate([np.arange(low + 1, high) for low, high in windows])
        else:
            anomalies = np.array([], dtype=int)

        readings[name] = np.asarray(series)
        idx_anomaly[name] = anomalies
        t_unit[name] = 'time_unit'
        t[name] = np.arange(readings[name].shape[0])
    return t, t_unit, idx_anomaly, readings

In [24]:
# Sanity check: the three power_data.txt anomaly windows joined into one index array.
np.concatenate([np.arange(8255, 8998), np.arange(11349, 12143), np.arange(33884, 34601)])

array([ 8255,  8256,  8257, ..., 34598, 34599, 34600])

In [7]:
# Smoke-test the loader on the space-shuttle files; the cell echoes the
# returned (t, t_unit, idx_anomaly, readings) tuple.
load_data(dataset='space_shuttle')


(5000,)
(5000,)
(5000,)


({'TEK14.txt': array([   0,    1,    2, ..., 4997, 4998, 4999]),
  'TEK16.txt': array([   0,    1,    2, ..., 4997, 4998, 4999]),
  'TEK17.txt': array([   0,    1,    2, ..., 4997, 4998, 4999])},
 {'TEK14.txt': 'time_unit',
  'TEK16.txt': 'time_unit',
  'TEK17.txt': 'time_unit'},
 {'TEK14.txt': array([1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 1110, 1111,
         1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 1120, 1121, 1122,
         1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133,
         1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144,
         1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155,
         1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166,
         1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177,
         1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188,
         1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199,
         1456, 1457, 1458, 1459

In [39]:
# This function plots a dataset with the train/test split and known anomalies
# Relies on helper function load_data()

# Per-file row ranges as (train [start, stop], test [start, stop]).
# A stop of None means "up to the end of the series".
_TRAIN_TEST_SPLITS = {
    'chfdb_chf13_45590.txt': ([0, 2439], [2439, 3726]),
    'chfdb_chf01_275.txt': ([0, 1833], [1833, 3674]),
    'chfdbchf15.txt': ([3381, 14244], [33, 3381]),
    'qtdbsel102.txt': ([10093, 44828], [211, 10093]),
    'mitdb__100_180.txt': ([2328, 5271], [73, 2328]),
    'stdb_308_0.txt': ([2986, 5359], [265, 2986]),
    'ltstdb_20321_240.txt': ([1520, 3531], [73, 1520]),
    'xmitdb_x108_0.txt': ([424, 3576], [3576, 5332]),
    'ltstdb_20221_43.txt': ([1121, 3731], [0, 1121]),
    'ann_gun_CentroidA.txt': ([3000, None], [0, 3000]),
    'nprs44.txt': ([363, 12955], [12955, 24082]),
    'nprs43.txt': ([4285, 10498], [10498, 17909]),
    'power_data.txt': ([15287, 33432], [501, 15287]),
    'TEK17.txt': ([2469, 4588], [1543, 2469]),
    'TEK16.txt': ([521, 3588], [3588, 4539]),
    'TEK14.txt': ([2089, 4098], [97, 2089]),
}


def process_and_save_specified_dataset(dataset, y_scale=5, save_file=False):
    """Normalise, optionally save, and plot every sequence of ``dataset``.

    Each file returned by ``load_data(dataset)`` is normalised with the mean
    and standard deviation of its training rows, split into training and test
    parts, and drawn in its own figure with the split boundaries (blue) and
    the known anomalies (red). Files without an entry in
    ``_TRAIN_TEST_SPLITS`` are reported and skipped.

    Parameters
    ----------
    dataset : str
        Dataset name passed to ``load_data``.
    y_scale : float
        Half-height of the plotted y-range; also the extent of the vertical
        marker lines.
    save_file : bool
        When true, write one ``.npz`` per file to
        ``./datasets/<dataset>/processed/``.

    Returns
    -------
    dict
        File name -> normalised readings (the full sequence, not only the
        training or test part).
    """
    t, t_unit, all_idx_anomaly, all_readings = load_data(dataset)
    all_readings_normalised = {}
    for key, readings in all_readings.items():
        print(readings.shape)
        idx_anomaly = all_idx_anomaly[key]

        # split into training and test sets
        if key not in _TRAIN_TEST_SPLITS:
            # Without a split there are no training rows to normalise by.
            print("\nNo train/test split is defined for {}; skipping it.".format(key))
            continue
        train_bounds, test_bounds = _TRAIN_TEST_SPLITS[key]
        train_stop = len(readings) if train_bounds[1] is None else train_bounds[1]
        idx_train = [train_bounds[0], train_stop]
        idx_test = list(test_bounds)

        training = readings[idx_train[0]:idx_train[1]]
        # normalise by training mean and std
        train_m = np.mean(training, axis=0)
        train_std = np.std(training, axis=0)
        readings_normalised = (readings - train_m) / train_std
        print(key, train_m, train_m.shape, train_std, train_std.shape, readings_normalised[:20])

        training = readings_normalised[idx_train[0]:idx_train[1]]
        test = readings_normalised[idx_test[0]:idx_test[1]]
        # Anomaly positions relative to the start of the test slice. They are
        # not clipped, so anomalies outside the test window yield negative or
        # out-of-range values.
        idx_anomaly_test = idx_anomaly - idx_test[0]

        if save_file:
            save_dir = './datasets/{}/processed/'.format(dataset)
            os.makedirs(save_dir, exist_ok=True)
            # Swap only the extension; a plain str.replace would also rewrite
            # a 'txt' occurring elsewhere in the file name.
            save_path = save_dir + Path(key).with_suffix('.npz').name
            # Store this file's own time axis and unit, not the dicts that
            # cover every file of the dataset.
            np.savez(save_path, t=t[key], t_unit=t_unit[key], readings=readings,
                     idx_anomaly=idx_anomaly, training=training, test=test,
                     train_m=train_m, train_std=train_std,
                     idx_anomaly_test=idx_anomaly_test)
            print("\nProcessed time series are saved at {}".format(save_path))
        else:
            print("\nProcessed time series are not saved.")
        all_readings_normalised[key] = readings_normalised

        # plot the whole normalised sequence
        fig, axs = plt.subplots(1, 1, figsize=(18, 4), edgecolor='k')
        fig.subplots_adjust(hspace=.4, wspace=.4)
        data_lines = axs.plot(t[key], readings_normalised)
        if data_lines:
            # Label a single line so multi-channel data gets one legend entry.
            data_lines[0].set_label('data')
        # A split at row 0 coincides with the left edge, so only the other
        # boundary is drawn in that case.
        split_positions = idx_train[1:] if idx_train[0] == 0 else idx_train
        axs.vlines(split_positions, -y_scale, y_scale, colors='b', linestyles='--',
                   label='train test set split')
        if len(idx_anomaly) > 0:
            # One collection for all anomaly markers: explicit labels keep the
            # legend correct regardless of how many split lines were drawn.
            axs.vlines(idx_anomaly, -y_scale, y_scale, colors='r', linestyles='--',
                       label='anomalies')
        axs.grid(True)
        axs.set_xlim(0, len(t[key]))
        axs.set_ylim(-y_scale, y_scale)
        axs.set_xlabel("timestamp (every {})".format(t_unit[key]))
        axs.set_ylabel("normalised readings")
        axs.set_title("{} dataset\n(normalised by train mean {:.2f} and std {:.2f})".format(dataset, np.mean(train_m), np.mean(train_std)))
        axs.legend()

    return all_readings_normalised

In [1]:
# Bind the result so the cell shows only the figures and log lines instead of
# echoing every normalised array. Run the defining cells above first.
space_shuttle_normalised = process_and_save_specified_dataset('space_shuttle', save_file=False)

NameError: name 'process_and_save_specified_dataset' is not defined