In [1]:
import pandas as pd
import glob
from matplotlib import pyplot
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import warnings

from collections import Counter
from tqdm import tqdm
import os
import math

In [2]:
# Suppress every Python warning for the remainder of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/keras.
warnings.filterwarnings("ignore")

In [3]:
# function that finds the indexes of non-anomalies for interpolation 
def interpolation_indexes(mylist, mynumber):
    """Find the closest indexes on each side of `mynumber` that are not in `mylist`.

    Parameters
    ----------
    mylist : container of int
        Indexes to skip over (the anomalous positions).
    mynumber : int
        Index whose neighbours are wanted.

    Returns
    -------
    tuple
        ``(left_neighbour, right_neighbour)``. No bounds are applied: the left
        value can be ``-1`` and the right value can run past the end of the
        data, so the caller has to validate both.
    """

    def _nearest_free(step):
        # The immediate neighbour wins when it is not listed.
        adjacent = mynumber + step
        if adjacent not in mylist:
            return adjacent
        # Otherwise walk away from `mynumber` (starting at `mynumber` itself)
        # until an index that is not listed is reached.
        cursor = mynumber
        while cursor in mylist:
            cursor = cursor + step
        return cursor

    return _nearest_free(-1), _nearest_free(1)

In [4]:
def lstm_model():
    """Build and compile the LSTM encoder-decoder used for reconstruction.

    The input shape is taken from the module-level ``X_train`` array (axes 1
    and 2), so ``X_train`` must be assigned before this function is called.

    Returns
    -------
    keras.models.Model
        Model compiled with the Adam optimizer and MAE loss.
    """
    timesteps = X_train.shape[1]
    n_features = X_train.shape[2]

    inputs = Input(shape=(timesteps, n_features))

    # Encoder: 16 -> 8 units; the second layer returns only its last output.
    encoded = LSTM(16, activation='relu', return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.00))(inputs)
    encoded = LSTM(8, activation='relu', return_sequences=False)(encoded)

    # Repeat the encoding once per time step so the decoder sees a sequence.
    decoded = RepeatVector(timesteps)(encoded)

    # Decoder mirrors the encoder: 8 -> 16 units.
    decoded = LSTM(8, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(16, activation='relu', return_sequences=True)(decoded)

    # One dense projection per time step back to the input feature count.
    output = TimeDistributed(Dense(n_features))(decoded)

    autoencoder = Model(inputs=inputs, outputs=output)
    autoencoder.compile(optimizer='adam', loss='mae')
    return autoencoder

In [5]:
def train_anomaly_removal(df_train):
    """Return the training data with anomalous samples dropped or interpolated.

    Rows are processed by position, in order:

    * a row with ``is_anomaly == 0`` is copied unchanged;
    * a row with ``is_anomaly == 1`` that closes a run of anomalies (the next
      row is not flagged 1), and is neither the first nor the last row, is
      replaced by the mean of the nearest earlier row not flagged 1 and the
      row right after it, and is relabelled ``0.0``;
    * every other anomalous row is dropped. That covers earlier members of an
      anomaly run, an anomaly in the first or last position, and a run that
      starts at the very first row (no earlier clean sample exists).

    The rows are collected in a list and the frame is built once, instead of
    calling ``DataFrame.append`` per row (removed in pandas 2.0, and
    quadratic).

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the columns ``timestamp``, ``value`` and ``is_anomaly``.
        Rows are addressed by position, which matches label access on a
        default ``RangeIndex``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns of ``df_train`` and a fresh 0..n-1 index.
        Columns other than the three above are filled with NaN.
    """
    flags = df_train['is_anomaly'].to_numpy()
    stamps = df_train['timestamp'].to_numpy()
    values = df_train['value'].to_numpy()
    n_rows = len(df_train)

    records = []
    # Position of the closest earlier row that is not flagged as 1
    # (-1 while no such row has been seen yet).
    last_clean = -1

    for i in range(n_rows):
        flag = flags[i]

        if flag == 0:
            # keep every non-anomalous sample as it is
            records.append({'timestamp': stamps[i], 'value': values[i], 'is_anomaly': flag})
        elif (flag == 1 and 0 < i < n_rows - 1
              and flags[i + 1] != 1 and last_clean != -1):
            # Last anomaly of a run: the right neighbour is simply i + 1, the
            # left neighbour is the clean row seen before the run started.
            value_interpolation = (values[last_clean] + values[i + 1]) / 2
            records.append({'timestamp': stamps[i], 'value': value_interpolation, 'is_anomaly': 0.0})

        if flag != 1:
            last_clean = i

    return pd.DataFrame(records, columns=df_train.columns)

In [6]:
def threshold_computing_max(X_train):
    """Return the largest reconstruction error observed on the training data.

    The module-level ``model`` reconstructs ``X_train``; the absolute error is
    averaged over axis 1 and the maximum of those averages is returned.
    """
    reconstruction = model.predict(X_train, verbose=0)
    absolute_error = np.abs(reconstruction - X_train)
    return np.max(np.mean(absolute_error, axis=1))

In [7]:
def reconstruction_loss_predictions(X_test):
    """Reconstruct ``X_test`` with the module-level ``model``.

    Returns
    -------
    tuple
        ``(mae_loss, X_test_pred)``: the absolute reconstruction error
        averaged over axis 1, and the raw reconstruction.
    """
    reconstruction = model.predict(X_test, verbose=0)
    absolute_error = np.abs(reconstruction - X_test)
    return np.mean(absolute_error, axis=1), reconstruction

In [8]:
def predicted_labels(mae_loss, threshold):
    """Turn per-sample reconstruction losses into binary anomaly labels.

    Parameters
    ----------
    mae_loss : sequence of sequences
        One row per sample; only the first entry of each row is compared.
    threshold : float
        A sample is labelled anomalous when its loss is strictly greater
        than this value.

    Returns
    -------
    list of int
        ``1`` for every row whose first entry exceeds ``threshold``, else ``0``.
    """
    # Read the `mae_loss` argument; the earlier version ignored it and used
    # the global `test_mae_loss` instead.
    return [1 if row[0] > threshold else 0 for row in mae_loss]

In [9]:
# Directory that holds one CSV file per time series. Exactly one dataset is
# active at a time; the other assignment stays commented out.
# The value is concatenated directly with a file name later on, so it has to
# end with a path separator.
# Yahoo
#path_files = '../../../../lorenapoenaru-olaru/Documents/phd_related/data_sets_concept_drift/anomaly_detection/Yahoo_A1Benchmark/'
# NAB
path_files = '../../../../lorenapoenaru-olaru/Documents/phd_related/data_sets_concept_drift/anomaly_detection/NAB/'

## Extract all file names corresponding to time series

In [10]:
# Every entry of the data directory is treated as one time-series file.
ts_names = list(os.listdir(path_files))

In [11]:
# Table with the retraining batches of each time series
# (switch the file together with the dataset).
# Yahoo
#retraining_batches = pd.read_csv('./yahoo_retraining_batches.csv')
# NAB
retraining_batches = pd.read_csv('./nab_retraining_batches.csv')

# Discard the unnamed index columns a previous to_csv call left in the file.
is_unnamed_column = retraining_batches.columns.str.contains('^Unnamed')
retraining_batches = retraining_batches.loc[:, ~is_unnamed_column]


In [12]:
# Result accumulators: one row is added per (time series, repetition).
df_final_results_details = pd.DataFrame(columns = ['TS_name', 'lstmae_reconstruction_loss'])
df_final_results = pd.DataFrame(columns = ['TS_name', 'Labels_True', 'Labels_Pred', 'Test_Size', 'Model'])

scaler = MinMaxScaler()

# Number of samples per test batch (and per step of the sliding training window).
# for daily retraining NAB
#window_retraining = 288

# for weekly retraining NAB
window_retraining = 2016

# for weekly retraining Yahoo
#window_retraining = 168


for ts_name in tqdm(ts_names):

    # --- retraining batches of this series --------------------------------
    # The cell holds a stringified list such as "[0, 1, 2]"; parse it into ints.
    str_retraining_batches = list(retraining_batches[retraining_batches.TS_name == ts_name].Retraining_Batches)[0]
    list_retraining_batches = []
    for token in str_retraining_batches.split(','):
        token = token.replace('[', '').replace(']', '').strip()
        if token:
            list_retraining_batches.append(int(token))
        else:
            # NOTE(review): an empty entry (e.g. the string "[]") is stored as
            # batch 0, as before -- confirm this is the intended meaning.
            list_retraining_batches.append(0)
    print(list_retraining_batches)

    # --- load the series once; the repetitions only slice it --------------
    filename_nab = path_files+ts_name
    df_nab = pd.read_csv(filename_nab)

    # first half is the initial training set, second half is the test stream
    split_position = math.floor(len(df_nab)/2)
    init_train = df_nab[0:split_position]
    init_test = df_nab[split_position:]

    # the experiment is repeated 5 times per series
    for n in range(0, 5):

        label_pred_complete = []
        losses_complete = []

        for i in tqdm(range(0, (math.floor(len(init_test)/window_retraining)+1))):

            if(i==0):
                df_train = init_train
                df_test = init_test[(i*window_retraining):((i+1)*window_retraining)]

            # When the test stream length is a multiple of the window there is
            # one trailing empty batch; stop before doing any expensive work.
            if(df_test.empty):
                break

            # remove anomalies from train to prepare LSTM
            # all anomalies are replaced by the interpolation of their closest non-anomalous neighbours
            df_train_preprocess = train_anomaly_removal(df_train)

            # final training dataset + labels
            label_train = df_train_preprocess.is_anomaly
            train = df_train_preprocess.value

            # final testing dataset + labels
            label_test = df_test.is_anomaly
            test = df_test.value

            # Data preprocessing - Scaling
            # the scaler is fit on the training data and applied on the testing data
            train_scale = scaler.fit_transform(np.array(train).reshape(-1, 1))
            test_nab_scale = scaler.transform(np.array(test).reshape(-1,1))

            # Shape Train Data for LSTM: (samples, 1 time step, 1 feature).
            # `X_train`, `model` and `test_mae_loss` must keep these exact
            # names: the helper functions read them as globals.
            X_train = train_scale.reshape(train_scale.shape[0], 1, 1)

            # Train LSTM (a fresh model for every batch)
            no_epochs = 50
            batch_size = 128
            model = lstm_model()
            encdec = model.fit(X_train, X_train, epochs=no_epochs, batch_size=batch_size,
                                validation_split=0.25).history

            # Threshold computing
            threshold = threshold_computing_max(X_train)

            # Shape Test Data for LSTM
            X_test = test_nab_scale.reshape(test_nab_scale.shape[0], 1, 1)

            test_mae_loss, X_test_pred = reconstruction_loss_predictions(X_test)

            # Extracting Predicted Labels
            y_label_pred = predicted_labels(test_mae_loss, threshold)

            label_pred_complete.append(y_label_pred)
            # despite the name, this list collects the threshold used per batch
            losses_complete.append(threshold)

            if((i in list_retraining_batches) and (i<(int(round(len(init_test)/window_retraining)-1)))):
                print('batch has drift')
                # Slide the training window forward by the batches seen so far.
                # This used the undefined name `window`, which raised a
                # NameError on the first batch flagged as drift.
                df_train = pd.concat([init_train[(i+1)*window_retraining:],
                                      init_test[0:(i+1)*window_retraining]], ignore_index=True)
            else:
                print('batch does not have drift')

            df_test = init_test[((i+1)*window_retraining):((i+2)*window_retraining)]

        # flatten the per-batch predictions into one label list
        all_predicted_labels = [label for batch_labels in label_pred_complete for label in batch_labels]

        # Save Results
        # The accumulators are extended inside the loop (not once at the end)
        # so partial results survive an interrupted run. pd.concat replaces
        # DataFrame.append, which was removed in pandas 2.0.

        # Save the threshold list for each Dataset
        df_results_details = pd.DataFrame({
            'TS_name': [ts_name],
            'lstmae_reconstruction_loss': [losses_complete],
            'retraining_technique': ['sliding_window'],
            'retraining_window': [window_retraining],
        })
        df_final_results_details = pd.concat([df_final_results_details, df_results_details])

        # Save Predicted Labels
        true_labels = list(init_test.is_anomaly)
        df_results = pd.DataFrame({
            'TS_name': [ts_name],
            'retraining_technique': ['sliding_window'],
            'retraining_window': [window_retraining],
            'Labels_True': [true_labels],
            'Labels_Pred': [all_predicted_labels],
            'Test_Size': [len(true_labels)],
            'Model': ['LSTM_AE'],
        })
        df_final_results = pd.concat([df_final_results, df_results])

  0%|          | 0/17 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

[0, 1, 2, 3, 5, 6]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:10<00:10, 10.33s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:13<00:13, 13.95s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:09<00:09,  9.63s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:13<00:13, 13.28s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:09<00:09,  9.42s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:13<00:13, 13.02s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:09<00:09,  9.68s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:13<00:13, 13.31s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:09<00:09,  9.72s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:13<00:13, 13.35s/it]
  6%|▌         | 1/17 [01:06<17:51, 66.98s/it]
  0%|          | 0/2 [00:00<?, ?it/s][A

[5, 6]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:09<00:09,  9.63s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:13<00:13, 13.63s/it]

  0%|          | 0/2 [00:00<?, ?it/s][A

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50



 50%|█████     | 1/2 [00:11<00:11, 11.41s/it][A

batch does not have drift


 50%|█████     | 1/2 [00:15<00:15, 15.57s/it]
  6%|▌         | 1/17 [01:36<25:39, 96.20s/it]


KeyboardInterrupt: 

In [13]:
# The result frames are built by stacking one-row frames, so every row carries
# index 0; replace that with a fresh 0..n-1 index and discard the old one.
df_final_results = df_final_results.reset_index(drop=True)
df_final_results_details = df_final_results_details.reset_index(drop=True)

In [14]:
# Display the collected true/predicted labels (one row per series and repetition).
df_final_results

Unnamed: 0,TS_name,Labels_True,Labels_Pred,Test_Size,Model,retraining_technique,retraining_window
0,ec2_cpu_utilization_825cc2.csv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2016,LSTM_AE,sliding_window,2016.0
1,ec2_cpu_utilization_825cc2.csv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2016,LSTM_AE,sliding_window,2016.0
2,ec2_cpu_utilization_825cc2.csv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2016,LSTM_AE,sliding_window,2016.0
3,ec2_cpu_utilization_825cc2.csv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2016,LSTM_AE,sliding_window,2016.0
4,ec2_cpu_utilization_825cc2.csv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2016,LSTM_AE,sliding_window,2016.0
5,ec2_network_in_257a54.csv,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2016,LSTM_AE,sliding_window,2016.0


In [None]:
#df_final_results.to_csv('./results/df_results_lstmae_fedd_sw.csv')