In [None]:
import pandas as pd
import glob
from matplotlib import pyplot
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import warnings

from collections import Counter
from tqdm import tqdm
import os
import math

# RERUN

In [None]:
# Install a global filter that ignores every warning. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# function that finds the indexes of non-anomalies for interpolation 
def interpolation_indexes(mylist, mynumber):
    """Locate the indexes on either side of ``mynumber`` that are not in ``mylist``.

    Args:
        mylist: container of indexes to step over (the anomalous positions).
        mynumber: index around which the neighbours are searched.

    Returns:
        A ``(left_neighbour, right_neighbour)`` tuple. No bounds check is
        performed, so the left value may be negative and the right value may
        lie past the end of the data the indexes refer to.
    """

    def _search(step):
        # The adjacent index is taken directly when it is free.
        adjacent = mynumber + step
        if adjacent not in mylist:
            return adjacent
        # Otherwise walk away from mynumber (starting at mynumber itself)
        # until an index outside mylist is reached.
        cursor = mynumber
        while cursor in mylist:
            cursor += step
        return cursor

    return _search(-1), _search(1)

In [None]:
def lstm_model():
    """Build and compile the LSTM autoencoder.

    The input shape is read from axes 1 and 2 of the module-level ``X_train``,
    so ``X_train`` must be defined before this function is called.

    Layer stack: LSTM(16) -> LSTM(8) -> RepeatVector -> LSTM(8) -> LSTM(16)
    -> TimeDistributed(Dense).

    Returns:
        The Keras ``Model`` compiled with the Adam optimizer and MAE loss.
    """
    timesteps, n_features = X_train.shape[1], X_train.shape[2]

    inputs = Input(shape=(timesteps, n_features))

    # Encoder: the second LSTM returns only its last output (no sequences).
    encoded = LSTM(16, activation='relu', return_sequences=True,
                   kernel_regularizer=regularizers.l2(0.00))(inputs)
    encoded = LSTM(8, activation='relu', return_sequences=False)(encoded)

    # Decoder: repeat the encoded vector once per timestep and expand it again.
    decoded = RepeatVector(timesteps)(encoded)
    decoded = LSTM(8, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(16, activation='relu', return_sequences=True)(decoded)
    output = TimeDistributed(Dense(n_features))(decoded)

    autoencoder = Model(inputs=inputs, outputs=output)
    autoencoder.compile(optimizer='adam', loss='mae')
    return autoencoder

In [None]:
def train_anomaly_removal(df_train):
    """Return the training data with anomalous samples removed or interpolated.

    Rows are processed in order:

    * rows whose ``is_anomaly`` equals 0 are kept unchanged;
    * every run of consecutive anomalies (``is_anomaly == 1``) that is followed
      by a row not flagged as anomaly is collapsed into ONE row. That row
      carries the timestamp of the run's last sample, the label ``0.0`` and,
      as value, the mean of the values found just before and just after the
      run. When the run starts at the first row there is no value before it,
      so the value just after the run is used on its own;
    * a run of anomalies that reaches the end of the frame is dropped;
    * rows whose label is neither 0 nor 1 are dropped.

    Args:
        df_train: DataFrame with ``timestamp``, ``value`` and ``is_anomaly``
            columns. Rows are read by position, so any index is accepted.

    Returns:
        A new DataFrame with the columns of ``df_train`` and a fresh
        0..n-1 index.
    """
    timestamps = df_train['timestamp'].to_numpy()
    values = df_train['value'].to_numpy()
    flags = df_train['is_anomaly'].to_numpy()
    n_rows = len(df_train)

    # Rows are gathered in a list and turned into a frame once at the end;
    # DataFrame.append no longer exists in pandas >= 2.0 and growing a frame
    # row by row is quadratic.
    records = []
    before_run = None  # position of the most recent row NOT flagged as anomaly

    for pos in range(n_rows):
        flag = flags[pos]

        if flag == 1:
            # Only the last sample of a run (next row is not an anomaly)
            # produces an output row; the rest of the run is discarded.
            closes_run = pos + 1 < n_rows and flags[pos + 1] != 1
            if closes_run:
                right_value = values[pos + 1]
                if before_run is None:
                    # The run started at row 0: nothing to average with.
                    left_value = right_value
                else:
                    left_value = values[before_run]
                records.append({'timestamp': timestamps[pos],
                                'value': (left_value + right_value) / 2,
                                'is_anomaly': 0.0})
            continue

        before_run = pos
        if flag == 0:
            records.append({'timestamp': timestamps[pos],
                            'value': values[pos],
                            'is_anomaly': flag})

    return pd.DataFrame(records, columns=df_train.columns)

In [None]:
def threshold_computing_max(X_train):
    """Return the largest reconstruction error of the model on ``X_train``.

    Uses the module-level ``model`` to reconstruct ``X_train``, averages the
    absolute error along axis 1 and returns the maximum of those averages.

    Args:
        X_train: array passed to ``model.predict``
            (presumably shaped (samples, timesteps, features); confirm with caller).

    Returns:
        The maximum mean-absolute reconstruction error.
    """
    reconstruction = model.predict(X_train, verbose=0)
    absolute_error = np.abs(reconstruction - X_train)
    return np.max(np.mean(absolute_error, axis=1))

In [None]:
def reconstruction_loss_predictions(X_test):
    """Reconstruct ``X_test`` with the module-level ``model`` and score it.

    Args:
        X_test: array passed to ``model.predict``.

    Returns:
        A ``(mae_loss, X_test_pred)`` tuple: the absolute reconstruction error
        averaged along axis 1, and the reconstruction itself.
    """
    reconstruction = model.predict(X_test, verbose=0)
    absolute_error = np.abs(reconstruction - X_test)
    return np.mean(absolute_error, axis=1), reconstruction

In [None]:
def predicted_labels(mae_loss, threshold):
    """Turn reconstruction losses into binary anomaly labels.

    Args:
        mae_loss: sequence of per-sample losses; the first element of each
            entry (``entry[0]``) is the value compared with the threshold.
        threshold: a sample is labelled anomalous when its loss is strictly
            greater than this value.

    Returns:
        A list with one label per entry of ``mae_loss``: 1 for an anomaly,
        0 otherwise.
    """
    # The losses are read from the ``mae_loss`` argument; the previous version
    # ignored it and read a global ``test_mae_loss`` instead.
    return [1 if sample_loss[0] > threshold else 0 for sample_loss in mae_loss]

In [None]:
# Directories holding the KPI train and test CSV files. The main loop reads
# "<ts_name>.csv" from both directories, so file names must match across them.
# The paths are relative to the notebook's working directory.
path_files_kpi_train = '../../../Documents/phd_related/data_sets_concept_drift/anomaly_detection/kpi/train/'
path_files_kpi_test = '../../../Documents/phd_related/data_sets_concept_drift/anomaly_detection/kpi/test/'

## Extract all file names corresponding to time series

In [None]:
# One series name per CSV file in the training directory (file name without
# its extension). Entries that are not CSV files are skipped, because every
# name is later used to open "<name>.csv". The names are sorted because
# os.listdir returns entries in arbitrary order.
ts_names = sorted(
    os.path.splitext(entry)[0]
    for entry in os.listdir(path_files_kpi_train)
    if entry.endswith('.csv')
)

In [None]:
# Display how many time series were found.
len(ts_names)

In [None]:
# Accumulators for the per-series outputs of the experiment.
df_final_results_details = pd.DataFrame(columns = ['TS_name', 'lstmae_reconstruction_loss'])
df_final_results = pd.DataFrame(columns = ['TS_name', 'Labels_True', 'Labels_Pred', 'Test_Size', 'Model'])

scaler = MinMaxScaler()

# Number of test samples scored between two retrainings.
window = 2016

# Training hyper-parameters, identical for every retraining.
no_epochs = 50
batch_size = 128

# NOTE: helper functions defined earlier in the notebook read module-level
# names set inside this loop (e.g. `X_train`, `model`); do not rename them.
for ts_name in tqdm(ts_names):

    label_pred_complete = []
    losses_complete = []

    print(ts_name)
    # path to train/test
    filename_kpi_train = path_files_kpi_train+ts_name+".csv"
    filename_kpi_test = path_files_kpi_test+ts_name+".csv"

    # read train (columns whose name starts with "Unnamed" are discarded)
    init_train = pd.read_csv(filename_kpi_train)
    init_train = init_train.loc[:, ~init_train.columns.str.contains('^Unnamed')]

    # read test
    init_test = pd.read_csv(filename_kpi_test)
    init_test = init_test.loc[:, ~init_test.columns.str.contains('^Unnamed')]

    # Number of test chunks. Using ceil (instead of floor + 1) avoids an empty
    # final chunk when the test length is an exact multiple of `window`;
    # an empty chunk cannot be passed to the scaler.
    n_windows = math.ceil(len(init_test) / window)

    for i in tqdm(range(n_windows)):

        # training set = original train + every test chunk already scored
        df_kpi_train = pd.concat([init_train, init_test[0:window*i]], ignore_index=True)

        # current test chunk; slicing truncates the last chunk automatically
        df_kpi_test = init_test[(i*window):((i+1)*window)]

        # remove anomalies from train to prepare LSTM
        # (anomalous samples are dropped or replaced by interpolated values)
        df_train_kpi = train_anomaly_removal(df_kpi_train)

        # final training dataset + labels
        label_train = df_train_kpi.is_anomaly
        train_kpi = df_train_kpi.value

        # final testing dataset + labels
        label_test = df_kpi_test.is_anomaly
        test_kpi = df_kpi_test.value

        # Data preprocessing - Scaling
        # the scaler is fit on the training data and applied on the testing data
        train_kpi_scale = scaler.fit_transform(np.array(train_kpi).reshape(-1, 1))
        test_kpi_scale = scaler.transform(np.array(test_kpi).reshape(-1, 1))

        # Shape Train Data for LSTM: (samples, 1, 1)
        X_train = train_kpi_scale.reshape(train_kpi_scale.shape[0], 1, 1)

        # Train a fresh LSTM autoencoder to reconstruct its own input
        model = lstm_model()
        encdec = model.fit(X_train, X_train, epochs=no_epochs, batch_size=batch_size,
                           validation_split=0.25).history

        # Threshold computing
        threshold = threshold_computing_max(X_train)

        # Shape Test Data for LSTM: (samples, 1, 1)
        X_test = test_kpi_scale.reshape(test_kpi_scale.shape[0], 1, 1)

        test_mae_loss, X_test_pred = reconstruction_loss_predictions(X_test)

        # Extracting Predicted Labels
        y_label_pred = predicted_labels(test_mae_loss, threshold)

        label_pred_complete.append(y_label_pred)
        # NOTE(review): the value stored under 'lstmae_reconstruction_loss'
        # is the per-window threshold, not the test losses - confirm intended.
        losses_complete.append(threshold)

    # flatten the per-window label lists into one list for the whole series
    all_predicted_labels = [label for chunk in label_pred_complete for label in chunk]

    # Save Results
    # Save reconstruction Error for each Dataset
    df_results_details = pd.DataFrame({
        'TS_name': [ts_name],
        'lstmae_reconstruction_loss': [losses_complete],
        'retraining_technique': 'full_history',
        'retraining_window': window,
    })
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # It is done inside the loop so the accumulated frames stay up to date
    # if the loop is interrupted.
    df_final_results_details = pd.concat([df_final_results_details, df_results_details])

    # Save Predicted Labels
    df_results = pd.DataFrame({
        'TS_name': [ts_name],
        'retraining_technique': 'full_history',
        'retraining_window': window,
        'Labels_True': [list(init_test.is_anomaly)],
        'Labels_Pred': [all_predicted_labels],
        'Test_Size': len(init_test),
        'Model': 'LSTM_AE',
    })
    df_final_results = pd.concat([df_final_results, df_results])
    

In [None]:
def _with_position_and_name_index(frame):
    """Index ``frame`` by (row position, ``TS_name``).

    When ``TS_name`` is not among the columns (for instance because this cell
    has already been executed) the frame is returned unchanged, so running the
    cell again does not raise a ``KeyError``.
    """
    if 'TS_name' not in frame.columns:
        return frame
    return frame.set_index([pd.Index(np.arange(len(frame))), 'TS_name'])


df_final_results = _with_position_and_name_index(df_final_results)
df_final_results_details = _with_position_and_name_index(df_final_results_details)



In [None]:
# Make sure the output directory exists: to_csv does not create missing folders.
os.makedirs('./results', exist_ok=True)
df_final_results.to_csv('./results/df_results_lstmae_kpi_fh_2016.csv')

In [None]:
# Make sure the output directory exists: to_csv does not create missing folders.
os.makedirs('./results', exist_ok=True)
df_final_results_details.to_csv('./results/df_results_details_lstmae_kpi_fh_2016.csv')