Copyright (C) Microsoft Corporation. All rights reserved.​

Microsoft Corporation ("Microsoft") grants you a nonexclusive, perpetual, royalty-free right to use, copy, and modify the software code provided by us ("Software Code"). You may not sublicense the Software Code or any use of it (except to your affiliates and to vendors to perform work on your behalf) through distribution, network access, service agreement, lease, rental, or otherwise. This license does not purport to express any claim of ownership over data you may have shared with Microsoft in the creation of the Software Code. Unless applicable law gives you more rights, Microsoft reserves all other rights not expressly granted herein, whether by implication, estoppel or otherwise.

THE SOFTWARE CODE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL MICROSOFT OR ITS LICENSORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

In [3]:
import sys
sys.path.insert(2, '../utils/')

In [4]:
from net import *
from utils import *
import tqdm
import torch
import math
import os
import time
import json
import pandas as pd
import numpy as np
from msanomalydetector.util import average_filter
from competition_metric import evaluate_for_all_series

# Functions

## Generate Data

In [6]:
class gen():
    def __init__(self, win_siz, step, nums):
        self.control = 0
        self.win_siz = win_siz
        self.step = step
        self.number = nums

    def generate_train_data(self, value, back_k=5):
        def normalize(a):
            amin = np.min(a)
            amax = np.max(a)
            a = (a - amin) / (amax - amin + 1e-5)
            return 3 * a

        if back_k <= 5:
            back = back_k
        else:
            back = 5
        length = len(value)
        tmp = []
        for pt in range(self.win_siz, length - back, self.step):
            head = max(0, pt - self.win_siz)
            tail = min(length - back, pt)
            data = np.array(value[head:tail])
            data = data.astype(np.float64)

            # print('Initial', data)
            data = normalize(data)
            # print('Normalized', data)
            num = np.random.randint(1, self.number)
            ids = np.random.choice(self.win_siz, num, replace=False)
            lbs = np.zeros(self.win_siz, dtype=np.int64)
            if (self.win_siz - 6) not in ids:
                self.control += np.random.random()
            else:
                self.control = 0
            if self.control > 100:
                ids[0] = self.win_siz - 6
                self.control = 0
            mean = np.mean(data)
            dataavg = average_filter(data)
            var = np.var(data)
            for id in ids:
                data[id] += (dataavg[id] + mean) * np.random.randn() * min((1 + var), 10)
                lbs[id] = 1
            tmp.append([data.tolist(), lbs.tolist()])
        return tmp


def auto(dic):
    path_auto = os.getcwd() + '/auto.json'
    auto = {}
    for item, value in dic:
        if value != None:
            auto[item] = value
    with open(path_auto, 'w+') as f:
        json.dump(auto, f)


def get_path(dataset_path, t):
    files_path = []
    if t == 'data_train' or t == 'data_test':
        dir_data = os.getcwd() + '/' + dataset_path + '/' + t
        files = os.listdir(dir_data)
        files_path += [dir_data + '/' + f for f in files if not str(f).endswith('.gitkeep')]
    else:
        print('Invalid option')
    return files_path

## Train

In [7]:
def auto(epoch):
    path_auto = os.getcwd() + '/auto.json'
    with open(path_auto, 'r+') as f:
        store = json.load(f)
    data = store['data']
    window = store['window']
    store['epoch'] = epoch
    with open(path_auto, 'w+') as f:
        json.dump(store, f)
    return data, window

## Evaluate

In [8]:
def auto():
    path_auto = os.getcwd() + '/auto.json'
    with open(path_auto, 'r+') as f:
        store = json.load(f)
    window = store['window']
    epoch = store['epoch']
    return window, epoch


def getfid(path):
    return path.split('/')[-1]


#def get_score(files, thres, option):
def get_score(df_test, ts, thres, option):


    total_time = 0
    results = []
    savedscore = []


    in_timestamp = df_test['timestamp']
    in_value = df_test['value']
    in_label = df_test['is_anomaly']


    length = len(in_timestamp)


    if model == 'sr_cnn' and len(in_value) < window:
        print("length is shorter than win_size", len(in_value), window)
    time_start = time.time()
    timestamp, label, pre, scores = models[model](np.array(in_timestamp), np.array(in_value), np.array(in_label),
                                                  window, net, option, thres)
    

    
    time_end = time.time()
    total_time += time_end - time_start
    results.append([timestamp, label, pre, ts])
    savedscore.append([label, scores, ts, timestamp])
    return total_time, results, savedscore

# Main

#### Parser Values



In [9]:
# Yahoo
data_train = '../datasets/data_yahoo/'

window = 64 # window_nab = 64, window_yahoo = 64
step = 8 # step_nab = 8, step_yahoo = 8
seed = 54321
num = 10

In [10]:
lr = 1e-5 # lr_nab = 1e-5, lr_yahoo = 1e-6
load = False
save = 'snapshot'
epoch = 10
batch_size = 256
num_workers = 8
model = 'sr_cnn'
auto = False

In [11]:
# Yahoo
data_test = '../datasets/data_yahoo/'

epoch = 10
model_path_snapshot = 'snapshot'
delay = 0
thres = 0.95
auto = False
missing_option = 'anomaly'

In [20]:
ts_names_train = get_path(data_train, 'data_train')


In [21]:
ts_names_test = get_path(data_test, 'data_test')


In [22]:
ts_names = []
for i in os.listdir(data_test+'/data_train/'):
    ts_names.append(str(i))

In [26]:
# for daily retraining NAB
#window_retraining = 288

# for weekly retraining NAB
#window_retraining = 2016

# for weekly retraining Yahoo
window_retraining = 168



dataset_split_dict = {}

for ts in tqdm.tqdm(range(0, len(ts_names_train))):
    

    begin_train_timestamp = []
    end_train_timestamp = []

    begin_test_timestamp = []
    end_test_timestamp = []

    init_train = pd.read_csv(ts_names_train[ts])
    init_test = pd.read_csv(ts_names_test[ts])
    


    for i in range(0, (math.floor(len(init_test)/window_retraining)+1)):
        

        if(i == 0):
            df_train = init_train
        else:
            df_train = pd.concat([init_train[i*window_retraining:], init_test[0:i*window_retraining]], ignore_index=True)

        if(i == (round(len(init_test)/window_retraining))):
            df_test = init_test[(i)*window_retraining:]
        else:
            df_test = init_test[(i*window_retraining):((i+1)*window_retraining)]
            
        
       
        
        if(len(df_test)):
            begin_train_timestamp.append(df_train.iloc[0].timestamp)
            end_train_timestamp.append(df_train.iloc[len(df_train)-1].timestamp)
            begin_test_timestamp.append(df_test.iloc[0].timestamp)
            end_test_timestamp.append(df_test.iloc[len(df_test)-1].timestamp)
    
    

    begin_end_train_test_timestamps = []
    for j in range(0, len(begin_train_timestamp)):
        begin_end_train_test_timestamp = (begin_train_timestamp[j], end_train_timestamp[j], begin_test_timestamp[j], end_test_timestamp[j])
        begin_end_train_test_timestamps.append(begin_end_train_test_timestamp)
    begin_end_train_test_timestamps
    
    
    
    data_dict = {ts_names_train[ts].split('/')[len(ts_names_train[ts].split('/'))-1] : begin_end_train_test_timestamps}

    dataset_split_dict.update(data_dict)

100%|██████████| 17/17 [00:00<00:00, 28.86it/s]


### Get Maximum of Splits over Time

In [27]:
ts_names = list(dataset_split_dict.keys())
lengths_splits = []
for ts_name in ts_names:
    lengths_splits.append(len(dataset_split_dict[ts_name]))
#print('MAX LEN', np.max(lengths_splits))
#print('MIN LEN', np.min(lengths_splits))

### Duplicate Last Splits in Case Len < max(lenghts_splits)

In [28]:
for ts_name in ts_names:
    if(len(dataset_split_dict[ts_name])<np.max(lengths_splits)):
        #print(ts_name)
        while(len(dataset_split_dict[ts_name])<np.max(lengths_splits)):
            dataset_split_dict[ts_name].append(dataset_split_dict[ts_name][len(dataset_split_dict[ts_name])-1])
            #print(dataset_split_dict[ts_name])


In [29]:
lengths_splits = []
for ts_name in ts_names:
    lengths_splits.append(len(dataset_split_dict[ts_name]))
#lengths_splits

# SRCNN 

In [30]:
# Yahoo
path_merged = '../datasets/Yahoo_A1Benchmark/'
# NAB
#path_merged = '../datasets/NAB/'

ts_names = []
for i in os.listdir(path_merged):
    ts_names.append(str(i))

In [33]:
generator = gen(window, step, num)
model_update = 'sw'

final_results = []
final_delay = []
final_f1_score = []
final_precision = []
final_recall = []
final_TP = []
final_FP = []
final_TN = []
final_FN = []
final_seed = []

df_results_final_seeds = pd.DataFrame(columns=['Seeds', 'Retraining_Technique', 'Delay', 'Results', 
                                        'F1_score', 'Precision', 'Recall', 'TP', 'FP', 'TN', 'FN'])
df_results_final_seeds = df_results_final_seeds.reset_index(drop = True)


for seed in range(0,5):
    
    #print('SEED: ', seed)    
    final_seed.append(seed)



    results_corrected_per_split = []

    results_per_split = []

    split_number_all = []
    ts_all = []

    results_all = []
    total_fscore_all = []
    pre_all = []
    rec_all = []
    TP_all = []
    FP_all = []
    TN_all = []
    FN_all = []


    labels_pred = []
    labels_true = []


    for i in range(0, np.max(lengths_splits)):
        #print('Split Number: ', i)
        split_number_all.append(i)
        
        
        # GENERATE DATA FOR EACH TS

        results = []
        total_time = 0

        for ts_name in tqdm.tqdm(ts_names):
            #print('Time Series Name: ', ts_name)


            merged_df = pd.read_csv(path_merged + ts_name)
            
            for j in range(0, len(merged_df)):
                merged_df.timestamp[j] = float(j)

            index_train_start = merged_df[merged_df.timestamp == dataset_split_dict[ts_name][i][0]].index
            index_train_end = merged_df[merged_df.timestamp == dataset_split_dict[ts_name][i][1]].index

            index_test_start = merged_df[merged_df.timestamp == dataset_split_dict[ts_name][i][2]].index
            index_test_end = merged_df[merged_df.timestamp == dataset_split_dict[ts_name][i][3]].index
            



            train = merged_df[index_train_start[0]:index_test_start[0]]
            test = merged_df[index_test_start[0]:index_test_end[0]]


            # GENERATE DATA



            in_value = train['value']
            #print(len(in_value))

            if len(in_value) < window:
                print("value's length < window size", len(in_value), window)
                continue

            time_start = time.time()
            train_data = generator.generate_train_data(in_value)
            #print('TRAIN', len(train_data))
            time_end = time.time()
            total_time += time_end - time_start
            results += train_data

        #print('file num:', len(ts_names))
        #print('total fake data size:', len(results))


        train_data_path = os.getcwd() + '/' + data_train + '_' + str(window) + '_split_no_' + str(i) + '_train.json'

        with open(train_data_path, 'w+') as f:
            print(train_data_path)
            json.dump(results, f)


        # TRAIN

        if auto:
            data_train, window = auto(epoch)
        else:
            data_train, window = data_train, window
        torch.manual_seed(seed)
        np.random.seed(seed)
        models = {
            'sr_cnn': sr_cnn,
        }
        model = model
        root_path = os.getcwd()

        train_data_path = root_path + '/' + data_train + '_' + str(window) + '_split_no_' + str(i) + '_train.json'
        model_path = root_path + '/' + save + '/'
        if load:
            load_path = root_path + '/' + load
        else:
            load_path = None

        total_time = 0
        time_start = time.time()
        models[model](train_data_path, model_path, window, lr, epoch, batch_size, num_workers,
                      load_path=load_path, model_update = model_update, update_split = i)
        time_end = time.time()
        total_time += time_end - time_start
        #print('time used for training:', total_time, 'seconds')

        

        # EVALUATE

        if auto:
            window, epoch = auto()
        else:
            window = window
            epoch = epoch
        delay = delay
        model = model

        path_snapshot = '../'

        root = os.getcwd()
        print(data, window, epoch)
        models = {
            'sr_cnn': sr_cnn_eval,
        }

        model_path = path_snapshot + '/' + model_path_snapshot + '/srcnn_retry' + '_' + model_update + '_' + str(i) + '_' + str(epoch) + '_' + str(window) + '.bin'

        srcnn_model = Anomaly(window)
        net = load_model(srcnn_model, model_path)

        results_all = []


        for ts_name in tqdm(ts_names):

            ts_all.append(ts_name)

            merged_df = pd.read_csv(path_merged + ts_name)
            
            for j in range(0, len(merged_df)):
                merged_df.timestamp[j] = float(j)
            

            index_test_start = merged_df[merged_df.timestamp == dataset_split_dict[ts_name][i][2]].index
            index_test_end = merged_df[merged_df.timestamp == dataset_split_dict[ts_name][i][3]].index
            


            test = merged_df[index_test_start[0]:index_test_end[0]+1]

            total_time, results, savedscore = get_score(test, ts_name, thres, missing_option)




            results_all.append(results)

        #print('LEN results_all: ', len(results_all))


        #total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(results_corrected, delay)

        results_per_split.append(results_all)
        # save predictions

    ts_name = []
    labels_true_final = []
    labels_pred_final = []
    timestamps_final = []

    #ts_no = 44

    for ts_no in range(0, len(ts_names)):
        labels_true = []
        labels_pred = []
        timestamps = []

        for i in range(0, len(results_per_split)):


            for j in range(0, len(results_per_split[i][ts_no][0][1])):
                labels_true.append(results_per_split[i][ts_no][0][1][j])

            for k in range(0, len(results_per_split[i][ts_no][0][2])):    
                labels_pred.append(results_per_split[i][ts_no][0][2][k])

            for l in range(0, len(results_per_split[i][ts_no][0][0])):
                timestamps.append(results_per_split[i][ts_no][0][0][l])

        ts_name.append(results_per_split[i][ts_no][0][3])
        labels_true_final.append(labels_true)
        labels_pred_final.append(labels_pred)
        timestamps_final.append(timestamps)
    
    list_results_final = []
    for i in range(0, len(labels_true_final)):
        list_results_intermediate = []
        list_results_intermediate.append(timestamps_final[i])
        list_results_intermediate.append(labels_true_final[i])
        list_results_intermediate.append(labels_pred_final[i])
        list_results_intermediate.append(ts_name[i])

        list_results_final.append(list_results_intermediate)
    list_results_intermediate
    
    
    

    f1_per_delay = []
    delay_list = []
    precision_per_delay = []
    recall_per_delay = []
    TP_per_delay = []
    FP_per_delay = []
    TN_per_delay = []
    FN_per_delay = []


    for delay in range(0,8):
        total_fscore, pre, rec, TP, FP, TN, FN = evaluate_for_all_series(list_results_final, delay)
        delay_list.append(delay)
        f1_per_delay.append(total_fscore)
        precision_per_delay.append(pre)
        recall_per_delay.append(rec)
        TP_per_delay.append(TP)
        FP_per_delay.append(FP)
        TN_per_delay.append(TN)
        FN_per_delay.append(FN)
        
    final_results.append(list_results_final)
    final_delay.append(delay_list)
    final_f1_score.append(f1_per_delay)
    final_precision.append(precision_per_delay)
    final_recall.append(recall_per_delay)
    

    final_TP.append(TP_per_delay)
    final_FP.append(FP_per_delay)
    final_TN.append(TN_per_delay)
    final_FN.append(FN_per_delay)


    # Create single row 
    
    df_results_final = pd.DataFrame(columns=['Seeds', 'Retraining_Technique', 'Delay', 'Results', 
                                        'F1_score', 'Precision', 'Recall', 'TP', 'FP', 'TN', 'FN'])




    df_results_final = pd.DataFrame(columns=['Seeds', 'Retraining_Technique', 'Delay', 'Results', 
                                        'F1_score', 'Precision', 'Recall', 'TP', 'FP', 'TN', 'FN'])
    df_results_final['Seeds'] = [seed]


    df_results_final['Retraining_Technique'] = [model_update]


    df_results_final['Delay'] = [delay_list]
    df_results_final['Results'] = [list_results_final]
    df_results_final['F1_score'] = [f1_per_delay]
    df_results_final['Precision'] = [precision_per_delay]
    df_results_final['Recall'] = [recall_per_delay]
    df_results_final['TP'] = [TP_per_delay]
    df_results_final['FP'] = [FP_per_delay]
    df_results_final['TN'] = [TN_per_delay]
    df_results_final['FN'] = [FN_per_delay]

    df_results_final = df_results_final.reset_index(drop = True)

    df_results_final_seeds = df_results_final_seeds.append(df_results_final)
    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  0%|          | 0/67 [00:00<?, ?it/s]


KeyError: 'real_1.csv'

In [None]:
df_results_final_seeds = df_results_final_seeds.reset_index(drop = True)