In [1]:
import pandas as pd
import os
from statsmodels.tsa.stattools import acf, pacf
import numpy as np
from scipy.spatial.distance import cosine
from scipy import stats
from tqdm import tqdm
import math

In [2]:
def get_path(dataset_path, t):
    files_path = []
    if t == 'train' or t == 'test':
        dir_data = os.getcwd() + '/' + dataset_path + '/' + t
        files = os.listdir(dir_data)
        files_path += [dir_data + '/' + _ for _ in files]
    else:
        print('Invalid option')
    return files_path

In [3]:
def turningpoints(lst):
        dx = np.diff(lst)
        return np.sum(dx[1:] * dx[:-1] < 0)

In [4]:
def autocorrelation_f1(time_series):
    autocorrelation_ts = acf(time_series, nlags=5)
    return autocorrelation_ts

In [5]:
def partial_autocorrelation_f2(time_series):
    partial_autocorrelation_ts = pacf(time_series, nlags=5)
    return partial_autocorrelation_ts

In [6]:
def variance_f3(time_series):
    variance_ts = time_series.std()
    return variance_ts

In [7]:
def skewness_f4(time_series):
    variance_ts = time_series.skew()
    return variance_ts

In [8]:
def kurtosis_f5(time_series):
    kurtosis_ts = time_series.kurtosis()
    return kurtosis_ts

In [9]:
def turning_points_f6(time_series):
    turning_points_ts = turningpoints(time_series)
    return turning_points_ts

In [10]:
def compute_features(time_series):
    time_series_diff = time_series - time_series.shift()
    time_series_diff = time_series_diff[1:]
    
    features = []

    # autocorrelation F1
    autocorrelation_ts = autocorrelation_f1(time_series_diff)
    # partial autocorrelation F2
    partial_autocorrelation_ts = partial_autocorrelation_f2(time_series_diff)
    # variance F3
    variance_ts = variance_f3(time_series_diff)
    # skewness F4
    skewness_ts = skewness_f4(time_series_diff)
    # Kurtoisis F5
    kurtoisis_ts = kurtosis_f5(time_series_diff)
    # Turning Point F6
    turning_point_ts = turning_points_f6(time_series_diff)

    for i in autocorrelation_ts:
        features.append(i)

    for i in partial_autocorrelation_ts:
        features.append(i)

    features.append(variance_ts)

    features.append(skewness_ts)

    features.append(kurtoisis_ts)

    features.append(turning_point_ts)
    
    return features

In [11]:
def compute_distance(vetor1, vetor2, dist):
        '''
        Compute pearson correlation or cosine distance
        :param vetor1: first vector
        :param vetor2: second vector
        :return: distance
        '''  
        
        if(dist == 'cosine'):
            distance = cosine(vetor1, vetor2)
        elif(dist == 'pearsonr'):
            distance = stats.pearsonr(vetor1, vetor2)[0] # [0] to get distance [1] to get p-value
        else:
            print('Invalid Distance')
        
        return distance

In [12]:
def calculate_m0_std0(time_series, dist):
    final = len(time_series)
    qtd = 3
    
    vector_0 = time_series[:final-qtd]
    vector_features_0 = compute_features(vector_0)
    
    
    distance_vector = []
    for i in range(1, qtd):
        
        
        vector_1 = ts_train[i:final-qtd+i]
        
        vector_features = compute_features(vector_1)

        if(dist == 'cosine'):
            distance = compute_distance(vector_features_0, vector_features, 'cosine')

    
        if(dist == 'pearsonr'):
            distance = compute_distance(vector_features_0, vector_features, 'pearsonr')
        
        distance_vector.append(distance)

    
    return np.mean(distance_vector), np.std(distance_vector)

In [13]:
def update_ewma(value, t, Lambda, m0, std0):
    '''
    method to update ewma with current error
    :param error: double with the error to be checked # Luis??
    :param t: instant of time
    '''
    
    if(Lambda <= 0.1 and Lambda>=0.3):
        print('Please choose a value for Lambda between [0.1, 0.3]')
    else:
        change = False

        zt = 0
        avg_zero = m0
        std_zero = std0

        # Calculating the moving average
        if(t == 1):
            zt = (1-Lambda) * avg_zero + Lambda * value
        elif(change == True):
            change = False
            zt = (1-Lambda) * avg_zero + Lambda * value
        else:
            zt = (1-Lambda) * zt + Lambda * value


        # calculating the deviation of the moving average
        part1 = (Lambda/(2-Lambda))
        part2 = (1-Lambda)
        part3 = (2*t)
        part4 = (1 - (part2**part3))
        part5 = (part1 * part4 * std_zero)
        deviation_zero = np.sqrt(part5)

        return zt, std_zero

In [14]:
def monitor(zt, deviation_zero, control_param, warning_param):
        '''
        Method to check the FEED detection condition
        '''
        avg_zero = 0

        deviation_zero = 0
        change_detection = 0 # No Change
        #consulting the rules
        if(zt > avg_zero + (control_param * deviation_zero)):
            change = True
            change_detection = 1 # Concept Drift

        elif(zt > avg_zero + (warning_param * deviation_zero)):

            change_detection = 2 # Warning
            return change_detection
        
        return change_detection

In [15]:
data_train_path = '../../../Documents/phd_related/srcnn/anomalydetector-master/data_yahoo/'
data_test_path = '../../../Documents/phd_related/srcnn/anomalydetector-master/data_yahoo'

In [16]:
files_train = get_path(data_train_path, 'train')
files_test = get_path(data_train_path, 'test')

In [17]:
data_train = pd.read_csv(files_train[20])
data_train = data_train.loc[:, ~data_train.columns.str.match('Unnamed')]
data_train

Unnamed: 0,timestamp,value,is_anomaly
0,1,159,0
1,2,137,0
2,3,141,0
3,4,158,0
4,5,143,0
...,...,...,...
714,715,113,0
715,716,105,0
716,717,80,0
717,718,92,0


In [18]:
data_test = pd.read_csv(files_test[20])
data_test = data_test.loc[:, ~data_test.columns.str.match('Unnamed')]
data_test

Unnamed: 0,timestamp,value,is_anomaly
0,720,121,0
1,721,74,0
2,722,78,0
3,723,63,0
4,724,71,0
...,...,...,...
715,1435,256,1
716,1436,275,1
717,1437,272,1
718,1438,421,1


In [19]:
ts_train = data_train.value
ts_test = data_test.value

# FEDD

## Computing Features

#### Autocorrelation

### Features Train

In [20]:
features_train = compute_features(ts_train)
features_train

[1.0,
 -0.4508490512500811,
 0.03765380479231339,
 -0.08003055959817737,
 0.018525726228448957,
 0.04369271858986207,
 1.0,
 -0.4514778504847395,
 -0.20859079804665925,
 -0.2021429783006131,
 -0.14763412453190297,
 -0.04149795950525788,
 17.279874103141136,
 0.43762280140622684,
 1.0922280708821335,
 526]

### Features Test

In [21]:
len(ts_test)

720

In [22]:
window = 168

In [23]:
features_test_batches = []
for i in tqdm(range(0, (math.floor(len(ts_test)/window)+1))):
    if(i == (round(len(ts_test)/window))):
        ts_test_current = ts_test[(i)*window:]
    else:
        ts_test_current = ts_test[(i*window):((i+1)*window)]
    
    features_test_batches.append(compute_features(ts_test_current))
features_test_batches

  time_series_diff = time_series_diff[1:]
100%|██████████| 5/5 [00:00<00:00, 333.76it/s]


[[1.0,
  -0.2779431837642242,
  -0.16757854807235545,
  0.12897466757595005,
  -0.028002309492611226,
  -0.12331554679736849,
  1.0,
  -0.2796175402929244,
  -0.2688131918842712,
  -0.004683682244222068,
  -0.03404409082325436,
  -0.1322633990651524,
  19.428238740885142,
  -0.3984718902843558,
  1.0007741618327235,
  112],
 [1.0,
  -0.36220060984002583,
  -0.1793584141393113,
  0.10699591002763914,
  0.05472391857194537,
  -0.08695115196099001,
  1.0,
  -0.36438254122460434,
  -0.3624283894892112,
  -0.15031008970547974,
  -0.024470552332464906,
  -0.06578391440517065,
  14.470496941214742,
  -0.07334234734900558,
  0.14803918515888626,
  119],
 [1.0,
  -0.4098171011514344,
  -0.028312101635483283,
  -0.015070725737696331,
  -0.009330431548205217,
  0.02080196282941207,
  1.0,
  -0.4122858788692141,
  -0.23931331751367318,
  -0.16441653030193196,
  -0.1248549269273064,
  -0.06371259808729893,
  14.6204808055564,
  -0.09934134899302789,
  0.12257184798491805,
  122],
 [1.0,
  -0.301550

## Compute Distance

In [24]:
distances_cosine = []
distances_pearson = []

for i in range(0, len(features_test_batches)):
    distances_cosine.append(compute_distance(features_train, features_test_batches[i], 'cosine'))
    distances_pearson.append(compute_distance(features_train, features_test_batches[i], 'pearsonr'))

In [25]:
distances_cosine

[0.009719647782615803,
 0.003937570489907172,
 0.003781657247682757,
 0.005895059506090372,
 0.3282347436260915]

In [26]:
distances_pearson

[0.9901188356896876,
 0.9960009574986615,
 0.9961569241115142,
 0.9940386727657093,
 0.6423225858044017]

## Change Detection

In [30]:
control = 0.5
warning = 0.05
Lambda = 0.2
m0, std0 = calculate_m0_std0(ts_train, 'cosine')


for i in range(0, len(distances_pearson)):
    ztt, dev_zero = update_ewma(distances_pearson[i], i, Lambda, m0, std0)

    print(ztt)
    print('deviation', dev_zero)
    print(monitor(ztt, dev_zero, control, warning))

0.19802376713793754
deviation 1.4570694650828386e-10
1
0.19920019170124956
deviation 1.4570694650828386e-10
1
0.19923138482230285
deviation 1.4570694650828386e-10
1
0.19880773455314188
deviation 1.4570694650828386e-10
1
0.12846451716088034
deviation 1.4570694650828386e-10
1


  time_series_diff = time_series_diff[1:]
