In [4]:
import pandas as pd
import os
from statsmodels.tsa.stattools import acf, pacf
import numpy as np
from scipy.spatial.distance import cosine
from scipy import stats
from tqdm import tqdm
import math
from sklearn.feature_selection import mutual_info_regression
import statsmodels.api as sm

In [28]:
def get_path(dataset_path, t):
    files_path = []
    if t == 'data_train' or t == 'data_test':
        dir_data = os.getcwd() + '/' + dataset_path + '/' + t
        files = os.listdir(dir_data)
        files_path += [dir_data + '/' + f for f in files if not str(f).endswith('.gitkeep')]
    else:
        print('Invalid option')
    return files_path

In [6]:
def turningpoints(lst):
        dx = np.diff(lst)
        return np.sum(dx[1:] * dx[:-1] < 0)

In [7]:
def autocorrelation_f1(time_series):
    autocorrelation_ts = acf(time_series, nlags=5)
    return autocorrelation_ts

In [8]:
def partial_autocorrelation_f2(time_series):
    partial_autocorrelation_ts = pacf(time_series, nlags=5)
    return partial_autocorrelation_ts

In [9]:
def variance_f3(time_series):
    variance_ts = time_series.std()
    return variance_ts

In [10]:
def skewness_f4(time_series):
    variance_ts = time_series.skew()
    return variance_ts

In [11]:
def kurtosis_f5(time_series):
    kurtosis_ts = time_series.kurtosis()
    return kurtosis_ts

In [12]:
def turning_points_f6(time_series):
    turning_points_ts = turningpoints(time_series)
    return turning_points_ts

In [13]:
# The bicorrelations at the first three lags were used
def three_point_autocorrelation_f7(time_series, lag_delay=3):
    three_point_autocorrelation = acorr = sm.tsa.acf(time_series, nlags = lag_delay)
    return three_point_autocorrelation[:-1]

In [14]:
# The mutual information at the first three lags were used
def mutual_info_f8(time_series, lag_delay=3):
    mutual_info_ts = []
    for _ in range(0, lag_delay+1):
        lagged_time_series = time_series[:-lag_delay]
        time_series_ = time_series[lag_delay:].to_numpy().reshape(-1, 1)
        mutual_info = mutual_info_regression(time_series_, lagged_time_series)
        mutual_info_ts.append(mutual_info[0])
    return mutual_info_ts

In [15]:
def compute_features(time_series):
    time_series_diff = time_series - time_series.shift()
    time_series_diff = time_series_diff[1:]
    
    features = []

    # autocorrelation F1
    autocorrelation_ts = autocorrelation_f1(time_series_diff)
    # partial autocorrelation F2
    partial_autocorrelation_ts = partial_autocorrelation_f2(time_series_diff)
    # variance F3
    variance_ts = variance_f3(time_series_diff)
    # skewness F4
    skewness_ts = skewness_f4(time_series_diff)
    # Kurtoisis F5
    kurtoisis_ts = kurtosis_f5(time_series_diff)
    # Turning Point F6
    turning_point_ts = turning_points_f6(time_series_diff)
    # Three point autocorrelation F7
    three_point_autocorrelation_ts = three_point_autocorrelation_f7(time_series_diff)
    # Mutual info F8
    mutual_info_ts = mutual_info_f8(time_series_diff)

    for i in autocorrelation_ts:
        features.append(i)

    for i in partial_autocorrelation_ts:
        features.append(i)

    features.append(variance_ts)

    features.append(skewness_ts)

    features.append(kurtoisis_ts)

    features.append(turning_point_ts)
    
    for i in three_point_autocorrelation_ts:
        features.append(i)

    for i in mutual_info_ts:
        features.append(i)
    
    return features

In [16]:
def compute_distance(vetor1, vetor2, dist):
        '''
        Compute pearson correlation or cosine distance
        :param vetor1: first vector
        :param vetor2: second vector
        :return: distance
        '''  
        
        if(dist == 'cosine'):
            distance = cosine(vetor1, vetor2)
        elif(dist == 'pearsonr'):
            distance = stats.pearsonr(vetor1, vetor2)[0] # [0] to get distance [1] to get p-value
        else:
            print('Invalid Distance')
        
        return distance

In [17]:
def calculate_m0_std0(time_series, dist):
    final = len(time_series)
    qtd = 3
    
    vector_0 = time_series[:final-qtd]
    vector_features_0 = compute_features(vector_0)
    
    
    distance_vector = []
    for i in range(1, qtd):
        
        
        vector_1 = ts_train[i:final-qtd+i]
        
        vector_features = compute_features(vector_1)

        if(dist == 'cosine'):
            distance = compute_distance(vector_features_0, vector_features, 'cosine')

    
        if(dist == 'pearsonr'):
            distance = compute_distance(vector_features_0, vector_features, 'pearsonr')
        
        distance_vector.append(distance)

    
    return np.mean(distance_vector), np.std(distance_vector)

In [18]:
def update_ewma(value, t, Lambda, m0, std0):
    '''
    method to update ewma with current error
    :param error: double with the error to be checked # Luis??
    :param t: instant of time
    '''
    
    if(Lambda <= 0.1 and Lambda>=0.3):
        print('Please choose a value for Lambda between [0.1, 0.3]')
    else:
        change = False

        zt = 0
        avg_zero = m0
        std_zero = std0

        # Calculating the moving average
        if(t == 1):
            zt = (1-Lambda) * avg_zero + Lambda * value
        elif(change == True):
            change = False
            zt = (1-Lambda) * avg_zero + Lambda * value
        else:
            zt = (1-Lambda) * zt + Lambda * value


        # calculating the deviation of the moving average
        part1 = (Lambda/(2-Lambda))
        part2 = (1-Lambda)
        part3 = (2*t)
        part4 = (1 - (part2**part3))
        part5 = (part1 * part4 * std_zero)
        deviation_zero = np.sqrt(part5)

        return zt, std_zero

In [19]:
def monitor(zt, deviation_zero, control_param, warning_param):
        '''
        Method to check the FEED detection condition
        '''
        avg_zero = 0

        deviation_zero = 0
        change_detection = 0 # No Change
        #consulting the rules
        if(zt > avg_zero + (control_param * deviation_zero)):
            change = True
            change_detection = 1 # Concept Drift

        elif(zt > avg_zero + (warning_param * deviation_zero)):

            change_detection = 2 # Warning
            return change_detection
        
        return change_detection

In [29]:
data_train_path = '../../datasets/data_yahoo/'
data_test_path = '../../datasets/data_yahoo/'

In [30]:
files_train = get_path(data_train_path, 'data_train')
files_test = get_path(data_train_path, 'data_test')

In [31]:
data_train = pd.read_csv(files_train[20])
data_train = data_train.loc[:, ~data_train.columns.str.match('Unnamed')]
data_train

IndexError: list index out of range

In [18]:
data_test = pd.read_csv(files_test[20])
data_test = data_test.loc[:, ~data_test.columns.str.match('Unnamed')]
data_test

Unnamed: 0,timestamp,value,is_anomaly
0,720,121,0
1,721,74,0
2,722,78,0
3,723,63,0
4,724,71,0
...,...,...,...
715,1435,256,1
716,1436,275,1
717,1437,272,1
718,1438,421,1


In [29]:
ts_train = data_train.value
ts_test = data_test.value

# FEDD

## Computing Features

#### Autocorrelation

### Features Train

In [46]:
features_train = compute_features(ts_train)
features_train

[1.0,
 0.27162473363871364,
 0.20206110284203202,
 0.11094938675977972,
 -0.006305672797858372,
 0.04928468726393885,
 1.0,
 0.2718109054917901,
 0.1387047815366957,
 0.028742736348999997,
 -0.0731902695367786,
 0.05078818723600917,
 0.2971387554795686,
 4.348503064678562,
 128.8397159007936,
 686,
 1.0,
 0.27162473363871364,
 0.20206110284203202,
 0.23484200153576262,
 0.23413241448968192,
 0.23451705656085142,
 0.23446171981579056]

### Features Test

In [47]:
len(ts_test)

NameError: name 'ts_test' is not defined

In [48]:
window = 168

In [49]:
features_test_batches = []
for i in tqdm(range(0, (math.floor(len(ts_test)/window)+1))):
    if(i == (round(len(ts_test)/window))):
        ts_test_current = ts_test[(i)*window:]
    else:
        ts_test_current = ts_test[(i*window):((i+1)*window)]
    
    features_test_batches.append(compute_features(ts_test_current))
features_test_batches

NameError: name 'ts_test' is not defined

## Compute Distance

In [50]:
distances_cosine = []
distances_pearson = []

for i in range(0, len(features_test_batches)):
    distances_cosine.append(compute_distance(features_train, features_test_batches[i], 'cosine'))
    distances_pearson.append(compute_distance(features_train, features_test_batches[i], 'pearsonr'))

In [51]:
distances_cosine

[]

In [52]:
distances_pearson

[]

## Change Detection

In [54]:
control = 0.5
warning = 0.05
Lambda = 0.2
m0, std0 = calculate_m0_std0(ts_train, 'cosine')


for i in range(0, len(distances_pearson)):
    ztt, dev_zero = update_ewma(distances_pearson[i], i, Lambda, m0, std0)

    print(ztt)
    print('deviation', dev_zero)
    print(monitor(ztt, dev_zero, control, warning))