In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.io import arff
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from skmultiflow.trees import HoeffdingTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from skmultiflow.drift_detection import DDM
from skmultiflow.drift_detection import EDDM
from skmultiflow.drift_detection import ADWIN
from skmultiflow.drift_detection import HDDM_A
from skmultiflow.drift_detection import HDDM_W

from datetime import timedelta, date

from tqdm import tqdm
from itertools import groupby
from sklearn.decomposition import PCA

import random
import seaborn as sns

from collections import Counter
from operator import itemgetter
import operator

import itertools
from scipy import stats
from scipy.stats import mannwhitneyu
from math import log2
from sklearn.metrics.pairwise import manhattan_distances
from scipy.spatial.distance import chebyshev
from scipy.spatial.distance import kulsinski
from scipy.spatial.distance import cosine
from scipy.spatial.distance import sqeuclidean

from sklearn.metrics import accuracy_score

import warnings
from IPython.display import clear_output

In [2]:
def drift_detect_function(drift_detector, y_true, y_pred, no_batches):
    batches_with_drift = []
    for i in range(0, no_batches):
        label_difference = abs(y_true[i] - y_pred[i])
        #print(label_difference)
        
        for j in range(0, len(label_difference)):
            drift_detector.add_element(list(label_difference)[j])

            if(drift_detector.detected_change()):
                batches_with_drift.append(i)
    return batches_with_drift

In [3]:
# ADWIN has: 1 for correct prediction and 0 for incorrect prediction
def drift_detect_function_ADWIN(drift_detector, y_true, y_pred, no_batches):
    
    batches_with_drift = []
    for i in range(0, no_batches):
        label_difference = abs(y_true[i] - y_pred[i])
        label_difference_final = 1 - label_difference
    
        for j in range(0, len(label_difference_final)):
            drift_detector.add_element(list(label_difference_final)[j])
            if(drift_detector.detected_change()):
                batches_with_drift.append(i)
    return batches_with_drift

In [4]:
def kl_divergence(p, q):
    return sum(p[0][i] * log2(p[0][i]/q[0][i]) for i in range(len(p[0])))

In [5]:
def bhattacharyya(p, q):
    return -np.log(sum(np.square(p[0][i]*1.0*q[0][i]) for i in range(len(p[0]))))

In [6]:
def bootstrapping(X_train, bootstrapping_samples, pca=None):
    # Extract distributions of 50 random training subsets
    distributions_bootstrapping = []
    for i in range(0, bootstrapping_samples):
        # generate random number between 0 and length_of_train-length_of_test in order to have a sample of size length_of_test
        rand = random.randint(0,len(X_train)-length_of_test)
        # extract the distribution from the training samples random number:random number + length_of_test by means of histogram
        if pca is not None:
            bootstrapping_input = pca.transform(X_train[rand:rand+length_of_test])
        else:
            bootstrapping_input = X_train[rand:rand+length_of_test]
        dist = sns.distplot(bootstrapping_input).get_lines()[0].get_data()[1]
        # store distribution
        distributions_bootstrapping.append(dist)
        plt.close()
    return distributions_bootstrapping

In [7]:
def kdqtrees_bootstrapping(distrib_X_train, distrib_X_test, distributions_bootstrapping, distance):
    
    # option to choose the similarity distance
    
    if(distance == 'kl_divergence'):
        similarity_metric = kl_divergence
    elif(distance == 'manhattan'):
        similarity_metric = manhattan_distances
    elif(distance == 'chebyshev'):
        similarity_metric = chebyshev
    elif(distance == 'kulsinski'):
        similarity_metric = kulsinski
    elif(distance == 'cosine'):
        similarity_metric = cosine
    elif(distance == 'squared_euclidean'):
        similarity_metric = sqeuclidean
    elif(distance == 'bhattacharyya'):
        similarity_metric = bhattacharyya
    
    # Bootstrapping technique to define critical region
    
    dist_bootstrapping = []
    
    # Calculate similarity distance between training set and each of the 50 training subsets
    for i in range(0, len(distributions_bootstrapping)):
        dist_bootstrapping.append(similarity_metric([distrib_X_train], [distributions_bootstrapping[i]]))
    
    
    # Define Critical Region
    critical_region = np.max(dist_bootstrapping)
    
    # Detect Drifts between train and test
    
    drift_batch = None
    
    # Calculate distance between train and test distributions
    
    similarity_metric_train_test = similarity_metric([distrib_X_train], [distrib_X_test])
    
    if(similarity_metric_train_test>critical_region):
        print(distance + " : Drift Detected" )
        return 1
    else:
        return 0

In [8]:
def compute_metric_false_positive(array_batches, drift_start):
        
    if(len(array_batches)>0): 
        
        if(len([x for x in array_batches if x<drift_start])>0):
            return(len([x for x in array_batches if x<drift_start])/drift_start)
        else:
            return 0
        
    else:
        return 'nothing_detected'

In [9]:
def ede_drift_detector(distrib_X_train, distrib_X_test, statistical_test):

    if(statistical_test == 'ks'):
        stat_test = stats.kstest
    elif(statistical_test == 'mw'):
        stat_test = stats.mannwhitneyu
    
    v, p = stat_test(distrib_X_train, distrib_X_test)
        
    
    if(p<=0.05):
        print("Reject Null Hypothesis according to KS statistical test")
        return 1
    else:
        return 0

In [10]:
def compute_metric_latency(array_batches, no_batches_with_drift, drift_start):
    
    if(len(array_batches)>0):  
        #print(np.array(array_pr)>=drift_type_start)
        #print(np.argwhere(np.array(array_pr)>=drift_type_start).size==0)
        
        if(np.argwhere(np.array(array_batches)>=drift_start).size==0):
            latency_score = 'nothing_detected' 
        else:
            batch_drift_detected = array_batches[np.argwhere(np.array(array_batches)>=drift_start)[0][0]]
            latency_score = (batch_drift_detected - drift_start)/no_batches_with_drift
        return latency_score
    else:
        return 'nothing_detected'

In [11]:
warnings.filterwarnings('ignore')

# Energy

In [12]:
data_energy = arff.loadarff('../elecNormNew.arff')
df_energy = pd.DataFrame(data_energy[0])


# Adding the real date column to the dataset

In [13]:
day_list = list(df_energy.day)
day_list_str = []
for i in range(0, len(day_list)):
    day_list_str.append(str(day_list[i]))
day_list_str_non_duplicates = [key for key, _group in groupby(day_list_str)]


In [14]:
prev_day = day_list_str_non_duplicates[0]


In [15]:
for i in range(1, len(day_list_str_non_duplicates)):
    prev_day = day_list_str_non_duplicates[i-1]
    
    current_day = day_list_str_non_duplicates[i]
    
    if(prev_day == 'b\'7\''):
        expected_current_day = 'b\'1\''
    else:
        expected_current_day = 'b\'' + str(int(prev_day.split('\'')[1])+1)+ '\''
    if(current_day!= expected_current_day):
        print('fjidjsfio')

In [16]:
def daterange(start_date, end_date):
    for n in range(int ((end_date - start_date).days)):
        yield start_date + timedelta(n)

date_time_energy = []
start_date = date(1996, 5, 7)
end_date = date(1998, 12, 7)
for single_date in daterange(start_date, end_date):
    date_time_energy.append(single_date.strftime("%Y-%m-%d"))

In [17]:
date_energy_df = np.repeat(date_time_energy, 48)

In [18]:
df_energy['real_date'] = date_energy_df


### Label encoding

In [19]:
labelencoder = LabelEncoder()
df_energy['class'] = labelencoder.fit_transform(df_energy['class'])


training until the 15th of April.

In [20]:
training_samples_index = list(df_energy[df_energy.real_date == '1997-04-15'].index)[len(list(df_energy[df_energy.real_date == '1997-05-02'].index))-1]


## Split in Train and Test

In [21]:
train = df_energy[0:training_samples_index]


In [22]:
train_data = train.drop(['date', 'day', 'period', 'class', 'real_date'], axis = 1)
label_train_data = train['class']


# Train Models

In [23]:
gnb = GaussianNB()
naive_bayes = gnb.fit(train_data, label_train_data)

ht = HoeffdingTreeClassifier()
hoeffding_tree = ht.fit(np.array(train_data), np.array(label_train_data))

adb = AdaBoostClassifier(n_estimators=100, random_state=0)
adaboost = adb.fit(train_data, label_train_data)

xgb = XGBClassifier()
xgboost = xgb.fit(train_data, label_train_data)

lgb = LGBMClassifier()
lightgbm = lgb.fit(train_data, label_train_data)





# Test Models

Testing scenario: test model every week

In [24]:
test = df_energy[training_samples_index:]
test = test.reset_index(drop = True)


In [25]:
test_data = test.drop(['date', 'day', 'period', 'class', 'real_date'], axis = 1)
label_test_data = test['class']


In [26]:
dict_test = test.groupby(['real_date', 'day']).groups


In [27]:
dict_test_sorted = dict([(key , value) for (key, value) in sorted(dict_test.items())])


In [28]:
week_values = list(dict_test_sorted.values())


In [29]:
indexes_testing_batches = []
for i in range(0, len(week_values), 7):
    index = []
    for j in range(0, len(week_values[i:i+7])):
        for k in range(0, len(week_values[i:i+7][j])):
            index.append(week_values[i:i+7][j][k])
    indexes_testing_batches.append(index)


In [30]:
y_pred_naivebayes = []
y_pred_hoftree = []
y_pred_adaboost = []
y_pred_xgboost = []
y_pred_lightgbm = []

df_test_labels_total = []


for i in tqdm(range(0, len(indexes_testing_batches))):
    testing_batch = pd.DataFrame()
    label_batch = []
    
    index_current_testing_batch = indexes_testing_batches[i]
    
    testing_batch = test_data[(index_current_testing_batch[0]):(index_current_testing_batch[-1]+1)]
    label_batch = label_test_data[(index_current_testing_batch[0]):(index_current_testing_batch[-1]+1)]

    df_test_labels_total.append(list(label_batch))
    
    # Naive Bayes
    y_pred_nb = naive_bayes.predict(testing_batch)
    y_pred_naivebayes.append(y_pred_nb)
    
    
    # Hoeffding Trees
    y_pred_ht = hoeffding_tree.predict(np.array(testing_batch))
    y_pred_hoftree.append(y_pred_ht)
    
    # Adaboost
    y_pred_adb = adaboost.predict(testing_batch)
    y_pred_adaboost.append(y_pred_adb)
    
    # XGBoost
    y_pred_xgb = xgboost.predict(testing_batch)
    y_pred_xgboost.append(y_pred_xgb)
    
    # LightGBM
    y_pred_lgbm = lightgbm.predict(testing_batch)
    y_pred_lightgbm.append(y_pred_lgbm)

100%|██████████| 86/86 [00:03<00:00, 26.88it/s]


In [31]:
y_pred_dict = {'NaiveBayes': y_pred_naivebayes, 
                   'HoeffdingTree' : y_pred_hoftree, 
                   'Adaboost' : y_pred_adaboost,
                   'XGBoost' : y_pred_xgboost,
                   'LightGBM' : y_pred_lightgbm}

# Detect Drift

In [33]:
no_batches = len(indexes_testing_batches)


In [34]:
classifier_list = []
detector_list = []
batches_detected = []

for i in range(0, len(list(y_pred_dict.keys()))):
    classifier_list.append(list(y_pred_dict.keys())[i])
    
    detector_list.append('DDM')
    batches_detected.append(drift_detect_function(DDM(), df_test_labels_total, list(y_pred_dict.values())[i], no_batches))
    
    detector_list.append('EDDM')
    batches_detected.append(drift_detect_function(EDDM(), df_test_labels_total, list(y_pred_dict.values())[i], no_batches))
    
    detector_list.append('ADWIN')
    batches_detected.append(drift_detect_function(ADWIN(), df_test_labels_total, list(y_pred_dict.values())[i], no_batches))
    
    detector_list.append('HDDM_A')
    batches_detected.append(drift_detect_function(HDDM_A(), df_test_labels_total, list(y_pred_dict.values())[i], no_batches))
    
    detector_list.append('HDDM_W')
    batches_detected.append(drift_detect_function(HDDM_W(), df_test_labels_total, list(y_pred_dict.values())[i], no_batches))

batches_detected_clean = []
for i in range(0, len(batches_detected)):
    batches_detected_clean.append(list(dict.fromkeys(batches_detected[i])))

classifier_list = list(itertools.chain.from_iterable(itertools.repeat(x, 5) for x in classifier_list))



In [35]:
df_energy_eb_detectors = pd.DataFrame(list(zip(classifier_list, detector_list, batches_detected_clean)), columns =['Classifier', 'Detector', 'Batches_Detected'])
# df_energy_eb_detectors

# DDB Detectors

### Distribution of Reference Data

In [36]:
pca = PCA(n_components = 0.999)
pca.fit(train_data)

train_data_pca = pca.transform(train_data)

In [37]:
distribution_train = sns.distplot(train_data).get_lines()[0].get_data()[1]
plt.close()

distribution_train_pca = sns.distplot(train_data_pca).get_lines()[0].get_data()[1]
plt.close()



### Distribution Bootstrapping

In [38]:
# length_of_test should be around 15000 (best compromise between all testing sets)
length_of_test = 15000
distributions_bootstrapping = bootstrapping(train_data, bootstrapping_samples=50)

In [39]:
distributions_bootstrapping_pca = bootstrapping(train_data, bootstrapping_samples=50, pca=pca)

In [40]:
distributions_test = []
distributions_test_pca = []

for i in tqdm(range(0, len(indexes_testing_batches))):
    
    index_current_testing_batch = indexes_testing_batches[i]
    
    testing_batch = test_data[(index_current_testing_batch[0]):(index_current_testing_batch[-1]+1)]

    dist_test = sns.distplot(testing_batch).get_lines()[0].get_data()[1]
    plt.close()
    
    dist_test_pca = sns.distplot(pca.transform(testing_batch)).get_lines()[0].get_data()[1]
    plt.close()
    
    distributions_test.append(dist_test)
    distributions_test_pca.append(dist_test_pca)

100%|██████████| 86/86 [00:10<00:00,  8.55it/s]


## EDE

In [41]:
detected_drifts_ks = []
detected_drifts_mw = []
for i in range(0, len(distributions_test)):
    if(ede_drift_detector(distribution_train, distributions_test[i], 'ks')==1):
        detected_drifts_ks.append(i)
    if(ede_drift_detector(distribution_train, distributions_test[i], 'mw')==1):
        detected_drifts_mw.append(i)
    clear_output(wait=True)

test = ['ks', 'mw']
drifts = [detected_drifts_ks, detected_drifts_mw]


Reject Null Hypothesis according to KS statistical test
Reject Null Hypothesis according to KS statistical test


In [42]:
df_ede_bootstrapping = pd.DataFrame({'detector' : 'ede',
                                    'distance/test': test, 
                                    'Batches_Detected': drifts})
# df_ede_bootstrapping

## kdqTrees

In [43]:
kl_drift = []
mh_drift = []
cbs_drift = []
ksnk_drift = []
csn_drift = []
sqe_drift = []
bct_drift = []

for i in range(0, len(distributions_test)):
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'kl_divergence') == 1):
        kl_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'manhattan') == 1):
        mh_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'chebyshev') == 1):
        cbs_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'kulsinski') == 1):
        ksnk_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'cosine') == 1):
        csn_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'squared_euclidean') == 1):
        sqe_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train, distributions_test[i], distributions_bootstrapping, 'bhattacharyya') == 1):
        bct_drift.append(i)
    clear_output(wait=True)

distances = ['kl_div', 'manhattan', 'chebysev', 'kulsinski', 'cosine', 'sq_euclid', 'batthacrya']
drifts = [kl_drift, mh_drift, cbs_drift, ksnk_drift, csn_drift, sqe_drift, bct_drift]
df_kdqtrees_bootstrapping = pd.DataFrame({'detector' : 'kdqTrees',
                                        'distance/test': distances, 
                                        'Batches_Detected': drifts
                                })


kl_divergence : Drift Detected
manhattan : Drift Detected
chebyshev : Drift Detected
cosine : Drift Detected
squared_euclidean : Drift Detected
bhattacharyya : Drift Detected


In [44]:
# df_kdqtrees_bootstrapping

## PCA-kdq

In [45]:
kl_drift = []
mh_drift = []
cbs_drift = []
ksnk_drift = []
csn_drift = []
sqe_drift = []
bct_drift = []

for i in range(0, len(distributions_test_pca)):

    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'kl_divergence') == 1):
        kl_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'manhattan') == 1):
        mh_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'chebyshev') == 1):
        cbs_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'kulsinski') == 1):
        ksnk_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'cosine') == 1):
        csn_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'squared_euclidean') == 1):
        sqe_drift.append(i)
    if(kdqtrees_bootstrapping(distribution_train_pca, distributions_test_pca[i], distributions_bootstrapping_pca, 'bhattacharyya') == 1):
        bct_drift.append(i)
    clear_output(wait=False)

distances = ['kl_div', 'manhattan', 'chebysev', 'kulsinski', 'cosine', 'sq_euclid', 'batthacrya']
drifts = [kl_drift, mh_drift, cbs_drift, ksnk_drift, csn_drift, sqe_drift, bct_drift]
df_pca_bootstrapping = pd.DataFrame({'detector' : 'pca',
                                     'distance/test': distances, 
                                     'Batches_Detected': drifts
                                })

merged_results = pd.concat([df_ede_bootstrapping, df_kdqtrees_bootstrapping, df_pca_bootstrapping])
merged_results = merged_results.reset_index(drop=True)




In [46]:
# df_pca_bootstrapping

In [47]:
df_energy_ddb_detectors = pd.concat([df_ede_bootstrapping, df_kdqtrees_bootstrapping, df_pca_bootstrapping])
#df_energy_ddb_detectors

# Calculate metrics

In [48]:
no_batches_with_drift = 84 # there are 84 batches with drift because 3 feature start changing their behaviour after the 2nd of May 1997
drift_start = 2 # drift starts at batch 2, which is the week including the 2nd of May 1997 (30.04.1997-06.05.1997) 

In [49]:
latency_erb = []
fpr_erb = []

latency_ddb = []
fpr_ddb = []

for i in range(0, len(list(df_energy_eb_detectors.Batches_Detected))):
    latency_erb.append(compute_metric_latency(list(df_energy_eb_detectors.Batches_Detected)[i], no_batches_with_drift, drift_start))
    fpr_erb.append(compute_metric_false_positive(list(df_energy_eb_detectors.Batches_Detected)[i], drift_start))

for i in range(0, len(list(df_energy_ddb_detectors.Batches_Detected))):
    latency_ddb.append(compute_metric_latency(list(df_energy_ddb_detectors.Batches_Detected)[i], no_batches_with_drift, drift_start))
    fpr_ddb.append(compute_metric_false_positive(list(df_energy_ddb_detectors.Batches_Detected)[i], drift_start))

In [50]:
df_energy_eb_detectors['latency'] = latency_erb
df_energy_eb_detectors['fpr'] = fpr_erb
df_energy_eb_detectors

Unnamed: 0,Classifier,Detector,Batches_Detected,latency,fpr
0,NaiveBayes,DDM,"[2, 4, 7, 9, 10, 11, 12, 15, 18, 20, 21, 22, 2...",0.0,0.0
1,NaiveBayes,EDDM,"[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0.0,1.0
2,NaiveBayes,ADWIN,"[2, 3, 4, 8, 9, 11, 12, 14, 18, 21, 22, 23, 36...",0.0,0.0
3,NaiveBayes,HDDM_A,"[0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 1...",0.0,1.0
4,NaiveBayes,HDDM_W,"[0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 15, 16, 1...",0.0,1.0
5,HoeffdingTree,DDM,"[2, 5, 7, 8, 9, 10, 12, 15, 16, 17, 18, 20, 22...",0.0,0.0
6,HoeffdingTree,EDDM,"[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0.0,1.0
7,HoeffdingTree,ADWIN,"[2, 3, 4, 6, 8, 9, 14, 18, 19, 23, 25, 34, 48,...",0.0,0.0
8,HoeffdingTree,HDDM_A,"[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15...",0.0,1.0
9,HoeffdingTree,HDDM_W,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15,...",0.0,1.0


In [51]:
df_energy_ddb_detectors['latency'] = latency_ddb
df_energy_ddb_detectors['fpr'] = fpr_ddb
df_energy_ddb_detectors

Unnamed: 0,detector,distance/test,Batches_Detected,latency,fpr
0,ede,ks,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
1,ede,mw,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
0,kdqTrees,kl_div,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
1,kdqTrees,manhattan,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
2,kdqTrees,chebysev,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
3,kdqTrees,kulsinski,[],nothing_detected,nothing_detected
4,kdqTrees,cosine,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
5,kdqTrees,sq_euclid,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
6,kdqTrees,batthacrya,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",0.0,1.0
0,pca,kl_div,"[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 1...",0.0,1.0
