In [None]:
# Widen the notebook container to 80% of the browser window.
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
from scipy.io import arff
import random
import pandas as pd
import numpy as np
from sklearn import tree
from tqdm import tqdm
from math import log2
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import manhattan_distances
from scipy.spatial.distance import chebyshev
from scipy.spatial.distance import kulsinski
from scipy.spatial.distance import cosine
from scipy.spatial.distance import sqeuclidean
from scipy import stats
from scipy.stats import mannwhitneyu
from skmultiflow.drift_detection import HDDM_W
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import itertools
import datetime
import warnings
from IPython.display import clear_output

In [None]:
# Let pandas print up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [None]:
# length_of_train: number of leading samples used as the training window.
# length_of_test: number of samples in each subsequent test batch.
length_of_train, length_of_test = 30_000, 10_000

# Functions

In [None]:
def create_train(dataset, length_of_train):
    """Split the first `length_of_train` rows of `dataset` into features and labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a 'class' column plus the feature columns.
    length_of_train : int
        Number of leading rows that form the training window.

    Returns
    -------
    (X_train, y_train)
        `X_train` is the window without the 'class' column and `y_train` is
        the 'class' column of the same rows.
    """
    # Use the frame that was passed in; the previous version silently read the
    # notebook-level `df` and ignored the `dataset` argument.
    train_window = dataset.iloc[0:length_of_train]
    X_train = train_window.drop(['class'], axis=1)
    y_train = train_window['class']
    return X_train, y_train

In [None]:
def create_test(dataset, start_from_sample, length_of_test):
    """Extract one test batch of `length_of_test` rows from `dataset`.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing a 'class' column plus the feature columns.
    start_from_sample : int
        Positional index of the first row of the batch.
    length_of_test : int
        Number of rows in the batch.

    Returns
    -------
    (X_test, y_test)
        `X_test` is the batch without the 'class' column and `y_test` is the
        'class' column of the same rows.
    """
    # Use the frame that was passed in; the previous version silently read the
    # notebook-level `df` and ignored the `dataset` argument.
    stop = start_from_sample + length_of_test
    test_window = dataset.iloc[start_from_sample:stop]
    X_test = test_window.drop(['class'], axis=1)
    y_test = test_window['class']
    return X_test, y_test

In [None]:
def kl_divergence(p, q):
    """Kullback-Leibler divergence (base 2) between two discrete distributions.

    Both arguments are one-row containers: the values are read from `p[0]`
    and `q[0]`, and `q[0]` must be at least as long as `p[0]`.

    Zero entries follow the usual conventions instead of raising:
    a term with `p_i == 0` contributes 0, and a term with `p_i > 0` and
    `q_i == 0` makes the divergence infinite.

    Returns
    -------
    float
        sum_i p_i * log2(p_i / q_i)
    """
    def term(p_i, q_i):
        if p_i == 0:
            # 0 * log(0 / q) is taken as 0; log2(0) would raise ValueError.
            return 0.0
        if q_i == 0:
            # p > 0 where q == 0: the divergence is unbounded.
            return float('inf')
        return p_i * log2(p_i / q_i)

    return sum(term(p[0][i], q[0][i]) for i in range(len(p[0])))

In [None]:
def bhattacharyya(p, q):
    """Bhattacharyya distance between two discrete distributions.

    Both arguments are one-row containers: the values are read from `p[0]`
    and `q[0]`.

    The distance is `-ln(BC)` where the Bhattacharyya coefficient is
    `BC = sum_i sqrt(p_i * q_i)`. The previous version squared the products
    (`np.square`) instead of taking their square root, which is not the
    Bhattacharyya coefficient.

    Returns
    -------
    float
        `-ln(sum_i sqrt(p_i * q_i))`; infinite when the coefficient is 0.
    """
    p_values = np.asarray(p[0], dtype=float)
    q_values = np.asarray(q[0], dtype=float)
    coefficient = np.sum(np.sqrt(p_values * q_values))
    return -np.log(coefficient)

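The `bootstrapping` function takes as input:
- the training set (`X_train`);
- the number of random training subsets to draw (`bootstrapping_samples`);
- optionally, a fitted PCA (`pca`) that is applied to each subset before its distribution is extracted.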

In [None]:
def bootstrapping(X_train, bootstrapping_samples, pca=None, sample_length=None):
    """Extract the distributions of randomly positioned training windows.

    Parameters
    ----------
    X_train : pandas.DataFrame or array
        Training samples; windows are taken as contiguous row slices.
    bootstrapping_samples : int
        Number of random windows to draw.
    pca : optional
        Fitted transformer; when given, `pca.transform` is applied to each
        window before its distribution is extracted.
    sample_length : int, optional
        Number of rows per window. Defaults to the notebook-level
        `length_of_test`, so a window has the size of one test batch.

    Returns
    -------
    list
        One entry per window: the y-values of the first line drawn by
        `sns.distplot` for that window.
    """
    if sample_length is None:
        sample_length = length_of_test

    # Derive the last valid window start from the data actually passed in,
    # rather than from the notebook-level `length_of_train`, so every window
    # lies fully inside `X_train`.
    last_start = max(len(X_train) - sample_length, 0)

    distributions_bootstrapping = []
    for _ in tqdm(range(0, bootstrapping_samples)):
        # random start of a window of `sample_length` consecutive rows
        rand = random.randint(0, last_start)
        window = X_train[rand:rand + sample_length]
        bootstrapping_input = pca.transform(window) if pca is not None else window
        # extract the distribution of the window from the plotted curve
        dist = sns.distplot(bootstrapping_input).get_lines()[0].get_data()[1]
        distributions_bootstrapping.append(dist)
        # close the figure created by distplot so figures do not accumulate
        plt.close()
    return distributions_bootstrapping

In [None]:
def kdqtrees_bootstrapping(distrib_X_train, distrib_X_test, distributions_bootstrapping, distance):
    """Flag a drift when the test distribution is farther from the training
    distribution than any bootstrapped training subset.

    Parameters
    ----------
    distrib_X_train : 1-D sequence
        Distribution of the training set.
    distrib_X_test : 1-D sequence
        Distribution of the test batch.
    distributions_bootstrapping : sequence of 1-D sequences
        Distributions of the random training subsets; must not be empty.
    distance : str
        One of 'kl_divergence', 'manhattan', 'chebyshev', 'kulsinski',
        'cosine', 'squared_euclidean', 'bhattacharyya'.

    Returns
    -------
    int
        1 if a drift is detected (a message is printed as well), 0 otherwise.

    Raises
    ------
    ValueError
        If `distance` is not one of the supported names.
    """
    # `row_inputs` marks the metrics that expect each distribution wrapped as
    # a one-row 2-D input. The SciPy metrics take plain 1-D vectors: recent
    # SciPy releases reject the one-row form instead of squeezing it.
    if distance == 'kl_divergence':
        similarity_metric, row_inputs = kl_divergence, True
    elif distance == 'manhattan':
        similarity_metric, row_inputs = manhattan_distances, True
    elif distance == 'chebyshev':
        similarity_metric, row_inputs = chebyshev, False
    elif distance == 'kulsinski':
        similarity_metric, row_inputs = kulsinski, False
    elif distance == 'cosine':
        similarity_metric, row_inputs = cosine, False
    elif distance == 'squared_euclidean':
        similarity_metric, row_inputs = sqeuclidean, False
    elif distance == 'bhattacharyya':
        similarity_metric, row_inputs = bhattacharyya, True
    else:
        raise ValueError(f"Unknown distance: {distance!r}")

    def measure(reference, candidate):
        """Distance between two distributions as a plain float."""
        if row_inputs:
            value = similarity_metric([reference], [candidate])
        else:
            value = similarity_metric(reference, candidate)
        # manhattan_distances returns a (1, 1) array; collapse it to a scalar.
        return float(np.squeeze(value))

    # Critical region: the largest distance between the training set and any
    # of its bootstrapped subsets.
    critical_region = np.max([measure(distrib_X_train, subset)
                              for subset in distributions_bootstrapping])

    # Drift if the test batch is farther away than every training subset.
    if measure(distrib_X_train, distrib_X_test) > critical_region:
        print(distance + " : Drift Detected")
        return 1
    return 0

In [None]:
def ede_drift_detector(distrib_X_train, distrib_X_test, statistical_test, alpha=0.05):
    """Detect a drift with a two-sample statistical test.

    Parameters
    ----------
    distrib_X_train, distrib_X_test : 1-D sequences
        The two samples handed to the statistical test.
    statistical_test : str
        'ks' for `scipy.stats.kstest`, 'mw' for `scipy.stats.mannwhitneyu`.
    alpha : float, optional
        Significance level; a drift is reported when the p-value is
        less than or equal to it. Defaults to 0.05.

    Returns
    -------
    int
        1 if the null hypothesis is rejected (a message is printed as well),
        0 otherwise.

    Raises
    ------
    ValueError
        If `statistical_test` is neither 'ks' nor 'mw'.
    """
    if statistical_test == 'ks':
        stat_test = stats.kstest
    elif statistical_test == 'mw':
        stat_test = stats.mannwhitneyu
    else:
        raise ValueError(f"Unknown statistical test: {statistical_test!r}")

    _, p = stat_test(distrib_X_train, distrib_X_test)

    if p <= alpha:
        # Name the test that actually ran; the message used to say "KS" even
        # for the Mann-Whitney test.
        print(f"Reject Null Hypothesis according to {statistical_test.upper()} statistical test")
        return 1
    return 0

In [None]:
def extract_times_detection(array):
    """Convert the detections in `array` to a list of ints.

    Returns the string 'nothing_detected' when `array` is empty.
    """
    if len(array) == 0:
        return 'nothing_detected'
    return [int(array[position]) for position in range(len(array))]

In [None]:
def compute_metric_false_positive(array_batches, drift_start=2):
    """False-positive rate of a detector over the batches before the drift.

    Parameters
    ----------
    array_batches : sequence of int
        Indices of the batches in which a drift was reported.
    drift_start : int, optional
        Index of the first batch that really contains the drift; detections
        in earlier batches count as false positives. Defaults to 2.

    Returns
    -------
    float, int or str
        'nothing_detected' when `array_batches` is empty, 0 when no detection
        precedes `drift_start`, otherwise the number of early detections
        divided by `drift_start`.
    """
    if len(array_batches) == 0:
        return 'nothing_detected'

    early_detections = sum(1 for batch in array_batches if batch < drift_start)
    if early_detections == 0:
        # Kept as the integer 0 so stored results look the same as before.
        return 0
    return early_detections / drift_start

In [None]:
def compute_metric_latency(array_batches, drift_start=2, no_batches_with_drift=5):
    """Normalized delay between the start of the drift and its first detection.

    Parameters
    ----------
    array_batches : sequence of int
        Indices of the batches in which a drift was reported.
    drift_start : int, optional
        Index of the first batch that really contains the drift. Defaults to 2.
    no_batches_with_drift : int, optional
        Number of batches containing the drift, used to normalize the delay.
        Defaults to 5.

    Returns
    -------
    float or str
        'nothing_detected' when no detection occurs at or after `drift_start`
        (including an empty `array_batches`); otherwise
        `(first such batch - drift_start) / no_batches_with_drift`.
    """
    # First reported batch (in the given order) at or after the drift start.
    first_detection = next(
        (batch for batch in array_batches if batch >= drift_start), None)
    if first_detection is None:
        return 'nothing_detected'
    return (first_detection - drift_start) / no_batches_with_drift

In [None]:
# --- Experiment grid ---------------------------------------------------------
datasets = ['sea', 'agraw1', 'agraw2']
drift_types = ['abrupt', 'gradual']
random_seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
driftwidth_values = ['05', '1', '5', '10', '20']
noise_values = [0, 10, 20]
balance_types = ['imbalanced', 'balanced']

# Folder containing the .arff streams, relative to the notebook.
path = '../../../../Documents/phd_related/data_sets_concept_drift/moa_datasets/'

result_columns = ['dataset', 'drift_type', 'drift_width', 'detector', 'distance/test',
                  'drift_batches', 'noise_value', 'balance_type', 'false_positive_rate',
                  'latency', 'random_seed']

# Statistical tests run by the EDE detector (also used as result labels).
ede_tests = ['ks', 'mw']

# (name understood by kdqtrees_bootstrapping, label written to the results).
# The labels are kept exactly as they were so stored results stay comparable.
kdqtrees_distances = [('kl_divergence', 'kl_div'),
                      ('manhattan', 'manhattan'),
                      ('chebyshev', 'chebysev'),
                      ('kulsinski', 'kulsinski'),
                      ('cosine', 'cosine'),
                      ('squared_euclidean', 'sq_euclid'),
                      ('bhattacharyya', 'batthacrya')]
kdqtrees_labels = [label for _, label in kdqtrees_distances]


def _density_curve(samples):
    """Return the y-values of the first line drawn by sns.distplot for `samples`."""
    curve = sns.distplot(samples).get_lines()[0].get_data()[1]
    # close the figure created by distplot so figures do not accumulate
    plt.close()
    return curve


def _load_stream(dataset_path, dataset_name):
    """Read one .arff stream and return it as a frame with a 0/1 'class' column."""
    data = arff.loadarff(dataset_path)
    frame = pd.DataFrame(data[0])

    # Map the first two class values (in order of appearance) to 0 and 1.
    # Only the 'class' column is touched, so a feature cell that happens to
    # equal a class label is never rewritten.
    class_values = frame['class'].unique()
    frame['class'] = frame['class'].replace({class_values[0]: 0, class_values[1]: 1})

    # agraw streams: one-hot encode the nominal attributes and scale to [0, 1]
    if dataset_name.startswith('agraw'):
        frame = pd.get_dummies(frame, columns=['elevel', 'car', 'zipcode'])
        labels = frame['class']
        # NOTE(review): the scaler is fitted on the whole stream (training
        # window and test batches alike) - confirm this is intended.
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(frame.drop(['class'], axis=1))
        frame = pd.DataFrame(scaled)
        frame['class'] = labels
    return frame


def _ede_detections(train_dist, test_dists):
    """Return, per entry of `ede_tests`, the batch indices flagged by the EDE detector."""
    detections = {name: [] for name in ede_tests}
    for batch in range(0, len(test_dists)):
        for name in ede_tests:
            if ede_drift_detector(train_dist, test_dists[batch], name) == 1:
                detections[name].append(batch)
    return [detections[name] for name in ede_tests]


def _kdqtrees_detections(train_dist, test_dists, bootstrap_dists):
    """Return, per entry of `kdqtrees_distances`, the batch indices flagged as drift."""
    detections = {key: [] for key, _ in kdqtrees_distances}
    for batch in tqdm(range(0, len(test_dists))):
        for key, _ in kdqtrees_distances:
            if kdqtrees_bootstrapping(train_dist, test_dists[batch], bootstrap_dists, key) == 1:
                detections[key].append(batch)
    return [detections[key] for key, _ in kdqtrees_distances]


def _results_frame(detector, labels, drifts, dataset_name, drift_type,
                   drift_width_column, noise_value, balance_type):
    """Build the result rows of one detector: one row per distance/test label."""
    return pd.DataFrame({'dataset': str(dataset_name),
                         'drift_type': str(drift_type),
                         'drift_width': str(drift_width_column),
                         'detector': detector,
                         'distance/test': labels,
                         'drift_batches': drifts,
                         'noise_value': noise_value,
                         'balance_type': balance_type})


# --- Run every configuration -------------------------------------------------
# Per-configuration result frames are collected in a list and concatenated
# once; growing a DataFrame with pd.concat inside the loop copies all previous
# rows on every iteration.
run_results = []

try:
    for data_name, noise_value, balance_type, random_seed, drift_type in itertools.product(
            datasets, noise_values, balance_types, random_seeds, drift_types):

        dataset_name = data_name

        # abrupt streams have no width variants, so they are processed once
        widths = driftwidth_values[:1] if drift_type == 'abrupt' else driftwidth_values

        for driftwidth_value in widths:

            drift_width = '' if drift_type != 'gradual' else f'_{driftwidth_value}'
            drift_width_column = drift_width if drift_type == 'abrupt' else driftwidth_value

            dataset_path = path + f'{dataset_name}_{random_seed}_{drift_type}_drift_{noise_value}_noise_{balance_type}{drift_width}.arff'

            print(dataset_path)
            # timestamp, to see how long each configuration takes
            now = datetime.datetime.now()
            print(f'hour {now.hour} minute {now.minute} second {now.second}')

            # read data; `df` is deliberately a notebook-level name so that
            # other cells and functions that read `df` keep working
            df = _load_stream(dataset_path, dataset_name)

            # training window
            X_train, y_train = create_train(df, length_of_train)

            # PCA fitted on the training window only
            pca = PCA(n_components = 0.999)
            pca.fit(X_train)
            X_train_pca = pca.transform(X_train)

            # distributions of the training window (raw and PCA space)
            distribution_train = _density_curve(X_train)
            distribution_train_pca = _density_curve(X_train_pca)

            # bootstrapping; seed the window sampling so that a re-run draws
            # the same training subsets
            random.seed(random_seed)
            distributions_bootstrapping = bootstrapping(X_train, bootstrapping_samples=50)
            distributions_bootstrapping_pca = bootstrapping(X_train, bootstrapping_samples=50, pca=pca)

            # distributions of every complete test batch after the training window
            distributions_test = []
            distributions_test_pca = []
            n_test_batches = int((len(df)-length_of_train)/length_of_test)
            for batch in tqdm(range(0, n_test_batches)):
                start_from_sample = length_of_train + batch*length_of_test
                X_test, y_test = create_test(df, start_from_sample, length_of_test)
                distributions_test.append(_density_curve(X_test))
                distributions_test_pca.append(_density_curve(pca.transform(X_test)))

            run_info = (dataset_name, drift_type, drift_width_column, noise_value, balance_type)

            # EDE
            df_ede_bootstrapping = _results_frame(
                'ede', ede_tests,
                _ede_detections(distribution_train, distributions_test),
                *run_info)

            # kdqTrees
            df_kdqtrees_bootstrapping = _results_frame(
                'kdqTrees', kdqtrees_labels,
                _kdqtrees_detections(distribution_train, distributions_test,
                                     distributions_bootstrapping),
                *run_info)

            # PCA-kdqTrees
            df_pca_bootstrapping = _results_frame(
                'pca', kdqtrees_labels,
                _kdqtrees_detections(distribution_train_pca, distributions_test_pca,
                                     distributions_bootstrapping_pca),
                *run_info)

            merged_results = pd.concat([df_ede_bootstrapping, df_kdqtrees_bootstrapping, df_pca_bootstrapping])
            merged_results = merged_results.reset_index(drop=True)

            # Evaluation metrics
            merged_results['false_positive_rate'] = [
                compute_metric_false_positive(batches) for batches in merged_results.drift_batches]
            merged_results['latency'] = [
                compute_metric_latency(batches) for batches in merged_results.drift_batches]
            merged_results['random_seed'] = random_seed

            run_results.append(merged_results)
finally:
    # Built in `finally` so the configurations completed before an error or
    # interruption are still available in `final_results_df`.
    if run_results:
        final_results_df = pd.concat(run_results)
    else:
        final_results_df = pd.DataFrame(columns=result_columns)


In [None]:
# Persist the collected results (index included) next to the notebook.
results_csv_path = './DDB_drift_detection_synthetic_data.csv'
final_results_df.to_csv(results_csv_path)