Observations on one-hot encoded data and drift detector performance:

- Experiment 1: One hot encoding of categorical data & no scaling - 
- Experiment 2: One hot encoding of categorical data & MinMaxScaling - 
- Experiment 3: One hot encoding of categorical data & StandardScaler - 



In [None]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from skmultiflow.trees import HoeffdingTreeClassifier
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
import matplotlib.pyplot as plt
from skmultiflow.drift_detection import DDM
from skmultiflow.drift_detection import EDDM
from skmultiflow.drift_detection import ADWIN
from skmultiflow.drift_detection import HDDM_A
from skmultiflow.drift_detection import HDDM_W
from imblearn.over_sampling import SMOTE
from collections import Counter
import itertools

import datetime
import warnings
from IPython.display import clear_output

In [None]:
# Let pandas display up to 500 rows before truncating a printed DataFrame.
pd.set_option('display.max_rows', 500)

In [None]:
# Number of leading samples of each stream used as the training set
# (passed to create_train and used as the offset of the first test batch).
length_of_train = 30000
# Number of samples in each consecutive test batch that follows the training set.
length_of_test = 10000

# Functions

In [None]:
def create_train(dataset, length_of_train):
    """Split the leading rows of a stream into training features and labels.

    Args:
        dataset: DataFrame holding the feature columns plus a 'class' column.
        length_of_train: number of leading rows to use for training.

    Returns:
        Tuple (X_train, y_train): the first `length_of_train` rows without the
        'class' column, and the 'class' values of those same rows.
    """
    # Use the frame that was passed in rather than a module-level `df`, so the
    # function works on any dataset and does not depend on notebook state.
    train_rows = dataset.iloc[:length_of_train]
    X_train = train_rows.drop(['class'], axis=1)
    y_train = train_rows['class']
    return X_train, y_train

In [None]:
def create_test(dataset, start_from_sample, length_of_test):
    """Cut one test batch out of a stream and split it into features and labels.

    Args:
        dataset: DataFrame holding the feature columns plus a 'class' column.
        start_from_sample: positional index of the first row of the batch.
        length_of_test: number of rows in the batch.

    Returns:
        Tuple (X_test, y_test): the rows
        [start_from_sample, start_from_sample + length_of_test) without the
        'class' column, and the 'class' values of those same rows.
    """
    # Use the frame that was passed in rather than a module-level `df`, so the
    # function works on any dataset and does not depend on notebook state.
    batch_rows = dataset.iloc[start_from_sample:start_from_sample + length_of_test]
    X_test = batch_rows.drop(['class'], axis=1)
    y_test = batch_rows['class']
    return X_test, y_test

In [None]:
# Detectors other than ADWIN are fed the error indicator:
# 1 for an incorrect prediction and 0 for a correct prediction
def drift_detect_function(drift_detector, y_pred):
    """Stream per-sample prediction errors into a detector and record flagged batches.

    Reads the module-level `df`, `length_of_train` and `length_of_test` to
    locate the true labels of every test batch.

    Args:
        drift_detector: object exposing `add_element(value)` and
            `detected_change()`.
        y_pred: per-batch predictions; `y_pred[i][j]` is the prediction for
            sample `j` of test batch `i`.

    Returns:
        List of batch indices, one entry per detection, so a batch index can
        appear several times.
    """
    batches_with_drift = []
    class_column = df['class']
    number_of_batches = int((len(df) - length_of_train) / length_of_test)
    # go through every batch
    for i in tqdm(range(number_of_batches)):
        # Score batch i against its OWN ground truth. Relying on a global
        # `y_test` would compare every batch with the labels of whichever batch
        # happened to be created last. The list is built once per batch instead
        # of once per sample.
        batch_start = length_of_train + i * length_of_test
        true_labels = list(class_column.iloc[batch_start:batch_start + length_of_test])
        # go through every label of one batch
        for j, true_label in enumerate(true_labels):
            # assumes labels and predictions are numeric 0/1, so the absolute
            # difference is the error indicator
            label_difference = abs(true_label - y_pred[i][j])
            drift_detector.add_element(label_difference)
            if drift_detector.detected_change():
                batches_with_drift.append(i)
    return batches_with_drift

In [None]:
# ADWIN is fed the correctness indicator:
# 1 for a correct prediction and 0 for an incorrect prediction
def drift_detect_function_ADWIN(drift_detector, y_pred):
    """Stream per-sample prediction correctness into a detector and record flagged batches.

    Reads the module-level `df`, `length_of_train` and `length_of_test` to
    locate the true labels of every test batch.

    Args:
        drift_detector: object exposing `add_element(value)` and
            `detected_change()`.
        y_pred: per-batch predictions; `y_pred[i][j]` is the prediction for
            sample `j` of test batch `i`.

    Returns:
        List of batch indices, one entry per detection, so a batch index can
        appear several times.
    """
    batches_with_drift = []
    class_column = df['class']
    number_of_batches = int((len(df) - length_of_train) / length_of_test)
    # go through every batch
    for i in tqdm(range(number_of_batches)):
        # Score batch i against its OWN ground truth. Relying on a global
        # `y_test` would compare every batch with the labels of whichever batch
        # happened to be created last. The list is built once per batch instead
        # of once per sample.
        batch_start = length_of_train + i * length_of_test
        true_labels = list(class_column.iloc[batch_start:batch_start + length_of_test])
        # go through every label of one batch
        for j, true_label in enumerate(true_labels):
            # assumes labels and predictions are numeric 0/1
            label_difference = abs(true_label - y_pred[i][j])
            # invert the error indicator: 1 means the prediction was correct
            label_difference_final = 1 - label_difference
            drift_detector.add_element(label_difference_final)
            if drift_detector.detected_change():
                batches_with_drift.append(i)
    return batches_with_drift

In [None]:
def extract_times_detection(array):
    """Convert a sequence of detection positions to a list of ints.

    Args:
        array: sized, indexable sequence of detection positions.

    Returns:
        The string 'nothing_detected' when `array` is empty, otherwise a list
        with every element cast to `int`.
    """
    # Explicit length check (not truthiness) so numpy arrays are accepted too.
    if len(array) == 0:
        return 'nothing_detected'
    return [int(array[position]) for position in range(len(array))]

In [None]:
def compute_metric_false_positive(array_batches, *, drift_start=2):
    """Return the false-positive rate of a detector run.

    A detection counts as a false positive when its batch index lies before
    `drift_start`. The count is divided by `drift_start`.

    Args:
        array_batches: batch indices at which the detector signalled drift.
        drift_start: index of the first batch treated as containing drift.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        'nothing_detected' when `array_batches` is empty; the int 0 when no
        detection precedes `drift_start`; otherwise the number of early
        detections divided by `drift_start`.
    """
    if len(array_batches) == 0:
        return 'nothing_detected'

    # Filter once; repeated indices are counted individually.
    early_detections = sum(1 for batch in array_batches if batch < drift_start)
    if early_detections == 0:
        return 0
    return early_detections / drift_start

In [None]:
def compute_metric_latency(array_batches, *, drift_start=2, no_batches_with_drift=5):
    """Return how late the first detection at or after the drift start occurred.

    Args:
        array_batches: batch indices at which the detector signalled drift, in
            the order they were recorded.
        drift_start: index of the first batch treated as containing drift.
            Defaults to 2, the value that used to be hard-coded.
        no_batches_with_drift: divisor used to scale the delay. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        'nothing_detected' when `array_batches` is empty or holds no index
        >= `drift_start`; otherwise
        (first such index - drift_start) / no_batches_with_drift.
    """
    if len(array_batches) == 0:
        return 'nothing_detected'

    # First recorded detection that is not before the drift start.
    first_valid_detection = next(
        (batch for batch in array_batches if batch >= drift_start), None
    )
    if first_valid_detection is None:
        return 'nothing_detected'
    return (first_valid_detection - drift_start) / no_batches_with_drift

In [None]:
# Suppress all Python warnings from here on.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below - confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Experiment grid: every combination below identifies one pre-generated .arff stream.
datasets = ['sea', 'agraw1', 'agraw2']
drift_types = ['abrupt', 'gradual']
random_seeds = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
driftwidth_values = ['05', '1', '5', '10', '20']
noise_values = [0, 10, 20]
balance_types = ['imbalanced', 'balanced']

# path for local
path = '../../../../Documents/phd_related/data_sets_concept_drift/moa_datasets/'

# (label, detector class, driver function) in the order the result rows are emitted.
# ADWIN has its own driver because it is fed correctness instead of error.
detector_specs = [
    ('DDM', DDM, drift_detect_function),
    ('EDDM', EDDM, drift_detect_function),
    ('ADWIN', ADWIN, drift_detect_function_ADWIN),
    ('HDDM_W', HDDM_W, drift_detect_function),
    ('HDDM_A', HDDM_A, drift_detect_function),
]

# Classifiers that are trained and queried with plain numpy arrays instead of DataFrames.
array_input_classifiers = {'HoeffdingTree'}

# The schema lists exactly the columns produced per configuration. It used to
# declare a never-filled 'distance/test' column and omit 'classifier', which
# left an all-NaN column in the output and pushed 'classifier' to the end.
result_columns = ['dataset', 'drift_type', 'drift_width', 'detector', 'classifier',
                  'drift_batches', 'noise_value', 'balance_type', 'false_positive_rate',
                  'latency', 'random_seed']
final_results_df = pd.DataFrame(columns=result_columns)

for data_name, noise_value, balance_type, random_seed in itertools.product(
        datasets, noise_values, balance_types, random_seeds):
    for drift_type in drift_types:
        # Abrupt streams have no width variants, so they are processed once.
        widths = driftwidth_values[:1] if drift_type == 'abrupt' else driftwidth_values

        for driftwidth_value in widths:
            drift_width = '' if drift_type != 'gradual' else f'_{driftwidth_value}'
            drift_width_column = drift_width if drift_type == 'abrupt' else driftwidth_value

            dataset_path = path + f'{data_name}_{random_seed}_{drift_type}_drift_{noise_value}_noise_{balance_type}{drift_width}.arff'

            # read data
            # Keep the names `df` and `y_test` (assigned further down): they are
            # module-level on purpose because helper functions in this file look
            # up global state by name.
            data = arff.loadarff(dataset_path)
            df = pd.DataFrame(data[0])
            # Map the two class values to 0 and 1, in order of first appearance.
            df = df.replace(df['class'].unique()[0], 0)
            df = df.replace(df['class'].unique()[1], 1)

            # one hot encoding if needed
            if data_name.startswith('agraw'):
                df = pd.get_dummies(df, columns=['elevel', 'car', 'zipcode'])

            # split train test
            X_train, y_train = create_train(df, length_of_train)

            # in case of class imbalance, apply smote to the training data to balance the classes
            # NOTE(review): SMOTE is not seeded here, so runs on imbalanced data are
            # presumably not exactly reproducible - confirm whether that matters.
            if balance_type == 'imbalanced':
                X_train, y_train = SMOTE().fit_resample(X_train, y_train)

            # Train models (dict order = training order = order of the result rows)
            models = {
                'NaiveBayes': GaussianNB().fit(X_train, y_train),
                'HoeffdingTree': HoeffdingTreeClassifier().fit(np.array(X_train), np.array(y_train)),
                'Adaboost': AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, y_train),
                'XGBoost': XGBClassifier().fit(X_train, y_train),
                'LightGBM': LGBMClassifier().fit(X_train, y_train),
            }

            # Test on testing batches: one prediction array per batch and classifier
            y_pred_dict = {classifier_name: [] for classifier_name in models}
            number_of_batches = int((len(df) - length_of_train) / length_of_test)

            for batch_index in tqdm(range(number_of_batches)):
                batch_start = length_of_train + batch_index * length_of_test
                X_test, y_test = create_test(df, batch_start, length_of_test)
                X_test_array = np.array(X_test)
                for classifier_name, model in models.items():
                    batch_input = X_test_array if classifier_name in array_input_classifiers else X_test
                    y_pred_dict[classifier_name].append(model.predict(batch_input))

            # Drift Detectors: one result row per (classifier, detector) pair
            classifiers = []
            detectors = []
            drift_batches = []

            for classifier_name, predictions in tqdm(y_pred_dict.items()):
                print(classifier_name)
                for detector_name, detector_class, run_detector in detector_specs:
                    # A fresh detector per run so no state leaks between classifiers.
                    flagged_batches = run_detector(detector_class(), predictions)
                    classifiers.append(classifier_name)
                    detectors.append(detector_name)
                    # Drop repeated batch indices while keeping first-seen order.
                    drift_batches.append(list(dict.fromkeys(flagged_batches)))

            # Evaluation metrics are computed per row from its list of flagged batches.
            df_error_based = pd.DataFrame({
                'dataset': data_name,
                'drift_type': drift_type,
                'drift_width': drift_width_column,
                'detector': detectors,
                'classifier': classifiers,
                'drift_batches': drift_batches,
                'noise_value': noise_value,
                'balance_type': balance_type,
                'false_positive_rate': [compute_metric_false_positive(batches) for batches in drift_batches],
                'latency': [compute_metric_latency(batches) for batches in drift_batches],
                'random_seed': random_seed,
            })

            # Accumulate after every configuration so partial results survive an
            # interrupted run.
            final_results_df = pd.concat([final_results_df, df_error_based])

            clear_output(wait=True)

In [None]:
# Persist the accumulated results next to the notebook. With the to_csv
# defaults the DataFrame index is written as the first CSV column.
final_results_df.to_csv('./ERB_drift_detection_synthetic_data.csv')