In [18]:
%matplotlib notebook 
import pandas as pd
import numpy as np
import pickle
import itertools
from datetime import datetime
from tqdm import tqdm
import math 

In [19]:
from importnb import Notebook
# Load the sibling notebook 'Backend.ipynb' through importnb so that the code it
# defines can be used here. Note that, despite its name, ShapDetectorClass holds
# whatever Notebook.load returns; the detector class itself is its attribute below.
ShapDetectorClass = Notebook.load('Backend.ipynb')
# Short alias for the detector class used throughout the rest of this notebook.
ShapDetector = ShapDetectorClass.ShapDetector

In [20]:
# File holding the complete data set (its length drives the batch sizes below).
complete_data_path = "../Data_prep/own_synthetic/mixed_train_test.csv"
# File holding the data that is actually processed; this can be the complete
# data set (as here) or only the validation set.
processed_data_path = "../Data_prep/own_synthetic/mixed_train_test.csv"

data_complete = pd.read_csv(complete_data_path)
data = pd.read_csv(processed_data_path)

# Random sample from the training data --> only used in real-world scenarios,
# so it stays an empty placeholder for the synthetic data.
initial_batch_sample = ''

In [21]:
# Independent (deep) copy of the loaded data. `data['label']` is overwritten with
# sparse labels further down, while `data_full` keeps the labels as loaded.
data_full = data.copy(deep=True)

In [22]:
# Summary statistics of the loaded data; as the last expression of the cell the
# resulting table is rendered as the cell output.
data.describe()

Unnamed: 0,x1,x2,x3,x4,label
count,20000.0,20000.0,20000.0,20000.0,20000.0
mean,0.5061,0.49285,0.33898,0.333165,0.58825
std,0.499975,0.499961,0.327367,0.325051,0.492163
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.1,0.1,0.0
50%,1.0,0.0,0.1,0.1,1.0
75%,1.0,1.0,0.6,0.6,1.0
max,1.0,1.0,1.0,1.0,1.0


Data-set-dependent parameters (computed dynamically from the loaded data)

In [25]:
# synthetic data
# Both sizes are derived from the length of the complete data set.
initial_batch_size_in = math.trunc(0.05*len(data_complete)) # 5% of complete data set (1000 of 20000 rows)
retrainsize = math.trunc(0.01*len(data_complete)) # 1% of complete data set (200 of 20000 rows)
# Positions of the true drifts; passed to the detector as `true_drift_points`.
true_drifts = [4000, 6500, 9000, 11500, 14000, 16500]

# The class count is read from `data_full`, because `data['label']` is replaced by
# sparse labels in a later cell and would then no longer show the real classes.
print('true drift points:', true_drifts, ', size initial batch:', initial_batch_size_in, ', retrainsize:', retrainsize, ', nr. of classes:', len(data_full['label'].unique()))
# NOTE: `data_full` is deliberately NOT re-created here. It is already copied from
# `data` right after loading; copying again in this cell would overwrite the full
# labels with sparse ones whenever the cell is re-run after the sparsification cell.

true drift points: [4000, 6500, 9000, 11500, 14000, 16500] , size initial batch: 1000 , retrainsize: 200 , nr. of classes: 2


In [15]:
# Replace the labels in `data` by sparsified ones (sparsity=100, i.e. no labels
# available); the unmodified labels remain accessible through `data_full`.
# NOTE(review): zip() over a single Series yields 1-tuples, so make_sparse receives
# `(label,)` rather than the bare label value - confirm this is what it expects.
sparse_labels = list(map(
    lambda label: ShapDetector.make_sparse(label, sparsity=100),
    zip(data_full['label'].copy()),
))
data['label'] = sparse_labels

#### prepare grids and start grid search

- 0 sparsity: all labels available
- 100 sparsity: no labels available
- perf: performance-based approach
- ph: Page-Hinkley test
- ad: ADWIN change detector
- kswin: KSWIN change detector

In [17]:
# Baseline value list for every search dimension. All grids below share these
# twelve keys in exactly this order; the grid-search loop expands each grid with
# itertools.product over the value lists, so the key order determines the order
# in which parameter combinations are run.
# NOTE: the key "thresold" (sic) is kept as spelled because the grid-search loop
# reads params['thresold'].
_GRID_DEFAULTS = {
    # detection
    "sparsity": [0],
    "base_detector": ['adwin'],
    "%-Labels": [0],
    "retrainsize": [retrainsize],

    # ph
    "alpha": [0],
    "delta": [0],
    "min_instances": [100],
    "thresold": [0],

    # adwin
    "ad_delta": [0],

    # kswin
    "ks_alpha": [0],

    # performance based
    "performance": [False],

    "sampling": ['-'],
}


def _make_grid(overrides):
    """Return a complete parameter grid: the defaults with `overrides` applied.

    Parameters
    ----------
    overrides : dict
        Maps a grid key (one of the keys of `_GRID_DEFAULTS`) to the list of
        values to search over for that key.

    Returns
    -------
    dict
        A new dict containing every key of `_GRID_DEFAULTS`, in that order, each
        mapped to a fresh list (grids never share list objects).

    Raises
    ------
    KeyError
        If `overrides` contains a key that is not a known grid parameter, so a
        misspelled key cannot be silently ignored.
    """
    unknown = set(overrides) - set(_GRID_DEFAULTS)
    if unknown:
        raise KeyError("unknown grid parameter(s): {0}".format(sorted(unknown)))
    return {key: list(overrides.get(key, default)) for key, default in _GRID_DEFAULTS.items()}


# --- 0 sparsity: all labels available ------------------------------------------

# Page-Hinkley test
grid_vals_0_sparsity_ph = _make_grid({
    "base_detector": ['ph'],
    "alpha": [0.99],
    "delta": [0.005],
    "thresold": [0.05],
})

# ADWIN change detector
grid_vals_0_sparsity_ad = _make_grid({
    "base_detector": ['adwin'],
    "ad_delta": [0.25],
})

# Page-Hinkley test, performance-based approach
grid_vals_0_sparsity_ph_perf = _make_grid({
    "base_detector": ['ph'],
    "alpha": [0.99],
    "delta": [0.005],
    "thresold": [0.05],
    "performance": [True],
})

# ADWIN change detector, performance-based approach
grid_vals_0_sparsity_ad_perf = _make_grid({
    "base_detector": ['adwin'],
    "ad_delta": [0.002],
    "performance": [True],
})

# --- 100 sparsity: no labels available ------------------------------------------

# Page-Hinkley test; searches over label percentages and two sampling strategies
grid_vals_100_sparsity_ph = _make_grid({
    "sparsity": [100],
    "base_detector": ['ph'],
    "%-Labels": [20,40,60,70,80],
    "alpha": [0.99],
    "delta": [0.005],
    "thresold": [3.9],
    "sampling": ['margin', 'entropy'],
})

# ADWIN change detector; searches over label percentages
grid_vals_100_sparsity_ad = _make_grid({
    "sparsity": [100],
    "base_detector": ['adwin'],
    "%-Labels": [20,40,60,80,90],
    "ad_delta": [1],
    "sampling": ['margin'],
})

# ADWIN change detector, performance-based approach
grid_vals_100_sparsity_ad_perf = _make_grid({
    "sparsity": [100],
    "base_detector": ['adwin'],
    "%-Labels": [20,40,60,80,90],
    "ad_delta": [0.002],
    "performance": [True],
    "sampling": ['margin'],
})

# --- baseline without retraining -------------------------------------------------

# NOTE(review): retrainsize 0 together with ad_delta 9999 is presumably meant to
# keep the model from ever being retrained - confirm against the backend.
grid_vals_no_retraining = _make_grid({
    "base_detector": ['adwin'],
    "retrainsize": [0],
    "ad_delta": [9999],
    "performance": [True],
})



# Grids to run in this session; put any of the grid_vals_* dictionaries in here.
grid_list = [grid_vals_0_sparsity_ad]

for grid in grid_list:
    parameters, values = zip(*grid.items())
    # Materialise the combinations first: itertools.product has no length, so
    # tqdm could otherwise show neither the total nor an ETA.
    value_vectors = list(itertools.product(*values))
    for value_vector in tqdm(value_vectors):
        params = dict(zip(parameters, value_vector))
        # Timestamp with millisecond resolution, used as the result file name.
        data_name = datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")[:-3]
        detector = ShapDetector(
            base_detector_type = params['base_detector'],
            base_detector_config = {"alpha" : params['alpha'],
                                    "delta" : params['delta'],
                                    "min_instances" : params['min_instances'],
                                    # the grid key is spelled "thresold" (sic)
                                    "threshold" : params['thresold'],
                                    "ad_delta" : params['ad_delta'],
                                    "ks_alpha" : params['ks_alpha']
            }
        )

        detector_res = detector.detect_drift(
            data_sparse = data,
            data_full = data_full,
            sparsity = params['sparsity'],
            initial_batch_size = initial_batch_size_in,
            initial_batch_sample = initial_batch_sample,
            samplesize = 1, # 1 corresponds to instance-wise processing. Batch-wise processing is not fully implemented and might cause errors
            retrainsize = params['retrainsize'],
            distance_measure = 'euclidean',
            clf = None,
            al_percentage = params['%-Labels'],
            uncertainty_threshold = 0.5,
            true_drift_points = true_drifts,
            err_based = params['performance'],

            multiclass = False,
            amount_classes = 2,
            approach = 2, #|1  #approach1 = Log-Odds for real world data / approach2 = proba
            real_world = False,
            sampling = params['sampling'],
        )
        # Persist the result of this parameter combination. The context manager
        # guarantees the file is flushed and closed even if pickling fails.
        with open("../Results/Detector_objs/{0}.pickle".format(data_name), "wb") as result_file:
            pickle.dump(detector_res, result_file)

0it [00:00, ?it/s]

start
it 0
it 500
it 1000
it 1500
it 2000
it 2500
it 3000
it 3500
it 4000
it 4500
it 5000
it 5500
it 6000
it 6500
it 7000
it 7500
it 8000
it 8500
it 9000
Drift, No. of iterations: 9247 Samples:  9247
it 9500
it 10000
it 10500
it 11000
it 11500
it 12000
it 12500
it 13000
it 13500
Drift, No. of iterations: 13855 Samples:  13855
it 14000
Drift, No. of iterations: 14303 Samples:  14303
it 14500
it 15000
it 15500
it 16000
it 16500
it 17000
it 17500
it 18000
it 18500
Drift, No. of iterations: 18911 Samples:  18911


1it [02:12, 132.23s/it]
0it [00:00, ?it/s]

start
it 0
it 500
it 1000
it 1500
it 2000
it 2500
it 3000
it 3500
it 4000
Drift, No. of iterations: 4063 Samples:  4063
it 4500
it 5000
it 5500
it 6000
it 6500
it 7000
it 7500
it 8000
it 8500
it 9000
it 9500
it 10000
it 10500
Drift, No. of iterations: 10655 Samples:  10655
it 11000
it 11500
it 12000
it 12500
it 13000
it 13500
it 14000
Drift, No. of iterations: 14143 Samples:  14143
it 14500
it 15000
it 15500
it 16000
it 16500
it 17000
it 17500
it 18000
it 18500


1it [01:18, 78.73s/it]


In [12]:
# Scratch note: the value lists below are kept as a bare string literal, so this
# cell only echoes the text and has no effect on the grid search.
# NOTE(review): presumably candidate parameter values from earlier searches -
# confirm, and consider moving them into a markdown cell.
'''[0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  , 1.05, 1.1 ,
       1.15, 1.2 , 1.25, 1.3 , 1.35, 1.4 , 1.45, 1.5 , 1.55, 1.6 , 1.65,
       1.7 , 1.75, 1.8 , 1.85, 1.9 , 1.95, 2.  , 2.05, 2.1 , 2.15, 2.2 ,
       2.25, 2.3 , 2.35, 2.4 , 2.45, 2.5 , 2.55, 2.6 , 2.65, 2.7 , 2.75,
       2.8 , 2.85, 2.9 , 2.95, 3.  , 3.05, 3.1 , 3.15, 3.2 , 3.25, 3.3 , 
       3.35, 3.4 , 3.45, 3.5 , 3.55, 3.6 , 3.65, 3.7 , 3.75, 3.8 , 3.85, 3.9 , 3.95, 
       0.025, 0.05 , 0.075, 0.1  , 0.125, 0.15 , 0.175, 0.2  ,
       0.225, 0.25 , 0.275, 0.3  , 0.325, 0.35 , 0.375, 0.4  , 0.425,
       0.45 , 0.475, 0.5  , 0.525, 0.55 , 0.575, 0.6  , 0.625, 0.65 ,
       0.675, 0.7  , 0.725, 0.75 , 0.775, 0.8  , 0.825, 0.85 , 0.875,
       0.9  , 0.925, 0.95 , 0.975, 1.   , 1.025, 1.05 , 1.075, 1.1  ,
       1.125, 1.15 , 1.175, 1.2  , 1.225, 1.25 , 1.275, 1.3  , 1.325,
       1.35 , 1.375, 1.4  , 1.425, 1.45 , 1.475, 1.5  , 1.525, 1.55 ,
       1.575, 1.6  , 1.625, 1.65 , 1.675, 1.7  , 1.725, 1.75 , 1.775,
       1.8  , 1.825, 1.85 , 1.875, 1.9  , 1.925, 1.95 , 1.975, 2.   ,
       2.025, 2.05 , 2.075, 2.1  , 2.125, 2.15 , 2.175, 2.2  , 2.225,
       2.25 , 2.275, 2.3  , 2.325, 2.35 , 2.375, 2.4  , 2.425, 2.45 ,
       2.475, 2.5  , 2.525, 2.55 , 2.575, 2.6  , 2.625, 2.65 , 2.675,
       2.7  , 2.725, 2.75 , 2.775, 2.8  , 2.825, 2.85 , 2.875, 2.9  ,
       2.925, 2.95 , 2.975, 3.   , 3.025, 3.05 , 3.075, 3.1  , 3.125,
       3.15 , 3.175, 3.2  , 3.225, 3.25 , 3.275, 3.3  , 3.325, 3.35 ,
       3.375, 3.4  , 3.425, 3.45 , 3.475, 3.5  , 3.525, 3.55 , 3.575,
       3.6  , 3.625, 3.65 , 3.675, 3.7  , 3.725, 3.75 , 3.775, 3.8  ,
       3.825, 3.85 , 3.875, 3.9  , 3.925, 3.95 , 3.975]
       
       
       
 [0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
       1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. ,
       3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2,
       5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0]

       
'''

'[0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,\n       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  , 1.05, 1.1 ,\n       1.15, 1.2 , 1.25, 1.3 , 1.35, 1.4 , 1.45, 1.5 , 1.55, 1.6 , 1.65,\n       1.7 , 1.75, 1.8 , 1.85, 1.9 , 1.95, 2.  , 2.05, 2.1 , 2.15, 2.2 ,\n       2.25, 2.3 , 2.35, 2.4 , 2.45, 2.5 , 2.55, 2.6 , 2.65, 2.7 , 2.75,\n       2.8 , 2.85, 2.9 , 2.95, 3.  , 3.05, 3.1 , 3.15, 3.2 , 3.25, 3.3 , \n       3.35, 3.4 , 3.45, 3.5 , 3.55, 3.6 , 3.65, 3.7 , 3.75, 3.8 , 3.85, 3.9 , 3.95, \n       0.025, 0.05 , 0.075, 0.1  , 0.125, 0.15 , 0.175, 0.2  ,\n       0.225, 0.25 , 0.275, 0.3  , 0.325, 0.35 , 0.375, 0.4  , 0.425,\n       0.45 , 0.475, 0.5  , 0.525, 0.55 , 0.575, 0.6  , 0.625, 0.65 ,\n       0.675, 0.7  , 0.725, 0.75 , 0.775, 0.8  , 0.825, 0.85 , 0.875,\n       0.9  , 0.925, 0.95 , 0.975, 1.   , 1.025, 1.05 , 1.075, 1.1  ,\n       1.125, 1.15 , 1.175, 1.2  , 1.225, 1.25 , 1.275, 1.3  , 1.325,\n       1.35 , 1.375, 1.4  , 1.425, 1.45 , 1.475, 1.5  