In [9]:
%matplotlib notebook
import pandas as pd
import pickle
import itertools
from datetime import datetime
from tqdm import tqdm
import math 

In [10]:
from importnb import Notebook
# Load the backend notebook through importnb and pick the ShapDetector attribute
# off the loaded object.
# NOTE(review): despite its name, ShapDetectorClass holds whatever Notebook.load
# returns (the loaded notebook), not the class itself; the class is ShapDetector.
# NOTE(review): the relative file name presumably requires the kernel's working
# directory to be the folder containing Backend_ROC.ipynb - confirm.
ShapDetectorClass = Notebook.load('Backend_ROC.ipynb')
ShapDetector = ShapDetectorClass.ShapDetector

#### import data

In [11]:
# Input files (relative paths, resolved against the kernel's working directory).
# complete data set
complete_csv = "../Data_prep/insects_inc_abrupt_train_test_red.csv"
# data to be processed (can be the complete data set or the validation set)
process_csv = "../Data_prep/insects_inc_abrupt_train_test_red.csv"
# random sample from training data
sample_csv = "../Data_prep/insects_inc_abrupt_10_sample_red.csv"

# Read the three frames in the order listed above.
data_complete, data, initial_batch_sample = (
    pd.read_csv(csv_path) for csv_path in (complete_csv, process_csv, sample_csv)
)

In [12]:
# Independent copy of `data`: a later cell overwrites data['label'], while
# data_full is what gets passed to detect_drift as `data_full`.
data_full = data.copy()

In [13]:
# Display per-column summary statistics (count, mean, std, quartiles) of `data`.
data.describe()

Unnamed: 0,Att1,Att2,Att3,Att4,Att6,Att7,Att8,Att15,Att16,Att17,...,Att20,Att21,Att22,Att25,Att26,Att27,Att28,Att30,Att32,label
count,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,...,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0,79986.0
mean,0.279467,0.248287,0.187459,0.221078,0.309888,0.313936,0.048897,0.044275,0.053897,0.04876,...,0.054228,0.040696,0.034688,0.0461,0.03293,0.035914,0.044495,0.046209,0.038701,2.5
std,0.129473,0.130165,0.104216,0.122271,0.178702,0.113951,0.0519,0.059173,0.065707,0.061784,...,0.069382,0.055716,0.04944,0.067432,0.047196,0.0505,0.063509,0.070968,0.061966,1.707836
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.181397,0.141399,0.148862,0.170729,0.23396,0.232507,0.01393,0.008962,0.011358,0.009762,...,0.01285,0.009079,0.008028,0.009764,0.006439,0.007165,0.008902,0.010307,0.00967,1.0
50%,0.266227,0.235301,0.198291,0.233794,0.335274,0.288813,0.03239,0.02314,0.029263,0.025408,...,0.028001,0.019491,0.016577,0.019807,0.013495,0.014932,0.018254,0.018496,0.016119,2.5
75%,0.344016,0.338492,0.252668,0.29453,0.415892,0.370799,0.065146,0.054064,0.071007,0.063373,...,0.066482,0.048489,0.038925,0.049517,0.037112,0.041653,0.050271,0.045663,0.03587,4.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0


#### prepare processing

In [6]:
# real world data
# Positions of the known drift points; only passed on to detect_drift as
# `true_drift_points`. The commented-out list is the alternative for the
# "insects abrupt" data set.
#true_drifts = [10000, 11610, 15100, 16858, 25000, 30598, 35500, 36040, 36868, 44100, 49450] # insects abrupt 
true_drifts = [3850, 11500, 13800, 16500, 18200, 22568, 26000, 30100, 37200, 41150, 43500, 45600, 49364, 57100, 64500, 67900, 69800, 71200] # insects incremental abrupt

# Both sizes are fractions of the complete data set, truncated to whole instances.
initial_batch_size_in = math.trunc(0.05*len(data_complete))   # 5 % of the complete data
retrainsize = math.trunc(0.01*len(data_complete))              # 1 % of the complete data

# Count the classes on data_full (the untouched copy created right after loading):
# data['label'] is overwritten by the sparsification cell, so counting on `data`
# would give a wrong number whenever this cell is re-run afterwards.
print('true drift points:', true_drifts, ', size initial batch:', initial_batch_size_in, ', retrainsize:', retrainsize, ', nr. of classes:', len(data_full['label'].unique()))

# data_full is deliberately NOT re-created here. It is already copied from `data`
# right after loading; copying again in this cell would replace the full labels
# with the sparsified ones if the cell is executed after the sparsification cell.

true drift points: [3850, 11500, 13800, 16500, 18200, 22568, 26000, 30100, 37200, 41150, 43500, 45600, 49364, 57100, 64500, 67900, 69800, 71200] , size initial batch: 3999 , retrainsize: 799 , nr. of classes: 6


In [7]:
# Derive the label column of `data` from the untouched labels in data_full,
# calling make_sparse once per row with sparsity=100.
# NOTE(review): zip() over a single Series yields 1-tuples, so make_sparse
# receives (label,) rather than the bare label - confirm this is what it expects.
sparse_labels = list(
    map(
        lambda label: ShapDetector.make_sparse(label, sparsity=100),
        zip(data_full['label'].copy()),
    )
)
data['label'] = sparse_labels

#### prepare grids and start grid search

- 0 sparsity: all labels available
- 100 sparsity: no labels available
- perf: performance-based approach
- ph: Page-Hinkley test
- ad: ADWIN change detector
- kswin: KSWIN change detector

In [None]:
def _make_grid(overrides):
    """Return a parameter grid: the shared defaults with `overrides` applied.

    Every value is a list of candidate settings; the grid-search loop iterates
    over the Cartesian product of these lists. Overrides only replace values of
    existing keys, so every grid has the same twelve keys in the same order.
    A fresh dict with fresh default lists is built on each call, so grids do
    not share list objects.
    """
    grid = {
        # detection
        "sparsity": [0],
        "base_detector": ['adwin'],
        "%-Labels": [0],
        "retrainsize": [retrainsize],

        # ph
        "alpha": [0],
        "delta": [0],
        "min_instances": [100],
        "thresold": [0],  # key spelling is what the grid-search loop reads

        # adwin
        "ad_delta": [0],

        # kswin
        "ks_alpha": [0],

        # performance based
        "performance": [False],

        "sampling": ['-'],
    }
    grid.update(overrides)
    return grid


# --- sparsity 0 ---------------------------------------------------------------

grid_vals_0_sparsity_ph = _make_grid({
    "base_detector": ['ph'],
    "alpha": [0.99],
    "delta": [0.005],
    "thresold": [0.05],
})

grid_vals_0_sparsity_ad = _make_grid({
    "base_detector": ['adwin'],
    "ad_delta": [0.025],
})

grid_vals_0_sparsity_ph_perf = _make_grid({
    "base_detector": ['ph'],
    "alpha": [0.99],
    "delta": [0.005],
    "thresold": [0.05],
    "performance": [True],
})

grid_vals_0_sparsity_ad_perf = _make_grid({
    "base_detector": ['adwin'],
    "ad_delta": [0.002],
    "performance": [True],
})

# --- sparsity 100 -------------------------------------------------------------

grid_vals_100_sparsity_ph = _make_grid({
    "sparsity": [100],
    "base_detector": ['ph'],
    "%-Labels": [20,30,40,50,60,70,80],
    "alpha": [0.99],
    "delta": [0.005],
    "thresold": [3.9],
    "sampling": ['margin', 'entropy'],
})

grid_vals_100_sparsity_ad = _make_grid({
    "sparsity": [100],
    "base_detector": ['adwin'],
    "%-Labels": [20,40,60,80,90],
    "ad_delta": [1],
    "sampling": ['margin'],
})

grid_vals_100_sparsity_ad_perf = _make_grid({
    "sparsity": [100],
    "base_detector": ['adwin'],
    "%-Labels": [20,40,60,80,90],
    "ad_delta": [0.002],
    "performance": [True],
    "sampling": ['margin'],
})

# --- baseline: retrainsize 0 and ad_delta 9999 --------------------------------

grid_vals_no_retraining = _make_grid({
    "base_detector": ['adwin'],
    "retrainsize": [0],
    "ad_delta": [9999],
    "performance": [True],
})

# choose grids to process
grid_list = [grid_vals_0_sparsity_ad] 

# For every selected grid, run one drift-detection pass per parameter combination
# and pickle each result under a timestamped file name.
for grid in grid_list:
    parameters, values = zip(*grid.items())
    # Materialise the Cartesian product so tqdm knows the total number of runs
    # and can display progress and an ETA instead of a bare iteration counter.
    combinations = list(itertools.product(*values))
    for value_vector in tqdm(combinations):
        params = dict(zip(parameters, value_vector))
        # Timestamp with millisecond resolution, used as the result file name.
        data_name = datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")[:-3]
        detector = ShapDetector(
            base_detector_type = params['base_detector'],
            base_detector_config = {"alpha" : params['alpha'],
                                    "delta" : params['delta'],
                                    "min_instances" : params['min_instances'],
                                    # the grids spell this key 'thresold'
                                    "threshold" : params['thresold'],
                                    "ad_delta" : params['ad_delta'],
                                    "ks_alpha" : params['ks_alpha']
            }
        )

        detector_res = detector.detect_drift(
            data_sparse = data,
            data_full = data_full,
            sparsity = params['sparsity'],
            initial_batch_size = initial_batch_size_in,
            initial_batch_sample = initial_batch_sample,
            samplesize = 1, # 1 corresponds to instance-wise processing. Batch-wise processing is not fully implemented and might cause errors
            retrainsize = params['retrainsize'],
            distance_measure = 'euclidean',
            clf = None,
            al_percentage = params['%-Labels'],
            uncertainty_threshold = 0.5,
            true_drift_points = true_drifts,
            err_based = params['performance'],

            multiclass = True,  #|False
            amount_classes = 6, #|2
            approach = 1,       #|1  |approach1 = Log-Odds for real world data | approach2 = probability scale for synthetic data
            real_world = True,  #|False --> if this parameter is set to false, the random sample from the training set is withheld from the retraining set. In real-world scenarios it is set to True
            sampling = params['sampling'],
        )

        # store object; the context manager closes the file even if pickling fails
        with open("../Results/Detector_objs/{0}.pickle".format(data_name), "wb") as result_file:
            pickle.dump(detector_res, result_file)

0it [00:00, ?it/s]NaN values found. Functionality is not guaranteed for some methods.Proceed with caution.


start
it 0
it 500
it 1000
it 1500
it 2000
it 2500
it 3000
it 3500
it 4000
Drift, No. of iterations: 4095 Samples:  4095
it 4500
it 5000
it 5500
it 6000
it 6500
it 7000
it 7500
it 8000
it 8500
it 9000
it 9500
it 10000
it 10500
it 11000
Drift, No. of iterations: 11263 Samples:  11263
it 11500
it 12000
it 12500
it 13000
it 13500
it 14000
it 14500
Drift, No. of iterations: 14591 Samples:  14591
it 15000
it 15500
it 16000
it 16500
it 17000
it 17500
Drift, No. of iterations: 17599 Samples:  17599
it 18000
it 18500
it 19000
Drift, No. of iterations: 19103 Samples:  19103
it 19500
it 20000
it 20500
it 21000
it 21500
it 22000
Drift, No. of iterations: 22239 Samples:  22239
it 22500
Drift, No. of iterations: 22623 Samples:  22623
it 23000
it 23500
it 24000
it 24500
Drift, No. of iterations: 24671 Samples:  24671
it 25000
it 25500
it 26000
it 26500
Drift, No. of iterations: 26591 Samples:  26591
it 27000
it 27500
it 28000
it 28500
it 29000
it 29500
it 30000
it 30500
Drift, No. of iterations: 3094

1it [1:07:51, 4071.71s/it]NaN values found. Functionality is not guaranteed for some methods.Proceed with caution.


start
it 0
it 500
it 1000
it 1500
it 2000
it 2500
it 3000
it 3500
it 4000
Drift, No. of iterations: 4095 Samples:  4095
it 4500
it 5000
it 5500
it 6000
it 6500
it 7000
it 7500
it 8000
it 8500
it 9000
it 9500
it 10000
it 10500
it 11000
it 11500
it 12000
it 12500
it 13000
it 13500
it 14000
it 14500
Drift, No. of iterations: 14719 Samples:  14719
it 15000
it 15500
it 16000
it 16500
it 17000
it 17500
Drift, No. of iterations: 17631 Samples:  17631
it 18000
it 18500
it 19000
Drift, No. of iterations: 19423 Samples:  19423
it 19500
it 20000
it 20500
it 21000
it 21500
Drift, No. of iterations: 21727 Samples:  21727
it 22000
it 22500
Drift, No. of iterations: 22623 Samples:  22623
it 23000
it 23500
it 24000
it 24500
it 25000
it 25500
it 26000
it 26500
it 27000
it 27500
it 28000
it 28500
it 29000
it 29500
it 30000
it 30500
Drift, No. of iterations: 30559 Samples:  30559
it 31000
it 31500
it 32000
it 32500
it 33000
it 33500
it 34000
it 34500
it 35000
it 35500
it 36000
it 36500
it 37000
it 37500


2it [2:40:17, 4513.98s/it]NaN values found. Functionality is not guaranteed for some methods.Proceed with caution.


start
it 0
it 500
it 1000
