In [1]:
import pandas as pd
from skmultiflow.data import DataStream
import copy
import xgboost as xgb
import math
import pickle
from datetime import datetime


from importnb import Notebook
ShapDetectorClass = Notebook.load('KS_Backend.ipynb')
ShapDetector = ShapDetectorClass.ShapDetector

In [2]:
import numpy as np
from scipy import stats
from skmultiflow.drift_detection.base_drift_detector import BaseDriftDetector


class KSWIN(BaseDriftDetector):
    r""" Kolmogorov-Smirnov Windowing method for concept drift detection.

    This is a locally modified variant of the KSWIN detector: in addition to
    the p-value test it requires the KS statistic to exceed 0.1, and
    :meth:`detected_change` returns the p-value of the last detected drift
    alongside the drift flag.

    Parameters
    ----------
    alpha: float (default=0.005)
        Significance level for the Kolmogorov-Smirnov test, in [0, 1].
        The alpha parameter is very sensitive, therefore it should be set
        below 0.01.
    window_size: int (default=100)
        Size of the sliding window. Must be greater than 0.
    stat_size: int (default=30)
        Size of the statistic window (the most recent samples). Must be
        greater than 0 and must not exceed ``window_size``.
    data: numpy.ndarray (default=None, optional)
        Already collected data to avoid a cold start. The array is flattened
        to one dimension, so both ``(n_samples,)`` and ``(n_samples, 1)``
        inputs are accepted. Anything that is not a ``numpy.ndarray`` is
        ignored and the window starts empty.

    Notes
    -----
    KSWIN (Kolmogorov-Smirnov Windowing) [1]_ is a concept change detection method based
    on the Kolmogorov-Smirnov (KS) statistical test. The KS-test is a statistical test with
    no assumption about the underlying data distribution. KSWIN can monitor data or performance
    distributions. Note that the detector accepts one-dimensional input only.

    KSWIN maintains a sliding window :math:`\Psi` of fixed size :math:`n` (window_size). The
    last :math:`r` (stat_size) samples of :math:`\Psi` are assumed to represent the last
    concept, considered as :math:`R`. From the first :math:`n-r` samples of :math:`\Psi`,
    :math:`r` samples are uniformly drawn (with replacement), representing an approximated
    last concept :math:`W`.

    The KS-test is performed on the windows :math:`R` and :math:`W` of the same size. The
    KS-test compares the distance of the empirical cumulative data distributions
    :math:`dist(R,W)`.

    In the original formulation a concept drift is detected by KSWIN if:

    * :math:`dist(R,W) > \sqrt{-\frac{ln\alpha}{r}}`

    i.e. the difference in empirical data distributions between the windows :math:`R` and
    :math:`W` is too large for :math:`R` and :math:`W` to come from the same distribution.

    This implementation flags a drift when the p-value returned by
    ``scipy.stats.ks_2samp`` is at most ``alpha`` **and** the KS statistic is
    greater than 0.1. After a drift, only the last ``stat_size`` samples are kept.

    The older part of the window must be non-empty when the test runs. With a
    cold start this requires ``stat_size`` to be smaller than ``window_size - 1``;
    otherwise ``numpy.random.choice`` raises a ``ValueError`` in
    :meth:`add_element`.

    Sampling uses NumPy's global random state; seed it with ``np.random.seed``
    for reproducible detections.

    References
    ----------
    .. [1] Christoph Raab, Moritz Heusinger, Frank-Michael Schleif, Reactive
       Soft Prototype Computing for Concept Drift Streams, Neurocomputing, 2020.

    Examples
    --------
    >>> # Imports
    >>> import numpy as np
    >>> from skmultiflow.data.sea_generator import SEAGenerator
    >>> # Initialize KSWIN and a data stream
    >>> kswin = KSWIN(alpha=0.01)
    >>> stream = SEAGenerator(classification_function = 2,
    ...     random_state = 112, balance_classes = False,noise_percentage = 0.28)
    >>> # Store detections
    >>> detections = []
    >>> # Process stream via KSWIN and print detections
    >>> for i in range(1000):
    ...     data = stream.next_sample(10)
    ...     batch = data[0][0][0]
    ...     kswin.add_element(batch)
    ...     drift, p_value = kswin.detected_change()
    ...     if drift:
    ...         print("\rIteration {}".format(i))
    ...         print("\r KSWIN rejects the null hypothesis")
    ...         detections.append(i)
    >>> print("Number of detections: "+str(len(detections)))
    """

    def __init__(self, alpha=0.005, window_size=100, stat_size=30, data=None):
        super().__init__()
        self.window_size = window_size
        self.stat_size = stat_size
        self.alpha = alpha
        self.change_detected = False
        self.p_value = 0
        # p-value of the most recent *detected* drift (0 until a drift occurs).
        self.change_detected_pvalue = 0
        # Number of elements passed to add_element so far.
        self.n = 0
        if self.alpha < 0 or self.alpha > 1:
            raise ValueError("Alpha must be between 0 and 1")

        # A window of size 0 can never hold a sample; reject it up front
        # instead of failing later inside add_element.
        if self.window_size <= 0:
            raise ValueError("window_size must be greater than 0")

        # stat_size is used as a slice bound and as a sample count, so it
        # has to be strictly positive for the KS-test to receive any data.
        if self.stat_size <= 0:
            raise ValueError("stat_size must be greater than 0")

        if self.window_size < self.stat_size:
            raise ValueError("stat_size must be smaller than window_size")

        if isinstance(data, np.ndarray):
            # The window is handled as a 1-D array everywhere else
            # (np.delete without axis, concatenation with a scalar), so a
            # column vector such as (n_samples, 1) is flattened here. The copy
            # also keeps the caller's array independent of the detector state.
            self.window = data.flatten()
        else:
            self.window = np.array([])

    def add_element(self, input_value):
        """ Add element to sliding window

        Once the window holds at least ``window_size`` samples, the oldest
        sample is removed and the KS-test is performed on the current window
        content. The new sample is appended afterwards, so it takes part in
        the test on the next call.

        Parameters
        ----------
        input_value: scalar
            New data sample the sliding window should add.
        """
        self.n += 1
        if self.window.shape[0] >= self.window_size:
            # Drop the oldest sample to make room for the new one.
            self.window = np.delete(self.window, 0)

            # W: stat_size values drawn (with replacement) from the older
            # part of the window; R: the stat_size most recent values.
            reference = np.random.choice(self.window[:-self.stat_size], self.stat_size)
            recent = self.window[-self.stat_size:]

            (st, self.p_value) = stats.ks_2samp(reference, recent, mode="exact")

            # Besides significance, require a minimum effect size (statistic
            # above 0.1) before reporting a drift.
            if self.p_value <= self.alpha and st > 0.1:
                self.change_detected = True
                self.change_detected_pvalue = self.p_value
                # Forget the old concept: keep only the recent samples.
                self.window = recent
            else:
                self.change_detected = False
        else:  # Not enough samples in sliding window for a valid test
            self.change_detected = False

        self.window = np.concatenate([self.window, [input_value]])

    def detected_change(self):
        """ Get detected change

        Returns
        -------
        tuple of (bool, float)
            Whether or not a drift occurred on the last added element, and
            the p-value recorded at the most recent detected drift. The
            p-value is not cleared when no drift is detected, so it is only
            meaningful when the flag is True.
        """
        return self.change_detected, self.change_detected_pvalue

    def reset(self):
        """ reset

        Resets the change detector state: the p-values, the drift flag and
        the sliding window. The element counter ``n`` is left untouched.
        """
        self.p_value = 0
        self.change_detected_pvalue = 0
        self.window = np.array([])
        self.change_detected = False


In [13]:
def ks_data(stream, model,  X_train, y_train, retrain, retrain_after, refit_use_Xtrain, adwin_retrainings , adwin_parameter, ks_features, targets):
    """Run one KSWIN detector per feature over the remaining stream and select retraining points.

    Every remaining sample of ``stream`` is fed, feature by feature, into a
    dedicated KSWIN detector. A detection is accepted when it is the first
    one, or when it lies more than ``retrain_after`` samples after the
    previously accepted one. From the accepted detections, the
    ``adwin_retrainings`` ones with the smallest p-values are kept and
    returned in ascending order.

    Parameters
    ----------
    stream : stream object providing ``n_remaining_samples()`` and ``next_sample(n)``
        Source of the test samples; it is consumed completely.
    X_train : pandas.DataFrame
        Training features; used only to seed each detector's window.
    retrain_after : int
        Minimum gap (in samples) between two accepted detections.
    adwin_retrainings : int
        Maximum number of retraining points to return.
    ks_features : int
        Number of leading features to monitor.
    model, y_train, retrain, refit_use_Xtrain, adwin_parameter, targets
        Not used by this function; kept so the signature stays unchanged.

    Returns
    -------
    list of int
        Sorted sample positions (0-based, relative to the samples drawn from
        ``stream``) chosen as retraining points.
    """
    # One detector per monitored feature.
    # NOTE(review): the slice `[:, col::ks_features][0]` yields values from the
    # first training row only (strided over columns), so each detector is
    # seeded with very little data, not the feature's training history --
    # confirm this is intended.
    detectors = [
        KSWIN(alpha=0.000001, window_size=2000, stat_size=100,
              data=X_train.values[:, col::ks_features][0])
        for col in range(ks_features)
    ]

    X_test, y_test = stream.next_sample(stream.n_remaining_samples())
    print(len(X_test))

    accepted_positions = []
    accepted_p_values = []

    for position in range(len(y_test)):
        for col, detector in enumerate(detectors):
            detector.add_element(X_test[position][col])
            drift_flag, drift_p = detector.detected_change()
            if not drift_flag:
                continue

            # Accept the very first detection, and afterwards only detections
            # that are far enough away from the last accepted one.
            if not accepted_positions or position - accepted_positions[-1] > retrain_after:
                accepted_positions.append(position)
                accepted_p_values.append(drift_p)

    # Keep the detections with the smallest p-values, reported in stream order.
    keep = np.argsort(np.array(accepted_p_values))[:adwin_retrainings]
    detected_drifts = np.sort(np.array(accepted_positions)[keep]).tolist()

    print(detected_drifts)
    print('----')

    return detected_drifts

In [14]:
# Load the full data set and the initial batch sample.
data_complete = pd.read_csv("./Data_prep/insects_abrupt_train_test.csv")
initial_batch_sample = pd.read_csv("./Data_prep/insects_abrupt_10_sample.csv")

n_rows = len(data_complete)
retrainsize = math.trunc(0.01 * n_rows)  # 1% of the data

# Rename the columns to f0, f1, ... with the last column as 'label'.
columns = ['f' + str(idx) for idx in range(len(data_complete.columns) - 1)] + ['label']
data_complete.columns = columns

# The first 5% of the rows train the initial model; the rest is streamed.
train_end = math.trunc(0.05 * n_rows)
X_train = data_complete.iloc[:train_end, :-1]
y_train = data_complete.iloc[:train_end, -1]

model = xgb.XGBClassifier(objective='multi:softprob', num_class=6)
model.fit(X_train, y_train)

stream = DataStream(data=data_complete.iloc[train_end:, :], allow_nan=True)

# Arguments for ks_data; several of them are placeholders it does not use.
retrain = None
retrain_after = retrainsize
refit_use_Xtrain = True
adwin_retrainings = 7  # number of retrainings
adwin_parameter = None
ks_features = len(X_train.columns)
targets = model.classes_

detected_drifts = ks_data(
    stream, model, X_train, y_train, retrain, retrain_after, refit_use_Xtrain,
    adwin_retrainings, adwin_parameter, ks_features, targets,
)

50206
[10003, 17383, 25351, 30535, 36088, 44207, 50018]
----


In [15]:
# Reload the data with its original column names for the ShapDetector run.
data_complete = pd.read_csv("../Data_prep/insects_abrupt_train_test.csv")
# Independent copy of the same table (avoids parsing the CSV a second time).
data = data_complete.copy()
initial_batch_sample = pd.read_csv("../Data_prep/insects_abrupt_10_sample.csv")

data_full = data.copy()

# real world data
initial_batch_size_in = math.trunc(0.05*len(data_complete))  # 5% of the rows
retrainsize = math.trunc(0.01*len(data_complete))  # 1% of the rows

detector = ShapDetector(
                base_detector_type ='kswin',
                base_detector_config = {"alpha" : 0,
                                        "delta" : 0,
                                        "min_instances" : 100,
                                        "threshold" : 0,
                                        "ad_delta" : 0,
                                        "ks_alpha" : 0.000001
                }
            )

# NOTE(review): `true_drifts` is not defined in any visible cell of this
# notebook; it must come from earlier kernel state. Define it explicitly
# before this cell so the notebook survives Restart & Run All.
detector_res = detector.detect_drift(
                fix_drifts = detected_drifts, # Determined drifts and retraining points according to the p_value of KSWIN
                data_sparse = data,
                data_full = data_full,
                sparsity = 0,
                initial_batch_size = initial_batch_size_in,
                initial_batch_sample = initial_batch_sample,
                samplesize = 1,
                retrainsize = retrainsize,
                distance_measure = 'euclidean',
                clf = None,
                al_percentage = 0, #KSWIN is only tested with full label availability
                uncertainty_threshold = 0.5,
                true_drift_points = true_drifts,
                err_based = True,

                multiclass = True,
                amount_classes = 6,
                approach = 1,
                real_world = True,
                sampling = '-'
            )

# Store the result under a timestamped name (millisecond resolution).
data_name=datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f")[:-3]
# Use a context manager so the file is flushed and closed even if pickling fails.
with open("../Results/Detector_objs/kswin_objects/{0}.pickle".format(data_name), "wb") as result_file:
    pickle.dump(detector_res, result_file)

start
it 0


ks_2samp: Exact calculation unsuccessful. Switching to mode=asymp.


it 5000
it 10000
Drift, No. of iterations: 10003 Samples:  10003
it 15000
Drift, No. of iterations: 17383 Samples:  17383
it 20000
it 25000
Drift, No. of iterations: 25351 Samples:  25351
it 30000
Drift, No. of iterations: 30535 Samples:  30535
it 35000
Drift, No. of iterations: 36088 Samples:  36088
it 40000
Drift, No. of iterations: 44207 Samples:  44207
it 45000
it 50000
Drift, No. of iterations: 50018 Samples:  50018
