# UCDD on ELECT2 evaluation

## Load and preprocess the ELECT2 dataset

In [2]:
import csv

import numpy as np
import pandas as pd
import seaborn as sb

In [13]:
from eval_helpers import accepting  # NOTE(review): not referenced in this cell - confirm it is still needed

# Number of leading rows that form the reference set; every later row belongs to the test set.
N_REFERENCE_ROWS = 15104

df = pd.read_csv("../Datasets_concept_drift/real_world_data/electricity_dataset.csv")

# Everything except the 'Unnamed: 0', 'label' and 'real_date' columns is used as a feature.
X = df.drop(columns=['Unnamed: 0', 'label', 'real_date'])
y = df["label"]

# 'day' holds text with the value wrapped in single quotes; keep what sits between
# the first pair of quotes and convert it to float (vectorised, no per-row lambda).
X['day'] = X['day'].str.split("'").str[1].astype(float)

print(X.head())
print('DTYPES')
print(X.dtypes)

print('# nan values in X', np.count_nonzero(np.isnan(X)))

# Positional split into reference / test parts, converted to plain NumPy arrays.
# Labels are reshaped into a single column so they have one row per sample.
X_ref = X.iloc[:N_REFERENCE_ROWS].to_numpy()
X_test = X.iloc[N_REFERENCE_ROWS:].to_numpy()
y_ref = y.iloc[:N_REFERENCE_ROWS].to_numpy().reshape(-1, 1)
y_test = y.iloc[N_REFERENCE_ROWS:].to_numpy().reshape(-1, 1)

print('\nreference data')
print(X_ref.shape)
print(X_ref)
print('\nreference labels')
print(y_ref.shape)
print('\ntesting data')
print(X_test.shape)
print(X_test)
print('\ntesting labels')
print(y_test.shape)

   date  day    period  nswprice  nswdemand  vicprice  vicdemand  transfer
0   0.0  2.0  0.000000  0.056443   0.439155  0.003467   0.422915  0.414912
1   0.0  2.0  0.021277  0.051699   0.415055  0.003467   0.422915  0.414912
2   0.0  2.0  0.042553  0.051489   0.385004  0.003467   0.422915  0.414912
3   0.0  2.0  0.063830  0.045485   0.314639  0.003467   0.422915  0.414912
4   0.0  2.0  0.085106  0.042482   0.251116  0.003467   0.422915  0.414912
DTYPES
date         float64
day          float64
period       float64
nswprice     float64
nswdemand    float64
vicprice     float64
vicdemand    float64
transfer     float64
dtype: object
# nan values in X 0

reference data
(15104, 8)
[[0.       2.       0.       ... 0.003467 0.422915 0.414912]
 [0.       2.       0.021277 ... 0.003467 0.422915 0.414912]
 [0.       2.       0.042553 ... 0.003467 0.422915 0.414912]
 ...
 [0.434052 1.       0.617021 ... 0.003467 0.422915 0.414912]
 [0.434052 1.       0.638298 ... 0.003467 0.422915 0.414912]
 [0.

## Split the dataset into batches (size=365)

In [14]:
from eval_helpers import helpers

# Cut the test stream first, then the reference stream, into fixed-size batches of 365 rows.
X_test_batches_year, y_test_batches_year = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=365
)
X_ref_batches_year, y_ref_batches_year = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=365
)

# Report how many feature batches and label batches each split produced.
for heading, feature_batches, label_batches in (
    ('# yearly test batches:', X_test_batches_year, y_test_batches_year),
    ('# yearly ref batches:', X_ref_batches_year, y_ref_batches_year),
):
    print(heading)
    print(len(feature_batches))
    print(len(label_batches))

[[0.434052 1.       0.680851 ... 0.003467 0.422915 0.414912]
 [0.434052 1.       0.702128 ... 0.003467 0.422915 0.414912]
 [0.434052 1.       0.723404 ... 0.003467 0.422915 0.414912]
 ...
 [0.434406 2.       0.212766 ... 0.003467 0.422915 0.414912]
 [0.434406 2.       0.234043 ... 0.003467 0.422915 0.414912]
 [0.434406 2.       0.255319 ... 0.003467 0.422915 0.414912]]
(365, 8)
[[0.00000e+00 2.00000e+00 0.00000e+00 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [0.00000e+00 2.00000e+00 2.12770e-02 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [0.00000e+00 2.00000e+00 4.25530e-02 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 ...
 [3.10000e-04 2.00000e+00 5.53191e-01 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [3.10000e-04 2.00000e+00 5.74468e-01 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [3.10000e-04 2.00000e+00 5.95745e-01 ... 3.46700e-03 4.22915e-01
  4.14912e-01]]
(365, 8)
# yearly test batches:
82
82
# yearly ref batches:
41
41


## Imports

In [15]:
# Import the project modules and reload them immediately, so that edits made to
# their source files are picked up without restarting the kernel.
import importlib
from eval_helpers import ucdd_eval_real_world
from eval_helpers import kmeans_verbose_helpers
from core import ucdd_supported_parameters as spms
from core import ucdd
importlib.reload(ucdd_eval_real_world)
importlib.reload(ucdd)
# Last expression of the cell: the module object returned by reload() is what
# the notebook displays as this cell's output.
importlib.reload(kmeans_verbose_helpers)

something


<module 'eval_helpers.kmeans_verbose_helpers' from 'C:\\Users\\jpohl\\PycharmProjects\\clustering-drift-detection\\ucdd_improved\\eval_helpers\\kmeans_verbose_helpers.py'>

## Find suitable KMeans clustering parameters

In [16]:
# KMeans settings used for the parameter search on the reference batches.
kmeans_search_settings = dict(n_init=100, max_iter=500, tol=0, random_state=1053)

# The first argument is the output filename prefix passed to the helper.
kmeans_verbose_helpers.write_kmeans_results_ucdd_helper(
    'elect2_kmeans_year/elect2_kmeans_year_output',
    X_ref_batches_year,
    **kmeans_search_settings,
)

filename elect2_kmeans_year/elect2_kmeans_year_output0.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output1.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output2.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output3.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output4.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output5.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output6.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output7.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output8.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output9.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output10.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output11.txt
random state: 1053
filename elect2_kmeans_year/elect2_kmeans_year_output12.txt
ra

### Use UCDD directly to obtain inspectable results

In [17]:
from core import ucdd
import importlib
importlib.reload(ucdd)

# Arguments that stay the same for every run; only the random seed changes.
clustering_settings = dict(additional_check=True, n_init=100, max_iter=5000, tol=0)
label_batches = dict(
    reference_label_batches=y_ref_batches_year,
    testing_label_batches=y_test_batches_year,
)

# One entry per seed is collected in each list.
all_2d_drifts_year = []
all_2d_cluster_classif_accs_year = []

for random_state in [0, 100]:
    drifts_2d_arr_year, cluster_classif_accs_2d_arr_year = ucdd.all_drifting_batches_parallel_all_info(
        X_ref_batches_year,
        X_test_batches_year,
        **clustering_settings,
        random_state=random_state,
        **label_batches,
    )
    all_2d_drifts_year.append(drifts_2d_arr_year)
    all_2d_cluster_classif_accs_year.append(cluster_classif_accs_2d_arr_year)

# Bundle both result lists and persist them. np.save stores the dict as a pickled
# object, so it has to be read back with allow_pickle=True.
result_dict_year = dict(
    all_2d_drifts_year=all_2d_drifts_year,
    all_2d_cluster_classif_accs_year=all_2d_cluster_classif_accs_year,
)
np.save('elect2_stats_year.npy', result_dict_year)

random_state
0
pool opened
random_state
100
pool opened


In [19]:
# The file holds a pickled dict wrapped in a 0-d object array, hence allow_pickle=True
# followed by .item(). Only load files produced by this notebook this way.
saved_stats = np.load('elect2_stats_year.npy', allow_pickle=True)
result_dict_year = saved_stats.item()

print('resulting dictionary')
print(result_dict_year)

all_2d_drifts_year = result_dict_year['all_2d_drifts_year']
total_detections = len(X_ref_batches_year) * len(X_test_batches_year)
print('total number of drift detections:', total_detections)

# Count the positions where the two stored runs disagree.
print('number of drift detection differences in the two runs:')
first_run_drifts = all_2d_drifts_year[0]
second_run_drifts = all_2d_drifts_year[1]
num_diffs = np.sum(first_run_drifts != second_run_drifts)
print(num_diffs)

resulting dictionary
{'all_2d_drifts_year': [array([[ True, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True]]), array([[ True, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [ True,  True, False, ...,  True,  True,  True],
       ...,
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True],
       [False, False, False, ...,  True,  True,  True]])], 'all_2d_cluster_classif_accs_year': [array([[0.53424658, 0.50136986, 0.54109589, ..., 0.5630137 , 0.54109589,
        0.56986301],
       [0.57260274, 0.53972603, 0.54109589, ..., 0.5630137 , 0.57945205,
        0.60821918],
       [0.52739726, 0.5356164