# UCDD evaluation on the weather dataset

## Load and preprocess the weather dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.preprocessing import MinMaxScaler


# Load the raw weather data. 'Label_Rain' is the target column; 'Unnamed: 0'
# is dropped together with it so that only the remaining columns end up in X.
df = pd.read_csv("../Datasets_concept_drift/real_world_data/weather_dataset.csv")
X = df.drop(columns=['Unnamed: 0', 'Label_Rain'])
y = df["Label_Rain"]

# Rows before this position form the reference set; the remainder is the test set.
n_ref = 6053

X_ref = X.iloc[:n_ref].to_numpy()
X_test = X.iloc[n_ref:].to_numpy()
# Labels are stored as column vectors of shape (n_samples, 1).
y_ref = y.iloc[:n_ref].to_numpy().reshape(-1, 1)
y_test = y.iloc[n_ref:].to_numpy().reshape(-1, 1)

# The scaler is fitted on the reference data only and then applied to both splits.
scaler = MinMaxScaler().fit(X_ref)
X_ref = scaler.transform(X_ref)
X_test = scaler.transform(X_test)

# Print a short summary: shapes for everything, values only for the feature arrays.
for heading, array, show_values in (
    ('\nreference data', X_ref, True),
    ('\nreference labels', y_ref, False),
    ('\ntesting data', X_test, True),
    ('\ntesting labels', y_test, False),
):
    print(heading)
    print(array.shape)
    if show_values:
        print(array)


reference data
(6053, 8)
[[0.32335329 0.45853659 0.50909091 ... 0.21443737 0.00335913 0.36567926]
 [0.34231537 0.44682927 0.53787879 ... 0.34394904 0.00326915 0.35648621]
 [0.31636727 0.41170732 0.58181818 ... 0.25690021 0.00315918 0.35648621]
 ...
 [0.83532934 0.9395122  0.56363636 ... 0.16985138 0.00897767 0.89785495]
 [0.81936128 0.88       0.58787879 ... 0.12951168 0.0084678  0.84576098]
 [0.81437126 0.88780488 0.56363636 ... 0.10828025 0.00836782 0.84576098]]

reference labels
(6053, 1)

testing data
(12106, 8)
[[0.80538922 0.84       0.57272727 ... 0.16985138 0.00855777 0.7854954 ]
 [0.82035928 0.85853659 0.56060606 ... 0.14861996 0.00936756 0.7854954 ]
 [0.88922156 0.87804878 0.49242424 ... 0.1910828  0.00965749 0.86823289]
 ...
 [0.21556886 0.28195122 0.44545455 ... 0.33970276 0.00278927 0.21552605]
 [0.1497006  0.17756098 0.51363636 ... 0.48832272 0.00258933 0.13483146]
 [0.05588822 0.0995122  0.68484848 ... 0.21019108 0.00137964 0.05209397]]

testing labels
(12106, 1)


## Split the dataset into batches

### Yearly batches (size=365)

In [3]:
# Number of complete 365-sample batches that fit into 12106 samples
# (floor division already discards the remainder).
12106 // 365

33

In [4]:
from eval_helpers import helpers

# Split both data splits with the project helper using batch_size=365.
# The test split is processed first, then the reference split.
year_batch_size = 365
X_test_batches_year, y_test_batches_year = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=year_batch_size
)
X_ref_batches_year, y_ref_batches_year = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=year_batch_size
)

# Report how many data batches and label batches each split produced.
for caption, batch_lists in (
    ('# yearly test batches:', (X_test_batches_year, y_test_batches_year)),
    ('# yearly ref batches:', (X_ref_batches_year, y_ref_batches_year)),
):
    print(caption)
    for batches in batch_lists:
        print(len(batches))

[[0.80538922 0.84       0.57272727 ... 0.16985138 0.00855777 0.7854954 ]
 [0.82035928 0.85853659 0.56060606 ... 0.14861996 0.00936756 0.7854954 ]
 [0.88922156 0.87804878 0.49242424 ... 0.1910828  0.00965749 0.86823289]
 ...
 [0.78043912 0.83804878 0.61969697 ... 0.14861996 0.00814788 0.80183861]
 [0.77844311 0.87219512 0.61363636 ... 0.19532909 0.00850779 0.80183861]
 [0.81736527 0.88780488 0.58333333 ... 0.492569   0.00868774 0.82022472]]
(365, 8)
[[0.32335329 0.45853659 0.50909091 ... 0.21443737 0.00335913 0.36567926]
 [0.34231537 0.44682927 0.53787879 ... 0.34394904 0.00326915 0.35648621]
 [0.31636727 0.41170732 0.58181818 ... 0.25690021 0.00315918 0.35648621]
 ...
 [0.38123752 0.45463415 0.41212121 ... 0.34394904 0.00396897 0.37691522]
 [0.25149701 0.31707317 0.66666667 ... 0.29723992 0.00436886 0.21348315]
 [0.41217565 0.4702439  0.56818182 ... 0.17409766 0.00585848 0.35648621]]
(365, 8)
# yearly test batches:
33
33
# yearly ref batches:
16
16


### Monthly batches (size=30)

In [5]:
# Split both data splits with the project helper using batch_size=30.
# The test split is processed first, then the reference split.
month_batch_size = 30
X_test_batches_month, y_test_batches_month = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=month_batch_size
)
X_ref_batches_month, y_ref_batches_month = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=month_batch_size
)

# Report how many data batches and label batches each split produced.
for caption, batch_lists in (
    ('# monthly test batches:', (X_test_batches_month, y_test_batches_month)),
    ('# monthly ref batches:', (X_ref_batches_month, y_ref_batches_month)),
):
    print(caption)
    for batches in batch_lists:
        print(len(batches))

[[0.80538922 0.84       0.57272727 0.63404255 0.14285714 0.16985138
  0.00855777 0.7854954 ]
 [0.82035928 0.85853659 0.56060606 0.65106383 0.11278195 0.14861996
  0.00936756 0.7854954 ]
 [0.88922156 0.87804878 0.49242424 0.62978723 0.16165414 0.1910828
  0.00965749 0.86823289]
 [0.8992016  0.92682927 0.4530303  0.6212766  0.21052632 0.23142251
  0.00987743 0.87742594]
 [0.94111776 0.94829268 0.42272727 0.62978723 0.2556391  0.21019108
  0.01026733 0.92849847]
 [0.93013972 0.95219512 0.44090909 0.62978723 0.22180451 0.16985138
  0.00987743 0.94892748]
 [0.85129741 0.96878049 0.53181818 0.65957447 0.15413534 0.25690021
  0.00926759 0.8886619 ]
 [0.84431138 0.96585366 0.57878788 0.50638298 0.16541353 0.10828025
  0.0084678  0.8886619 ]
 [0.84231537 0.95317073 0.57575758 0.22978723 0.12030075 0.08280255
  0.00866775 0.89785495]
 [0.84231537 0.89170732 0.56515152 0.4893617  0.13909774 0.23142251
  0.00866775 0.85699694]
 [0.73752495 0.82926829 0.58636364 0.68085106 0.07894737 0.14861996
  0

## Imports

In [9]:
# Project-local modules for the evaluation, plus importlib for reloading them.
# NOTE(review): `spms` is not referenced anywhere in the visible part of this
# notebook — confirm it is still needed.
import importlib
from eval_helpers import ucdd_eval_real_world
from eval_helpers import kmeans_verbose_helpers
from core import ucdd_supported_parameters as spms
# Reload so that edits made to the module source after its first import take
# effect without restarting the kernel.
importlib.reload(ucdd_eval_real_world)
from core import ucdd
# As the last expression of the cell, the value returned by this reload (the
# module object) is what the notebook shows as the cell result.
importlib.reload(ucdd)

something


<module 'core.ucdd' from 'C:\\Users\\jpohl\\PycharmProjects\\clustering-drift-detection\\ucdd_improved\\core\\ucdd.py'>

## Yearly evaluation

### Find suitable KMeans clustering parameters

In [10]:
# Run the KMeans diagnostics helper on the yearly reference batches. The first
# argument is the output path prefix; the stored output shows one
# '<prefix><i>.txt' file name per batch.
kmeans_settings_year = dict(n_init=100, max_iter=500, tol=0, random_state=1053)
kmeans_verbose_helpers.write_kmeans_results_ucdd_helper(
    'weather_kmeans_year/weather_output_year',
    X_ref_batches_year,
    **kmeans_settings_year
)

filename weather_kmeans_year/weather_output_year0.txt
random state: 1053
filename weather_kmeans_year/weather_output_year1.txt
random state: 1053
filename weather_kmeans_year/weather_output_year2.txt
random state: 1053
filename weather_kmeans_year/weather_output_year3.txt
random state: 1053
filename weather_kmeans_year/weather_output_year4.txt
random state: 1053
filename weather_kmeans_year/weather_output_year5.txt
random state: 1053
filename weather_kmeans_year/weather_output_year6.txt
random state: 1053
filename weather_kmeans_year/weather_output_year7.txt
random state: 1053
filename weather_kmeans_year/weather_output_year8.txt
random state: 1053
filename weather_kmeans_year/weather_output_year9.txt
random state: 1053
filename weather_kmeans_year/weather_output_year10.txt
random state: 1053
filename weather_kmeans_year/weather_output_year11.txt
random state: 1053
filename weather_kmeans_year/weather_output_year12.txt
random state: 1053
filename weather_kmeans_year/weather_output_year

### Use UCDD directly to obtain inspectable results

In [39]:
from core import ucdd
import importlib
importlib.reload(ucdd)

# One entry per random state is collected in each of these lists.
all_2d_drifts_year, all_2d_cluster_classif_accs_year = [], []

# Settings shared by every run; only the random state changes between runs.
ucdd_settings_year = dict(
    additional_check=True,
    n_init=100,
    max_iter=13000,
    tol=0,
    reference_label_batches=y_ref_batches_year,
    testing_label_batches=y_test_batches_year,
)

for random_state in (0, 100):
    drifts_2d_arr_year, cluster_classif_accs_2d_arr_year = ucdd.all_drifting_batches_parallel_all_info(
        X_ref_batches_year,
        X_test_batches_year,
        random_state=random_state,
        **ucdd_settings_year
    )
    all_2d_drifts_year.append(drifts_2d_arr_year)
    all_2d_cluster_classif_accs_year.append(cluster_classif_accs_2d_arr_year)

# Store both result lists in a single dict on disk so that a later cell can
# reload them without repeating the runs.
result_dict_year = {
    'all_2d_drifts_year': all_2d_drifts_year,
    'all_2d_cluster_classif_accs_year': all_2d_cluster_classif_accs_year
}
np.save('weather_stats_year.npy', result_dict_year)

random_state
0
pool opened
random_state
100
pool opened


In [40]:
# Reload the stored yearly results. The file holds a pickled dict, which is why
# allow_pickle=True and .item() are needed to get the dict back.
result_dict_year = np.load('weather_stats_year.npy', allow_pickle=True).item()
all_2d_drifts_year = result_dict_year['all_2d_drifts_year']

print('number of drift detection differences in the two runs:')
# Count the positions where the first and the second run disagree.
drifts_first_run = all_2d_drifts_year[0]
drifts_second_run = all_2d_drifts_year[1]
num_diffs = np.sum(drifts_first_run != drifts_second_run)
print(num_diffs)

number of drift detection differences in the two runs:
2


### Read the defined yearly drift locations

In [10]:
# NOTE(review): this cell is entirely commented out, so it does not define
# `true_drift_bool_year` on a fresh run. The list in the stored output presumably
# comes from an earlier execution with the code enabled — confirm before relying on it.
# The disabled code reads the first row of the yearly drift CSV and converts each
# field to a bool by comparing it with the string 'True'.
# import csv
#
# true_drift_bool_year = []
# with open('../Datasets_concept_drift/real_world_data_drifts/weather/weather_yearly_drifts.csv') as f:
#     rdr = csv.reader(f, delimiter=',')
#     for row in rdr:
#         true_drift_bool_year.append(row)
#     true_drift_bool_year = true_drift_bool_year[0] # only one row of important data
#     true_drift_bool_year = [b == 'True' for b in true_drift_bool_year]
# print(true_drift_bool_year)

[False, True, True, True, False, True, False, False, False, False, True, True, False, False, True, True, True, True, False, False, False, True, False, True, True, False, True, True, True, False, False, True, True]


### Use the drift locations and best clustering parameters for the evaluation

In [14]:
# NOTE(review): disabled evaluation cell. The doubly commented block (`# #`) is the
# serial variant (parallel=False); the singly commented block after it is the
# parallel variant (parallel=True). Both need `true_drift_bool_year`, and the stored
# output of this cell is a NameError for exactly that name.
# import time
# import importlib
# importlib.reload(ucdd_eval_real_world)
# importlib.reload(ucdd)
#
#
# # start_regular = time.time()
# # _, fpr_mean, _, det_acc_mean, _ = ucdd_eval_real_world.all_drifting_batches_randomness_robust(
# #     reference_data_batches=X_ref_batches_year,
# #     testing_data_batches=X_test_batches_year,
# #     true_drift_bool=true_drift_bool_year,
# #     min_ref_batches_drift=0.3,
# #     additional_check=True,
# #     n_init=100, max_iter=13000, tol=0,
# #     first_random_state=0,
# #     min_runs=2, std_err_threshold=0.05,
# #     parallel=False
# # )
# #
# # print('yearly mean FPR:', fpr_mean)
# # print('yearly mean detection accuracy:', det_acc_mean)
# # print('time(sec):', time.time() - start_regular)
#
# start_parallel = time.time()
# _, fpr_mean, _, det_acc_mean, _ = ucdd_eval_real_world.all_drifting_batches_randomness_robust(
#     reference_data_batches=X_ref_batches_year,
#     testing_data_batches=X_test_batches_year,
#     true_drift_bool=true_drift_bool_year,
#     min_ref_batches_drift=0.3,
#     additional_check=True,
#     n_init=100, max_iter=13000, tol=0,
#     first_random_state=0,
#     min_runs=2, std_err_threshold=0.05,
#     parallel=True
# )
#
# print('yearly mean FPR:', fpr_mean)
# print('yearly mean detection accuracy:', det_acc_mean)
# print('time(sec):', time.time() - start_parallel)

NameError: name 'true_drift_bool_year' is not defined

## Monthly evaluation

### (Choose a subset of reference batches)

### Find suitable KMeans clustering parameters

In [11]:
# Run the KMeans diagnostics helper on the monthly reference batches. The first
# argument is the output path prefix; the stored output shows one
# '<prefix><i>.txt' file name per batch.
kmeans_settings_month = dict(n_init=100, max_iter=500, tol=0, random_state=1053)
kmeans_verbose_helpers.write_kmeans_results_ucdd_helper(
    'weather_kmeans_month/weather_output_month',
    X_ref_batches_month,
    **kmeans_settings_month
)

filename weather_kmeans_month/weather_output_month0.txt
random state: 1053
filename weather_kmeans_month/weather_output_month1.txt
random state: 1053
filename weather_kmeans_month/weather_output_month2.txt
random state: 1053
filename weather_kmeans_month/weather_output_month3.txt
random state: 1053
filename weather_kmeans_month/weather_output_month4.txt
random state: 1053
filename weather_kmeans_month/weather_output_month5.txt
random state: 1053
filename weather_kmeans_month/weather_output_month6.txt
random state: 1053
filename weather_kmeans_month/weather_output_month7.txt
random state: 1053
filename weather_kmeans_month/weather_output_month8.txt
random state: 1053
filename weather_kmeans_month/weather_output_month9.txt
random state: 1053
filename weather_kmeans_month/weather_output_month10.txt
random state: 1053
filename weather_kmeans_month/weather_output_month11.txt
random state: 1053
filename weather_kmeans_month/weather_output_month12.txt
random state: 1053
filename weather_kmean

### Use UCDD directly to obtain inspectable results

In [6]:
from core import ucdd
import importlib
importlib.reload(ucdd)

all_2d_drifts_month, all_2d_cluster_classif_accs_month = [], []

random_state = 0

# Reduced configuration: only the last three reference batches, n_init=1 and
# max_iter=10. Full-size alternatives kept here for reference:
#   reference batches: X_ref_batches_month (all of them)
#   n_init=100
#   max_iter=18000
ref_batches_month_subset = X_ref_batches_month[-3:]
ref_labels_month_subset = y_ref_batches_month[-3:]

drifts_2d_arr_month, cluster_classif_accs_2d_arr_month = ucdd.all_drifting_batches_parallel_all_info(
    ref_batches_month_subset,
    X_test_batches_month,
    additional_check=True,
    n_init=1,
    max_iter=10,
    tol=0,
    random_state=random_state,
    reference_label_batches=ref_labels_month_subset,
    testing_label_batches=y_test_batches_month
)
all_2d_drifts_month.append(drifts_2d_arr_month)
all_2d_cluster_classif_accs_month.append(cluster_classif_accs_2d_arr_month)

# Store both result lists in a single dict on disk so that a later cell can
# reload them without repeating the run.
result_dict_month = {
    'all_2d_drifts_month': all_2d_drifts_month,
    'all_2d_cluster_classif_accs_month': all_2d_cluster_classif_accs_month
}
np.save('weather_stats_month.npy', result_dict_month)

random_state
0
pool opened


In [7]:
# Reload the stored monthly results. The file holds a pickled dict, which is why
# allow_pickle=True and .item() are needed to get the dict back.
monthly_archive = np.load('weather_stats_month.npy', allow_pickle=True)
result_dict_month = monthly_archive.item()
print('resulting dictionary', result_dict_month, sep='\n')

resulting dictionary
{'all_2d_drifts_month': [array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False,  True,  True],
       [False, False, False, ..., False,  True,  True]])], 'all_2d_cluster_classif_accs_month': [array([[0.51666667, 0.7       , 0.7       , ..., 0.51666667, 0.7       ,
        0.53333333],
       [0.53333333, 0.75      , 0.75      , ..., 0.76666667, 0.71666667,
        0.65      ],
       [0.6       , 0.71666667, 0.61666667, ..., 0.71666667, 0.55      ,
        0.53333333]])]}


### Read the defined monthly drift locations

In [11]:
# NOTE(review): this cell is entirely commented out, so it does not define
# `true_drift_bool_month` on a fresh run. The list in the stored output presumably
# comes from an earlier execution with the code enabled — confirm before relying on it.
# The disabled code uses `csv.reader` but does not import `csv` itself; the only
# `import csv` in the visible notebook is in the (also commented-out) yearly cell.
# true_drift_bool_month = []
# with open('../Datasets_concept_drift/real_world_data_drifts/weather/weather_monthly_drifts.csv') as f:
#     rdr = csv.reader(f, delimiter=',')
#     for row in rdr:
#         true_drift_bool_month.append(row)
#     true_drift_bool_month = true_drift_bool_month[0] # only one row of important data
#     true_drift_bool_month = [b == 'True' for b in true_drift_bool_month]
# print(true_drift_bool_month)

[False, False, False, False, False, False, False, False, False, False, True, True, False, True, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, True, False, False, False, True, False, False, False, False, False, False, False, True, False, False, False, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, True, False, False, False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, Fal

### Use the drift locations and best clustering parameters for the evaluation

In [12]:
# NOTE(review): disabled evaluation cell; nothing here runs on a fresh kernel.
# The stored output presumably comes from an earlier execution with the code
# enabled — confirm. The disabled call needs `true_drift_bool_month`, which is
# only assigned in another commented-out cell of this notebook.
# import time
#
#
# start_time = time.time()
# _, fpr_mean_month, _, det_acc_mean_month, _ = ucdd_eval_real_world.all_drifting_batches_randomness_robust(
#     reference_data_batches=X_ref_batches_month,
#     testing_data_batches=X_test_batches_month,
#     true_drift_bool=true_drift_bool_month,
#     min_ref_batches_drift=0.3,
#     additional_check=True,
#     n_init=100, max_iter=18000, tol=0,
#     first_random_state=0,
#     min_runs=2, std_err_threshold=0.05
# )
# end_time = time.time()
# print('time taken:', end_time - start_time)
# print('monthly mean FPR:', fpr_mean_month)
# print('monthly mean detection accuracy:', det_acc_mean_month)

min_runs 2
random_state
0
[True, False, False, False, False, False, True, False, False, False, True, True, True, True, False, False, False, True, False, False, False, True, True, True, True, True, False, False, False, False, False, False, False, False, False, True, True, False, False, True, False, False, False, False, False, False, True, True, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, True, False, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, False, False, False, False, False, True, True, False, False, False, False, False, False, False, False, False, False, True, True, True, False, False, False, True, True, False, False, False, True, True, True, False, False, False, False, False, False, False, False, False, True, True, True, True, False, False, False, True, False, False, False, False, True, True, True, False,