# UCDD on weather evaluation

## Accept and preprocess the weather dataset

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.preprocessing import MinMaxScaler


df = pd.read_csv("../Datasets_concept_drift/real_world_data/weather_dataset.csv")

X = df.drop(columns=['Unnamed: 0', 'Label_Rain'])
y = df["Label_Rain"]

X_ref = X[:6053]
X_test = X[6053:]
y_ref = y[:6053]
y_test = y[6053:]

X_ref = X_ref.to_numpy()
X_test = X_test.to_numpy()
y_ref = y_ref.to_numpy()
y_test = y_test.to_numpy()

scaler = MinMaxScaler()
scaler.fit(X_ref)
X_ref = scaler.transform(X_ref)
X_test = scaler.transform(X_test)

print('\nreference data')
print(X_ref.shape)
print(X_ref)
print('\nreference labels')
print(y_ref.shape)
print('\ntesting data')
print(X_test.shape)
print(X_test)
print('\ntesting labels')
print(y_test.shape)


reference data
(6053, 8)
[[0.32335329 0.45853659 0.50909091 ... 0.21443737 0.00335913 0.36567926]
 [0.34231537 0.44682927 0.53787879 ... 0.34394904 0.00326915 0.35648621]
 [0.31636727 0.41170732 0.58181818 ... 0.25690021 0.00315918 0.35648621]
 ...
 [0.83532934 0.9395122  0.56363636 ... 0.16985138 0.00897767 0.89785495]
 [0.81936128 0.88       0.58787879 ... 0.12951168 0.0084678  0.84576098]
 [0.81437126 0.88780488 0.56363636 ... 0.10828025 0.00836782 0.84576098]]

reference labels
(6053,)

testing data
(12106, 8)
[[0.80538922 0.84       0.57272727 ... 0.16985138 0.00855777 0.7854954 ]
 [0.82035928 0.85853659 0.56060606 ... 0.14861996 0.00936756 0.7854954 ]
 [0.88922156 0.87804878 0.49242424 ... 0.1910828  0.00965749 0.86823289]
 ...
 [0.21556886 0.28195122 0.44545455 ... 0.33970276 0.00278927 0.21552605]
 [0.1497006  0.17756098 0.51363636 ... 0.48832272 0.00258933 0.13483146]
 [0.05588822 0.0995122  0.68484848 ... 0.21019108 0.00137964 0.05209397]]

testing labels
(12106,)


## Split the dataset to batches

### Yearly batches (size=365)

In [29]:
(12106 - (12106 % 365)) // 365

33

In [30]:
from eval_helpers import helpers
importlib.reload(helpers)

X_test_batches_year, y_test_batches_year = helpers.split_to_fixed_size_batches(X_test, y_test, batch_size=365)
X_ref_batches_year, y_ref_batches_year = helpers.split_to_fixed_size_batches(X_ref, y_ref, batch_size=365)

print('# yearly test batches:')
print(len(X_test_batches_year))
print(len(y_test_batches_year))
print('# yearly ref batches:')
print(len(X_ref_batches_year))
print(len(y_ref_batches_year))

chunk size 365
number of chunks 33
number of data 12106
number of resulting batches 33
[[0.80538922 0.84       0.57272727 ... 0.16985138 0.00855777 0.7854954 ]
 [0.82035928 0.85853659 0.56060606 ... 0.14861996 0.00936756 0.7854954 ]
 [0.88922156 0.87804878 0.49242424 ... 0.1910828  0.00965749 0.86823289]
 ...
 [0.81736527 0.88780488 0.58333333 ... 0.492569   0.00868774 0.82022472]
 [0.83433134 0.89658537 0.57121212 ... 0.23566879 0.00886769 0.85699694]
 [0.78642715 0.83121951 0.58787879 ... 0.16985138 0.00850779 0.82022472]]
(367, 8)
(367, 8)
(367, 8)
chunk size 365
number of chunks 33
number of data 6053
number of resulting batches 33
[[0.32335329 0.45853659 0.50909091 ... 0.21443737 0.00335913 0.36567926]
 [0.34231537 0.44682927 0.53787879 ... 0.34394904 0.00326915 0.35648621]
 [0.31636727 0.41170732 0.58181818 ... 0.25690021 0.00315918 0.35648621]
 ...
 [0.9001996  0.92097561 0.44848485 ... 0.38428875 0.01007738 0.90806946]
 [0.87824351 0.88195122 0.41515152 ... 0.59872611 0.0095675

### Monthly batches (size=30)

In [5]:
X_test_batches_month, y_test_batches_month = helpers.split_to_fixed_size_batches(X_test, y_test, batch_size=30)
X_ref_batches_month, y_ref_batches_month = helpers.split_to_fixed_size_batches(X_ref, y_ref, batch_size=30)

print('# monthly test batches:')
print(len(X_test_batches_month))
print(len(y_test_batches_month))
print('# monthly ref batches:')
print(len(X_ref_batches_month))
print(len(y_ref_batches_month))

chunk size 30
number of chunks 404
number of data 12106
number of resulting batches 404
[[0.80538922 0.84       0.57272727 0.63404255 0.14285714 0.16985138
  0.00855777 0.7854954 ]
 [0.82035928 0.85853659 0.56060606 0.65106383 0.11278195 0.14861996
  0.00936756 0.7854954 ]
 [0.88922156 0.87804878 0.49242424 0.62978723 0.16165414 0.1910828
  0.00965749 0.86823289]
 [0.8992016  0.92682927 0.4530303  0.6212766  0.21052632 0.23142251
  0.00987743 0.87742594]
 [0.94111776 0.94829268 0.42272727 0.62978723 0.2556391  0.21019108
  0.01026733 0.92849847]
 [0.93013972 0.95219512 0.44090909 0.62978723 0.22180451 0.16985138
  0.00987743 0.94892748]
 [0.85129741 0.96878049 0.53181818 0.65957447 0.15413534 0.25690021
  0.00926759 0.8886619 ]
 [0.84431138 0.96585366 0.57878788 0.50638298 0.16541353 0.10828025
  0.0084678  0.8886619 ]
 [0.84231537 0.95317073 0.57575758 0.22978723 0.12030075 0.08280255
  0.00866775 0.89785495]
 [0.84231537 0.89170732 0.56515152 0.4893617  0.13909774 0.23142251
  0.0086

## Helper function

In [7]:
from eval_helpers import kmeans_verbose_helpers


def write_kmeans_results_ucdd_helper(output_filename_no_extension, ref_batches, n_init, max_iter, tol, random_state):
    # dummy = [np.asarray(1), np.asarray(2), np.asarray(3)]
    combinations = []
    for i in range(len(ref_batches)):
    #     combinations.append(np.vstack((dummy[i], dummy[(i + 1) % 3])))
        combinations.append(np.vstack((ref_batches[i], ref_batches[(i + 1) % 3])))
        
    all_results_from_combinations = []
    for i, combination in enumerate(combinations):
        filename = output_filename_no_extension + str(i) + '.txt'
        print('filename', filename)
        kmeans_verbose_helpers.write_verbose_kmeans_to_file(result_filename=output_filename_no_extension + str(i) + '.txt',
                                     data_to_cluster=combination,
                                     n_clusters=2, n_init=n_init, max_iter=max_iter, tol=tol, random_state=random_state)
        output_dicts = kmeans_verbose_helpers.convert_kmeans_output_file_to_dicts(filename, n_init=n_init)
        all_results_from_combinations.append(output_dicts)
        kmeans_verbose_helpers.print_stats_from_kmeans_output_dicts(output_dicts)
        
    kmeans_verbose_helpers.print_stats_from_all_combinations(all_results_from_combinations)

## Yearly evaluation

### Find suitable KMeans clustering parameters

In [8]:
write_kmeans_results_ucdd_helper('weather_output_year', X_ref_batches_year, n_init=100, max_iter=500, tol=0,
                                 random_state=1053)

filename weather_output_year0.txt
random state: 1053
total number of results: 100
maximum number of iterations: 9
minimum initial inertia: 72.6044230461005
maximum initial inertia: 214.77188559069705
number of unique final inertia values: 2
minimum final inertia: 62.88207943842754
maximum final inertia: 62.882083211285156
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename weather_output_year1.txt
random state: 1053
total number of results: 100
maximum number of iterations: 9
minimum initial inertia: 69.89183949344003
maximum initial inertia: 180.90670663035934
number of unique final inertia values: 1
minimum final inertia: 58.44487759109137
maximum final inertia: 58.44487759109137
total number of convergences: 100
number of strict convergences: 100
number of tol-based convergences: 0
filename weather_output_year2.txt
random state: 1053
total number of results: 100
maximum number of iterations: 11
minimum initial inertia: 84

### Read the defined yearly drift locations

In [9]:
import csv

true_drift_bool_year = []
with open('../Datasets_concept_drift/real_world_data_drifts/weather/weather_yearly_drifts.csv') as f:
    rdr = csv.reader(f, delimiter=',')
    for row in rdr:
        true_drift_bool_year.append(row)
    true_drift_bool_year = true_drift_bool_year[0] # only one row of important data
    true_drift_bool_year = [b == 'True' for b in true_drift_bool_year]
print(true_drift_bool_year)

[False, True, True, False, False, True, False, False, False, False, False, False, True, False, True, True, True, True, True, False, True, False, True, True, False, False, False, True, False, False, False, False, True, True]


### Use the drift locations and best clustering parameters for the evaluation

In [18]:
import importlib
from eval_helpers import ucdd_eval_real_world
importlib.reload(ucdd_eval_real_world)
from core import ucdd
importlib.reload(ucdd)

_, fpr_mean, _, det_acc_mean, _ = ucdd_eval_real_world.all_drifting_batches_randomness_robust(
    reference_data_batches=X_ref_batches_year,
    testing_data_batches=X_test_batches_year,
    true_drift_bool=true_drift_bool_year,
    min_ref_batches_drift=0.3,
    additional_check=True,
    n_init=100, max_iter=90, tol=0,
    first_random_state=0,
    min_runs=2, std_err_threshold=0.05
)

print('yearly mean FPR:', fpr_mean)
print('yearly mean detection accuracy:', det_acc_mean)

min_runs 2
random_state
0
BEFORE SPLITTING TO CLUSTERS
n_init 100 max_iter 90 tol 0
CLUSTERS - numpy shapes:
ref_plus (0, 8)
ref_minus (357, 8)
test_plus (1, 8)
test_minus (356, 8)


ValueError: Found array with 0 sample(s) (shape=(0, 8)) while a minimum of 1 is required by NearestNeighbors.