# MSSW on ELECT2 evaluation

## Load and preprocess the ELECT2 dataset

In [1]:
import csv

import numpy as np
import pandas as pd
import seaborn as sb

In [27]:
from eval_helpers import accepting

# Number of leading rows that form the reference split; every later row goes to
# the test split. Defined once so features and labels are always cut at the
# same position.
N_REFERENCE_ROWS = 15104

df = pd.read_csv("../Datasets_concept_drift/real_world_data/electricity_dataset.csv")

# Features: every column except the unnamed leading column, the label and `real_date`.
X = df.drop(columns=['Unnamed: 0', 'label', 'real_date'])
y = df["label"]

# `day` arrives as text with the number wrapped in single quotes (presumably
# something like b'2' -- confirm against the CSV); keep the part between the
# first pair of quotes and cast it to float.
X['day'] = X['day'].str.split("'").str[1].astype(float)

print(X.head())
print('DTYPES')
print(X.dtypes)

print('# nan values in X', np.count_nonzero(np.isnan(X)))

# Positional split into reference / test parts, converted straight to NumPy arrays.
X_ref = X.iloc[:N_REFERENCE_ROWS].to_numpy()
X_test = X.iloc[N_REFERENCE_ROWS:].to_numpy()
y_ref = y.iloc[:N_REFERENCE_ROWS].to_numpy()
y_test = y.iloc[N_REFERENCE_ROWS:].to_numpy()

print('\nreference data')
print(X_ref.shape)
print(X_ref)
print('\nreference labels')
print(y_ref.shape)
print('\ntesting data')
print(X_test.shape)
print(X_test)
print('\ntesting labels')
print(y_test.shape)

   date  day    period  nswprice  nswdemand  vicprice  vicdemand  transfer
0   0.0  2.0  0.000000  0.056443   0.439155  0.003467   0.422915  0.414912
1   0.0  2.0  0.021277  0.051699   0.415055  0.003467   0.422915  0.414912
2   0.0  2.0  0.042553  0.051489   0.385004  0.003467   0.422915  0.414912
3   0.0  2.0  0.063830  0.045485   0.314639  0.003467   0.422915  0.414912
4   0.0  2.0  0.085106  0.042482   0.251116  0.003467   0.422915  0.414912
DTYPES
date         float64
day          float64
period       float64
nswprice     float64
nswdemand    float64
vicprice     float64
vicdemand    float64
transfer     float64
dtype: object
# nan values in X 0

reference data
(15104, 8)
[[0.       2.       0.       ... 0.003467 0.422915 0.414912]
 [0.       2.       0.021277 ... 0.003467 0.422915 0.414912]
 [0.       2.       0.042553 ... 0.003467 0.422915 0.414912]
 ...
 [0.434052 1.       0.617021 ... 0.003467 0.422915 0.414912]
 [0.434052 1.       0.638298 ... 0.003467 0.422915 0.414912]
 [0.

## Split the dataset into batches (size=365)

In [28]:
from eval_helpers import helpers

# Cut the test split first and the reference split second, so that anything the
# helper prints while splitting keeps its original order.
# NOTE(review): despite the `_year` suffix, each batch is 365 consecutive rows
# (batch_size=365), not necessarily one calendar year -- confirm the naming.
X_test_batches_year, y_test_batches_year = helpers.split_to_fixed_size_batches(
    X_test, y_test, batch_size=365
)
X_ref_batches_year, y_ref_batches_year = helpers.split_to_fixed_size_batches(
    X_ref, y_ref, batch_size=365
)

# Report, per split, how many feature batches and label batches were produced.
for heading, feature_batches, label_batches in (
    ('# yearly test batches:', X_test_batches_year, y_test_batches_year),
    ('# yearly ref batches:', X_ref_batches_year, y_ref_batches_year),
):
    print(heading)
    print(len(feature_batches))
    print(len(label_batches))

chunk size 365
number of chunks 82
number of data 30208
shape of X that can be split perfectly (29930, 8)
number of resulting batches 82
[[0.434052 1.       0.680851 ... 0.003467 0.422915 0.414912]
 [0.434052 1.       0.702128 ... 0.003467 0.422915 0.414912]
 [0.434052 1.       0.723404 ... 0.003467 0.422915 0.414912]
 ...
 [0.434406 2.       0.212766 ... 0.003467 0.422915 0.414912]
 [0.434406 2.       0.234043 ... 0.003467 0.422915 0.414912]
 [0.434406 2.       0.255319 ... 0.003467 0.422915 0.414912]]
(365, 8)
chunk size 365
number of chunks 41
number of data 15104
shape of X that can be split perfectly (14965, 8)
number of resulting batches 41
[[0.00000e+00 2.00000e+00 0.00000e+00 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [0.00000e+00 2.00000e+00 2.12770e-02 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [0.00000e+00 2.00000e+00 4.25530e-02 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 ...
 [3.10000e-04 2.00000e+00 5.53191e-01 ... 3.46700e-03 4.22915e-01
  4.14912e-01]
 [3.10000e-04 2.

## Find suitable KMeans clustering parameters

In [29]:
import importlib
from core import mssw_preprocessing
from eval_helpers import kmeans_verbose_helpers
# Re-execute the already-imported `mssw_preprocessing` module so that edits made
# to it since the kernel started are picked up without a kernel restart.
importlib.reload(mssw_preprocessing)

# Run the MSSW preprocessing on the reference and test batches. The call returns
# three values; only the first one is kept here, the other two are discarded.
# NOTE(review): the name suggests this is the weighted, joined reference data
# used for clustering below -- confirm against mssw_preprocess.
weighted_joined_reference_data, _, _ = mssw_preprocessing.mssw_preprocess(X_ref_batches_year, X_test_batches_year)

# Disabled experiment, kept for reference: writes verbose KMeans output for the
# preprocessed reference data to a file, parses it back into dicts and prints
# statistics via `kmeans_verbose_helpers`.
# filename = 'elec_output.txt'
# print(kmeans_verbose_helpers)
# kmeans_verbose_helpers.write_verbose_kmeans_to_file(filename, weighted_joined_reference_data,
#                                                     n_clusters=10, n_init=100, max_iter=500, tol=0, random_state=1053)
# output_dicts = kmeans_verbose_helpers.convert_kmeans_output_file_to_dicts(filename, n_init=100)
# kmeans_verbose_helpers.print_stats_from_kmeans_output_dicts(output_dicts)

first few reference data points after scaling
[[2.22044605e-11 1.66666667e-01 2.22044605e-11 9.33550125e-02
  6.45216941e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 2.12770000e-02 6.52037171e-02
  6.04968436e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 4.25530000e-02 6.39575594e-02
  5.54781389e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 6.38300000e-02 2.83293179e-02
  4.37267444e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 8.51060000e-02 1.05092631e-02
  3.31180066e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 1.06383000e-01 2.67033789e-03
  2.58385384e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 1.27660000e-01 2.67033789e-03
  1.98757474e-01 2.22044605e-11 2.22044605e-11 2.22044605e-11]
 [2.22044605e-11 1.66666667e-01 1.48936000e-01 2.67033789e-03
  1.66956144e-01 

  return weighted_data


## Try different numbers of clusters

In [None]:
from yellowbrick.cluster.elbow import kelbow_visualizer
from yellowbrick.datasets.loaders import load_nfl
from sklearn.cluster import KMeans


# Settings shared by every KMeans fit in the elbow search. The cluster count is
# not set here; the visualizer supplies it for each candidate k.
# NOTE(review): tol=0 together with a large max_iter presumably lets each run
# iterate until the assignments stop changing -- confirm against the sklearn docs.
kmeans_settings = dict(n_init=100, max_iter=10000, tol=0, random_state=0)
kmeans = KMeans(**kmeans_settings)

# Elbow plot over the preprocessed reference data.
# NOTE(review): yellowbrick is expected to expand the tuple k=(2, 30) into the
# candidate range of cluster counts -- confirm whether the upper bound is inclusive.
kelbow_visualizer(kmeans, weighted_joined_reference_data, k=(2, 30))



