In [13]:
from IPython.core.interactiveshell import InteractiveShell
import pandas as pd
import numpy as np
import os
import random
import tqdm
from typing import Union

import matplotlib.pyplot as plt

%matplotlib inline
# Use a large default figure size (width, height in inches) for every plot.
plt.rcParams['figure.figsize'] = [20, 12]

# Show the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# `!cd` runs in a throw-away subshell, so it never changed the kernel's working
# directory. os.chdir applies the change to the running kernel itself.
os.chdir(r"C:\Users\Josef\Google Drive\Uni\Master\3 Wintersemester 20-21\Seminar Information Systems\Contribution")

In [3]:
# Folder holding the AMPds CSV files. The default is the original local path;
# set the AMPDS_PATH environment variable to point somewhere else. Joining
# with "" guarantees a trailing separator, because file names are appended to
# this value by plain string concatenation.
AMPds_PATH = os.path.join(
    os.environ.get(
        "AMPDS_PATH",
        "C:/Users/Josef/Google Drive/Uni/Master/3 Wintersemester 20-21/Seminar Information Systems/Contribution/data/AMPds/",
    ),
    "",
)

In [290]:
# TODO: Implement window length

class DataStreamerNilm:
    """Serve fixed-size batches of windowed data from a DataFrame.

    The input frame is reduced to the mains column plus the selected
    appliance columns, cut into consecutive, non-overlapping windows of
    ``window_size`` rows, optionally shuffled window-wise and grouped into
    batches of ``batch_size`` windows. No preprocessing is applied, so the
    input data needs to be processed beforehand.

    Rows that do not fill a complete window and windows that do not fill a
    complete batch are dropped.
    """

    def __init__(
        self,
        dataset,
        mains_col: str,
        appliance_cols: Union[str, list],
        batch_size: int = 8192,
        window_size: int = 1,
        shuffle: bool = False,
        chunksize: int = -1,
        random_state: int = None
    ):
        """Initialize NILM data streamer and prepare the first pass.

        Args:
            dataset: pd.DataFrame of mains and appliance data.
              TODO: Load file from disk.
            mains_col: Name of the column containing the mains readings.
            appliance_cols: Either a single name or a list/tuple of appliance
              column names to return. Names that are not columns of
              ``dataset`` are silently left out by ``DataFrame.filter``.
            batch_size: Number of windows per batch.
            window_size: In case sequential training data is needed, each
              batch item consists of a time window with given length. Leave at
              1 to return independent singular observations.
            shuffle: Shuffle data before batching. The data is first split
              into window-sized continuous chunks and the chunks are then
              shuffled, so the row order inside a window is preserved.
            chunksize: Currently not implemented; the value is only stored.
              TODO: If file is loaded from disk, enable chunkwise loading.
            random_state: Seed passed to ``np.random.seed`` to get
              reproducible shuffling results. ``None`` leaves NumPy's global
              random state untouched.

        Raises:
            ValueError: If ``batch_size`` or ``window_size`` is smaller than 1.
            KeyError: If ``mains_col`` is not a column of ``dataset``.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")
        if window_size < 1:
            raise ValueError("window_size must be a positive integer.")
        # Without this check a missing mains column would silently promote the
        # first appliance column to "mains" after filtering.
        if mains_col not in dataset.columns:
            raise KeyError(f"mains_col {mains_col!r} not found in dataset.")

        self.mains_col = mains_col
        self.appliance_cols = appliance_cols
        self.batch_size = batch_size
        self.window_size = window_size
        self.shuffle = shuffle
        self.chunksize = chunksize
        self.random_state = random_state
        # Compare against None so that a seed of 0 is honoured as well.
        if self.random_state is not None:
            np.random.seed(self.random_state)

        # We only need to keep mains + selected appliances in memory. The
        # mains column goes first because reset_iterator splits it off by
        # position.
        if isinstance(appliance_cols, str):
            selected_cols = [mains_col, appliance_cols]
        else:
            selected_cols = [mains_col] + list(appliance_cols)
        self.dataset = dataset.filter(selected_cols)

        self.reset_iterator(self.dataset)

    def generate_batch(self):
        """Return the next batch as a ``(target, features)`` pair.

        Returns:
            Two lists with ``batch_size`` arrays each. Every ``target`` array
            has shape ``(window_size, 1)`` and holds the first (mains) column
            of one window; the matching ``features`` array holds the remaining
            (appliance) columns of the same window.

        Raises:
            StopIteration: If all batches have been consumed. Call
              ``reset_iterator`` to start a new pass.
        """
        target, features = next(self.dataset_iterator)
        return target, features

    def _dataset_to_list(self, data: pd.DataFrame, mains_col: str):
        """Unfinished helper; splits off the mains column but returns nothing.

        NOTE(review): not called anywhere in this class and the two locals are
        discarded - either finish or remove it.
        """
        # Steps:
        # 1 Split into rows or chunks
        # 2 Create numpy arrays with format
        # batch_size x window_length x appliances/data

        mains = data[mains_col].values
        appliances = data.drop(columns=[mains_col])

    def _dataset_iterator(self, data):
        """Yield the items of ``data`` one by one.

        Expects an iterable of prepared batches, each a
        ``[target_list, feature_list]`` pair as built by ``reset_iterator``.
        """
        yield from data

    def _fill_cache(self):
        """Placeholder for a sample cache; not implemented yet."""
        raise NotImplementedError

    def reset_iterator(self, data: pd.DataFrame) -> None:
        """Rebuild all batches from ``data`` and restart the batch iterator.

        Args:
            data: Frame whose first column is split off as target and whose
              remaining columns become the features.
        """
        n_rows, n_cols = data.shape

        # Cut off the trailing rows that do not fill a whole window so the
        # reshape below works.
        n_windows = n_rows // self.window_size
        usable_rows = n_windows * self.window_size
        windows = data.iloc[:usable_rows].to_numpy().reshape(
            (n_windows, self.window_size, n_cols)
        )

        if self.shuffle:
            # to_numpy() may hand out a view of (or a read-only buffer from)
            # the frame; shuffle a private copy so the stored dataset is never
            # reordered.
            windows = windows.copy()
            np.random.shuffle(windows)

        # Separate every window into its first column (target) and the
        # remaining columns (features) in one slicing step.
        targets = windows[:, :, :1]
        features = windows[:, :, 1:]

        # Group the windows into batches; windows that do not fill a complete
        # batch are dropped.
        n_batches = n_windows // self.batch_size
        batches = []
        for start in range(0, n_batches * self.batch_size, self.batch_size):
            stop = start + self.batch_size
            batches.append([list(targets[start:stop]),
                            list(features[start:stop])])

        self.dataset_iterator = self._dataset_iterator(batches)

In [291]:
# Demo streamer: mains "MHE" with the appliances "FGE" and "UNE", served in
# original order as batches of 2 windows with 8 rows each.
ts = DataStreamerNilm(
    dataset=amp,
    mains_col="MHE",
    appliance_cols=["FGE", "UNE"],
    batch_size=2,
    window_size=8,
    shuffle=False,
)

In [289]:
# Peek at the filtered frame kept by the streamer (mains + selected appliances).
ts.dataset.head(20)

Unnamed: 0_level_0,MHE,FGE,UNE
UNIX_TS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-04-01 07:00:00,873,0,138
2012-04-01 07:01:00,870,0,138
2012-04-01 07:02:00,827,0,138
2012-04-01 07:03:00,827,0,142
2012-04-01 07:04:00,729,0,45
2012-04-01 07:05:00,836,0,43
2012-04-01 07:06:00,823,0,37
2012-04-01 07:07:00,951,0,176
2012-04-01 07:08:00,925,0,152
2012-04-01 07:09:00,871,0,141


In [233]:
# Read the power readings and the hourly weather table from the AMPds folder.
amp = pd.read_csv(AMPds_PATH + "Electricity_P.csv")
weather = pd.read_csv(AMPds_PATH + "Climate_HourlyWeather.csv")

# Index the power readings by timestamp; UNIX_TS is parsed as epoch seconds.
# The UNIX_TS column itself stays in the frame.
amp = amp.set_index(pd.to_datetime(amp["UNIX_TS"], unit="s"))

In [293]:
# Pull the next batch: y holds the windows of the first (mains) column,
# X the windows of the remaining (appliance) columns.
y, X = ts.generate_batch()

In [296]:
# Display both halves of the current batch.
y, X

([array([[2735],
         [2753],
         [2742],
         [2651],
         [2636],
         [2664],
         [2664],
         [2548]], dtype=int64),
  array([[2555],
         [2552],
         [2564],
         [2550],
         [2570],
         [2573],
         [2570],
         [2559]], dtype=int64)],
 [array([[127, 143],
         [127, 144],
         [125, 163],
         [124,  59],
         [123,  54],
         [122,  61],
         [122,  52],
         [  0,  47]], dtype=int64),
  array([[ 0, 55],
         [ 0, 56],
         [ 0, 64],
         [ 0, 53],
         [10, 54],
         [ 9, 53],
         [ 0, 64],
         [ 0, 60]], dtype=int64)])

In [167]:
# NOTE(review): `res` is only assigned in a cell further down (reshape of `f`),
# so this cell fails on a fresh top-to-bottom run.
res

array([[[ 925,    0,    0],
        [ 871,    0,    0],
        [ 880,    0,    0],
        [ 880,    0,    0],
        [2735,  127,    0]],

       [[2753,  127,    0],
        [2742,  125,    0],
        [2651,  124,    0],
        [ 880,    0,    0],
        [2298,    0,    0]],

       [[2625,  143,    0],
        [2701,  130,    0],
        [ 873,    0,    0],
        [ 870,    0,    0],
        [ 827,    0,    0]],

       [[ 827,    0,    0],
        [ 729,    0,    0],
        [ 836,    0,    0],
        [ 823,    0,    0],
        [ 951,    0,    0]]], dtype=int64)

In [191]:
# Take the first window and split it column-wise after column 0:
# first column on its own, all remaining columns together.
arr = res[0]
np.hsplit(arr, [1])

[array([[ 925],
        [ 871],
        [ 880],
        [ 880],
        [2735]], dtype=int64),
 array([[  0,   0],
        [  0,   0],
        [  0,   0],
        [  0,   0],
        [127,   0]], dtype=int64)]

In [206]:
# Split every window in `res` into its first column (y) and the remaining
# columns (X) and collect the pieces window by window.
y_list = []
X_list = []
for i in res:
    # hsplit at index 1 -> [first column, remaining columns]
    y, X  = np.hsplit(i, [1])
    y_list.append(y)
    X_list.append(X)

In [108]:
# Small sample (first 20 rows) of the streamer's frame for reshape experiments.
f = ts.dataset.head(20)
batch_size = 4
# Show the sample as a plain 2-D array (rows x columns).
f.to_numpy()

array([[ 873,    0,    0],
       [ 870,    0,    0],
       [ 827,    0,    0],
       [ 827,    0,    0],
       [ 729,    0,    0],
       [ 836,    0,    0],
       [ 823,    0,    0],
       [ 951,    0,    0],
       [ 925,    0,    0],
       [ 871,    0,    0],
       [ 880,    0,    0],
       [ 880,    0,    0],
       [ 880,    0,    0],
       [2298,    0,    0],
       [2625,  143,    0],
       [2701,  130,    0],
       [2735,  127,    0],
       [2753,  127,    0],
       [2742,  125,    0],
       [2651,  124,    0]], dtype=int64)

In [159]:
# Cut the sample into 4 windows of 5 rows each; -1 lets NumPy infer the
# number of columns.
res = f.to_numpy().reshape((4, 5, -1))
# Show everything except the last two windows.
res[:-2]

array([[[ 925,    0,    0],
        [ 871,    0,    0],
        [ 880,    0,    0],
        [ 880,    0,    0],
        [2735,  127,    0]],

       [[2753,  127,    0],
        [2742,  125,    0],
        [2651,  124,    0],
        [ 880,    0,    0],
        [2298,    0,    0]]], dtype=int64)

In [163]:
# Group the windows in pairs: (batch, window in batch, row, column). The
# hard-coded 5 and 3 must match the window length and column count of `res`.
res1 = res.reshape((-1, 2, 5, 3))

In [204]:
# Chunk `l` into consecutive, non-overlapping batches of two items each.
# The previous slices l[i:i+2] advanced by one item, so neighbouring batches
# shared an element. The number of batches is unchanged (len(l) // 2).
# NOTE(review): `l` is not defined in any visible cell - presumably a list of
# per-window [y, X] pairs; confirm before re-running.
batches = [l[start:start + 2] for start in range(0, len(l) // 2 * 2, 2)]

In [205]:
# Inspect the first batch.
batches[0]

[[array([[ 925],
         [ 871],
         [ 880],
         [ 880],
         [2735]], dtype=int64),
  array([[  0,   0],
         [  0,   0],
         [  0,   0],
         [  0,   0],
         [127,   0]], dtype=int64)],
 [array([[2753],
         [2742],
         [2651],
         [ 880],
         [2298]], dtype=int64),
  array([[127,   0],
         [125,   0],
         [124,   0],
         [  0,   0],
         [  0,   0]], dtype=int64)]]

In [82]:
# filter() with a list of names keeps only those columns - here just "MHE".
amp.filter(["MHE"])


Unnamed: 0_level_0,MHE
UNIX_TS,Unnamed: 1_level_1
2012-04-01 07:00:00,873
2012-04-01 07:01:00,870
2012-04-01 07:02:00,827
2012-04-01 07:03:00,827
2012-04-01 07:04:00,729
...,...
2014-04-01 06:55:00,352
2014-04-01 06:56:00,340
2014-04-01 06:57:00,347
2014-04-01 06:58:00,352


In [48]:
# Scratch check for combining a list and a string into one list: wrapping
# both in a list keeps `t` as a single nested element next to the string.
t = [1, 2, 3]
#t = "cvx"
s = "asdf"
[x for x in [t] + [s]]

[[1, 2, 3], 'asdf']

In [56]:
# Prepend the string as a one-element list to the items of `t`.
# NOTE(review): the recorded output contains extra 'asdf' entries, so `t` was
# presumably modified by cells that are no longer part of the notebook.
[s] + t

['asdf', 1, 2, 3, 'asdf', 'asdf']