In [1]:
import json 
from pathlib import Path
import shutil

from wattile.entry_point import init_logging, create_input_dataframe, run_model
from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import correct_predictor_columns, correct_timestamps, rolling_stats, input_data_split
from wattile.time_processing import add_processed_time_columns
from wattile.entry_point import run_model


PROJECT_DIRECTORY = Path().resolve().parent.parent

# read configs

In [2]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
##################################################################################
# choose the configs file to use as an input
##################################################################################
# # main configs file
# with open(PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r") as f:
#     configs = json.load(f)
##################################################################################
# code testing configs file
with open(PROJECT_DIRECTORY / "tests" / "fixtures" / "test_configs.json", "r") as f:
    configs = json.load(f)
##################################################################################

# Start from an empty experiment directory: anything left there by a previous
# run of this notebook is deleted before the directory is recreated.
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
exp_dir.mkdir()

# Point the loaded configs at this checkout's experiment and data directories
# (stored as plain strings, overriding whatever the JSON file contained).
configs["exp_dir"] = str(exp_dir)
configs["data_dir"] = str(PROJECT_DIRECTORY / "data")

# Display the resolved configs as the cell output.
configs

{'building': 'Synthetic Site',
 'target_var': 'Synthetic Site Electricity Main Total Power',
 'start_time': '2018-01-01T00:00:00-07:00',
 'end_time': '2022-01-01T00:00:00-07:00',
 'data_time_interval_mins': 1,
 'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
  'Synthetic Weather Station Diffuse Horizontal Irradiance',
  'Synthetic Weather Station Direct Normal Irradiance',
  'Synthetic Weather Station Dry Bulb Temperature',
  'Synthetic Weather Station Global Horizontal Irradiance',
  'Synthetic Weather Station Relative Humidity',
  'Synthetic Weather Station Wind Speed'],
 'arch_version': 4,
 'exp_id': 'Debugging_for_rolling_stats',
 'arch_type': 'RNN',
 'arch_type_variant': 'lstm',
 'transformation_method': 'minmaxscale',
 'train_batch_size': 5,
 'val_batch_size': 1,
 'convert_csvs': False,
 'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir',
 'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcamp

# read input data

In [3]:
# Which input the data-construction cell builds. Values that cell recognises:
# "complete example data", "incomplete small example data1", "incomplete small example data2"
datatype = "complete example data"
# Only used with "complete example data": when True, the data is degraded with
# irregular sampling intervals, randomly placed NaNs and random timestamps.
incompleteness = True
# Columns to keep in `data_test`; an empty list keeps every column.
# col_test = ['Synthetic Weather Station Direct Normal Irradiance']
col_test = []

In [4]:
import pandas as pd
from pandas.tseries.frequencies import to_offset
import numpy as np
import plotly.graph_objects as go

In [5]:
def random_dates(start, end, n):
    """Return `n` random timestamps, at whole-second resolution, drawn from [start, end).

    `start` and `end` are pandas Timestamps. The draw uses NumPy's global random
    state, and the returned DatetimeIndex is timezone-naive.
    """
    start_u = start.value // 10**9  # nanoseconds -> whole seconds since the epoch
    end_u = end.value // 10**9

    return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')


# Build `data_test` for the selected `datatype`.
if datatype == "complete example data":
    # Firstly, read the raw data from the dataset.
    # Check out the docs for an in-depth explanation of the necessary dataset structure.
    data = read_dataset_from_file(configs)

    # Work on a copy so the raw `data` frame stays untouched.
    data_temp = data.copy()

    if incompleteness:

        # adding irregular measurement intervals: each listed column is resampled
        # to its own interval and shifted in time, which leaves NaNs on the
        # original timestamps that no longer line up.
        list_cols = [
            'Synthetic Weather Station Dew Point Temperature',
            'Synthetic Weather Station Diffuse Horizontal Irradiance',
            'Synthetic Weather Station Global Horizontal Irradiance',
        ]
        list_interval_mins = [3, 5, 7]
        list_timeshift_mins = [0, 3, 7]

        for col, timestep, shift_mins in zip(list_cols, list_interval_mins, list_timeshift_mins):

            print("resampling and shifting column = {} with resampling timestep of {} and time-shift of {}".format(col, timestep, shift_mins))

            # "min" is used instead of the "T" alias, which pandas 2.2 deprecated.
            resampled = data_temp[col].resample("{}min".format(timestep)).mean()
            resampled.index = resampled.index + to_offset("{}min".format(shift_mins))
            data_temp[col] = resampled

        # adding NaNs in random places: blank a random 10% of the rows in a
        # random 10% of the columns (fixed random_state values keep it repeatable).
        fraction = 0.1
        list_index_random = data_temp.sample(frac=fraction, replace=False, random_state=1).index.tolist()
        list_column_random = pd.DataFrame(data_temp.columns).sample(frac=fraction, replace=False, random_state=2).iloc[:, 0].tolist()

        # One masked assignment instead of a Python loop over every (row, column)
        # pair; np.nan replaces the np.NAN alias that NumPy 2.0 removed.
        rows_to_blank = data_temp.index.isin(list_index_random)
        cols_to_blank = data_temp.columns.isin(list_column_random)
        data_temp.loc[rows_to_blank, cols_to_blank] = np.nan

        # adding irregular/random timestamps: replace the index with sorted random
        # timestamps spanning the same period (seeded so the result is repeatable).
        np.random.seed(seed=1)
        start = data_temp.index[0]
        end = data_temp.index[-1]
        n = data_temp.shape[0]
        datetime_random = random_dates(start, end, n).sort_values()
        data_temp.index = datetime_random

    # `data_test` is assigned whether or not the data was degraded; previously it
    # was left undefined when `incompleteness` was False.
    if col_test:
        data_test = data_temp.loc[:, data_temp.columns.isin(col_test)]
    else:
        data_test = data_temp.copy()

elif datatype == "incomplete small example data1":

    # Small hand-written example: var1 is sampled at irregular times, var2 every
    # five minutes, and each is NaN wherever the other one has a value.
    small_example = pd.DataFrame(
        {
            "ts": [
                "01:00:00",
                "01:01:53",
                "01:03:17",
                "01:04:02",
                "01:04:59",
                "01:05:00",
                "01:06:22",
                "01:09:46",
                "01:10:00",
                "01:11:22",
                "01:13:44",
                "01:14:26",
                "01:15:00",
            ],
            "var1": [np.nan, 1.5, 2.2, 0.9, 3.6, np.nan, 3.3, 2.3, np.nan, 1.3, 4.3, 4.1, np.nan],
            "var2": [1.0, np.nan, np.nan, np.nan, np.nan, 2.0, np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 4.0],
        }
    )
    small_example["var1"] = small_example["var1"].astype(float)
    small_example["var2"] = small_example["var2"].astype(float)
    small_example["ts"] = pd.to_datetime(small_example["ts"])
    data_test = small_example.set_index("ts")

elif datatype == "incomplete small example data2":
    # Same kind of small example, read from the test fixture instead.
    data_test = pd.read_csv(
        "../../tests/fixtures/rolling_stats_input.csv",
        index_col=0,
    )
    # Anything that does not parse as a number becomes NaN.
    data_test['var1'] = pd.to_numeric(data_test['var1'], errors='coerce')
    data_test['var2'] = pd.to_numeric(data_test['var2'], errors='coerce')
    data_test['var1'] = data_test['var1'].astype(float)
    data_test['var2'] = data_test['var2'].astype(float)
    data_test.index = pd.to_datetime(data_test.index, exact=False, utc=True)
    data_test = data_test[['var1', 'var2']]

else:
    # Fail with a clear message instead of a NameError on the display line below.
    raise ValueError("unrecognised datatype: {!r}".format(datatype))

data_test

resampling and shifting column = Synthetic Weather Station Dew Point Temperature with resampling timestep of 3 and time-shift of 0
resampling and shifting column = Synthetic Weather Station Diffuse Horizontal Irradiance with resampling timestep of 5 and time-shift of 3
resampling and shifting column = Synthetic Weather Station Global Horizontal Irradiance with resampling timestep of 7 and time-shift of 7


Unnamed: 0,Synthetic Weather Station Dew Point Temperature,Synthetic Weather Station Diffuse Horizontal Irradiance,Synthetic Weather Station Direct Normal Irradiance,Synthetic Weather Station Dry Bulb Temperature,Synthetic Weather Station Global Horizontal Irradiance,Synthetic Weather Station Relative Humidity,Synthetic Weather Station Wind Speed,Synthetic Site Electricity Main Total Power
2021-12-01 07:02:42,15.162953,,-0.005199,56.408001,,17.930000,10.457981,104.800003
2021-12-01 07:02:59,,,0.207964,56.174000,,17.879999,11.265539,96.650002
2021-12-01 07:04:03,,,0.457520,55.886002,,17.809999,12.777752,96.199997
2021-12-01 07:04:24,14.883944,-1.430474,0.473117,55.796001,,17.820000,12.694983,95.599998
2021-12-01 07:05:47,,,0.545903,55.723999,,17.820000,11.632407,100.650002
...,...,...,...,...,...,...,...,...
2021-12-08 06:51:52,,,-0.254757,42.285198,,24.680000,0.000000,109.664803
2021-12-08 06:53:18,,,-0.233960,41.997200,,24.930000,0.000000,107.002800
2021-12-08 06:53:19,11.276115,,-0.145575,41.669601,,25.290001,3.545647,106.480400
2021-12-08 06:53:56,,-1.392072,0.171570,41.180000,,25.920000,1.386941,110.419998


# pre-process data including feature extraction

In [6]:
# assert we have the correct columns and order them
# NOTE(review): from here on the pipeline consumes `data` (the frame returned by
# read_dataset_from_file, only defined for datatype "complete example data"),
# not the `data_test` frame built above — confirm this is intended.
data = correct_predictor_columns(configs, data)

In [7]:
# sort and trim data specified time period
data = correct_timestamps(configs, data)

In [8]:
# Add time-based features
data = add_processed_time_columns(data, configs)

In [9]:
# Presumably adds rolling-window statistics per feature: the frame displayed in
# the next cell has `_min` ... `_mean` suffixed columns — TODO confirm in the docs.
data = rolling_stats(data, configs)

In [10]:
data

Unnamed: 0_level_0,Synthetic Weather Station Dew Point Temperature_min,Synthetic Weather Station Diffuse Horizontal Irradiance_min,Synthetic Weather Station Direct Normal Irradiance_min,Synthetic Weather Station Dry Bulb Temperature_min,Synthetic Weather Station Global Horizontal Irradiance_min,Synthetic Weather Station Relative Humidity_min,Synthetic Weather Station Wind Speed_min,sin_HOD_min,cos_HOD_min,DOW_binary_reg_0_min,...,DOW_binary_reg_0_mean,DOW_binary_reg_1_mean,DOW_binary_reg_2_mean,DOW_binary_reg_3_mean,DOW_binary_reg_4_mean,DOW_binary_reg_5_mean,DOW_binary_reg_6_mean,sin_MOY_mean,cos_MOY_mean,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.930000,10.457981,0.965926,-2.588190e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,104.800003
2021-12-01 07:15:00+00:00,14.782966,-1.496226,-0.540707,55.112000,-1.873579,17.809999,9.395406,0.946930,-3.214395e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,103.650002
2021-12-01 07:30:00+00:00,15.474850,-1.379412,-1.741706,54.608002,-1.918852,18.690001,8.612455,0.923880,-3.826834e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,98.050003
2021-12-01 07:45:00+00:00,16.614972,-1.642929,-2.147240,53.743999,-2.029242,19.320000,4.503084,0.896873,-4.422887e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,98.349998
2021-12-01 08:00:00+00:00,16.413343,-1.480409,-0.426325,52.807999,-1.958610,21.090000,3.746977,0.866025,-5.000000e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,102.949997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 06:00:00+00:00,11.961447,-1.673728,-1.211398,41.388802,-1.909901,25.160000,0.000000,0.998135,-1.608123e-16,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,104.668396
2021-12-08 06:15:00+00:00,11.008480,-1.489343,-0.504314,41.646198,-1.936419,24.240000,0.000000,0.997859,-6.540313e-02,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,102.709602
2021-12-08 06:30:00+00:00,10.851442,-1.571301,-0.655083,42.024200,-1.834018,23.760000,0.000000,0.991445,-1.305262e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,105.972198
2021-12-08 06:45:00+00:00,10.827111,-1.430384,-0.826657,40.960400,-1.865315,24.330000,0.000000,0.980785,-1.950903e-01,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,112.989998


In [11]:
train_df, val_df = input_data_split(data, configs)

In [12]:
data.shape

(673, 55)

In [13]:
train_df.shape

(528, 55)

In [14]:
val_df.shape

(72, 55)

# test 2D-3D data conversion for S2S

### original version (kept as a string for reference; not executed)

In [15]:
"""
def generate_windows(data):
    
    # an important procedure to convert 2-dimensional data into 3-dimensional for modeling
    
    x_train = []
    y_usage_train = []
    x_test = []
    y_usage_test = []

    # for training data
    idxs = np.random.choice(
        train_source.shape[0] - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),
        train_source.shape[0] - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),
        replace=False,
    )

    for idx in idxs:
        x_train.append(
            train_source[idx : idx + WINDOW_SOURCE_SIZE].reshape(
                (1, WINDOW_SOURCE_SIZE, train_source.shape[1])
            )
        )
        y_usage_train.append(
            train_source[
                idx
                + WINDOW_SOURCE_SIZE : idx
                + WINDOW_SOURCE_SIZE
                + WINDOW_TARGET_SIZE,
                -1,
            ].reshape((1, WINDOW_TARGET_SIZE, 1))
        )

    x_train = np.concatenate(x_train, axis=0)  # make them arrays and not lists
    y_usage_train = np.concatenate(y_usage_train, axis=0)

    # for testing data
    idxs = np.arange(
        0,
        len(test_source) - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),
        WINDOW_TARGET_SIZE,
    )

    for idx in idxs:
        x_test.append(
            test_source[idx : idx + WINDOW_SOURCE_SIZE].reshape(
                (1, WINDOW_SOURCE_SIZE, test_source.shape[1])
            )
        )
        y_usage_test.append(
            test_source[
                idx
                + WINDOW_SOURCE_SIZE : idx
                + WINDOW_SOURCE_SIZE
                + WINDOW_TARGET_SIZE,
                -1,
            ].reshape((1, WINDOW_TARGET_SIZE, 1))
        )

    x_test = np.concatenate(x_test, axis=0)  # make them arrays and not lists
    y_usage_test = np.concatenate(y_usage_test, axis=0)

    return x_train, y_usage_train, x_test, y_usage_test
"""

'\ndef generate_windows(data):\n    \n    # an important procedure to convert 2-dimensional data into 3-dimensional for modeling\n    \n    x_train = []\n    y_usage_train = []\n    x_test = []\n    y_usage_test = []\n\n    # for training data\n    idxs = np.random.choice(\n        train_source.shape[0] - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),\n        train_source.shape[0] - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),\n        replace=False,\n    )\n\n    for idx in idxs:\n        x_train.append(\n            train_source[idx : idx + WINDOW_SOURCE_SIZE].reshape(\n                (1, WINDOW_SOURCE_SIZE, train_source.shape[1])\n            )\n        )\n        y_usage_train.append(\n            train_source[\n                idx\n                + WINDOW_SOURCE_SIZE : idx\n                + WINDOW_SOURCE_SIZE\n                + WINDOW_TARGET_SIZE,\n                -1,\n            ].reshape((1, WINDOW_TARGET_SIZE, 1))\n        )\n\n    x_train = np.concatenate(x_train, axis=0)  # ma

### updated version

In [16]:
# Number of consecutive rows that make up each predictor (source) window.
window_source_size = 12
# Number of rows in each target window, taken immediately after the source window.
window_target_size = 2

In [17]:
# Convert the splits to plain NumPy arrays so they can be sliced by position.
# np.asarray (rather than `.values`) keeps this cell re-runnable: on a second run
# the names already hold ndarrays, which have no `.values` attribute.
# NOTE: despite the `_df` suffix, both names refer to ndarrays from here on.
train_df = np.asarray(train_df)
val_df = np.asarray(val_df)

In [18]:
# Accumulators for the windowed samples; the loops in the following cells fill them.
train_predictor, train_target = [], []
valid_predictor, valid_target = [], []

# for training data: visit every candidate window start exactly once, in random order
n_train_starts = train_df.shape[0] - (window_source_size + window_target_size)
idxs = np.random.choice(n_train_starts, n_train_starts, replace=False)

In [19]:
idxs

array([ 47, 244, 277, 366, 505, 221, 241, 315, 129, 213, 425, 111, 488,
       187, 329, 412, 247, 405, 512, 218, 462, 360, 246, 258, 237, 482,
       508,  17, 439, 419, 233, 232, 300, 470,  41, 406, 179, 370, 264,
       293, 376, 189, 159,  85, 446, 250,  29,  49,   0,  66, 443,  92,
       510,  65, 334, 486, 223, 107,   9, 426,  90, 214, 491, 341, 433,
       124, 162, 252, 120, 254,  40, 346, 336, 119,  38, 402, 368,  34,
        67, 225, 172, 364, 283, 411, 284, 502,  62, 422, 490,  31, 242,
       434, 444,   5,  59, 350, 442, 466, 509, 438, 423, 392, 386, 468,
       413, 393, 165,   3,  68, 157, 147, 209,   4, 207, 379, 396, 260,
        19, 236, 324, 196, 135, 132, 117, 160, 504,  78, 400, 453, 328,
        42, 267, 436, 133, 289, 404, 431, 173, 448,  16,   7, 274, 344,
       180, 102, 318, 186, 146, 407, 128, 477, 286, 110, 347, 410, 311,
       323, 142, 409,   6, 399,  11, 473, 463, 493,  58, 467,  26, 382,
       292, 192, 101, 201, 104, 375, 312, 316,  18, 168, 385, 12

In [20]:
# For every sampled start row, keep the source window (all columns) and the
# target window that directly follows it (last column only).
for idx in idxs:
    source_end = idx + window_source_size
    target_end = source_end + window_target_size

    source_window = train_df[idx:source_end]
    train_predictor.append(
        source_window.reshape((1, window_source_size, train_df.shape[1]))
    )

    target_window = train_df[source_end:target_end, -1]
    train_target.append(target_window.reshape((1, window_target_size, 1)))

In [21]:
type(train_predictor)

list

In [22]:
type(train_target)

list

In [23]:
# Join the per-window arrays along axis 0:
#   train_predictor -> (n_windows, window_source_size, n_columns)
#   train_target    -> (n_windows, window_target_size, 1)
# NOTE: run this cell once per fill of the lists — concatenating the resulting
# 3-D array again would silently merge its first two axes.
train_predictor = np.concatenate(train_predictor, axis=0)  # make them arrays and not lists
train_target = np.concatenate(train_target, axis=0)

In [24]:
type(train_predictor)

numpy.ndarray

In [25]:
type(train_target)

numpy.ndarray

In [26]:
# for validation data: deterministic window starts, stepping by the target size
# so that consecutive target windows do not overlap
valid_start_stop = len(val_df) - (window_source_size + window_target_size)
idxs = np.arange(0, valid_start_stop, window_target_size)

In [27]:
idxs

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
       34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56])

In [28]:
# Same slicing as for the training data, applied to the validation array.
for idx in idxs:
    source_end = idx + window_source_size
    target_end = source_end + window_target_size

    source_window = val_df[idx:source_end]
    valid_predictor.append(
        source_window.reshape((1, window_source_size, val_df.shape[1]))
    )

    target_window = val_df[source_end:target_end, -1]
    valid_target.append(target_window.reshape((1, window_target_size, 1)))

In [29]:
type(valid_predictor)

list

In [30]:
type(valid_target)

list

In [31]:
# Join the per-window arrays along axis 0:
#   valid_predictor -> (n_windows, window_source_size, n_columns)
#   valid_target    -> (n_windows, window_target_size, 1)
# NOTE: run this cell once per fill of the lists — concatenating the resulting
# 3-D array again would silently merge its first two axes.
valid_predictor = np.concatenate(valid_predictor, axis=0)  # make them arrays and not lists
valid_target = np.concatenate(valid_target, axis=0)

In [32]:
type(valid_predictor)

numpy.ndarray

In [33]:
type(valid_target)

numpy.ndarray

# cleaned version

In [34]:
def _slice_windows(array, starts, window_source_size, window_target_size):
    """Cut source/target window pairs out of a 2-D array at the given start rows.

    Returns a `(predictor, target)` tuple where `predictor` has shape
    `(len(starts), window_source_size, n_columns)` and `target` has shape
    `(len(starts), window_target_size, 1)`, holding the last column of the rows
    that directly follow each source window.
    """
    starts = np.asarray(starts, dtype=int).reshape(-1, 1)
    # Row numbers of every window, one window per row of these index matrices.
    source_rows = starts + np.arange(window_source_size)
    target_rows = starts + window_source_size + np.arange(window_target_size)

    predictor = array[source_rows]
    target = array[target_rows, -1][..., np.newaxis]
    return predictor, target


def roll_full_data_s2s(train_df, val_df, configs, window_source_size=12, window_target_size=2):
    """Convert 2-D training/validation data into 3-D rolling windows for S2S models.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame or 2-D array-like
        Rows are time steps and columns are features; the last column is used
        as the target.
    configs : dict
        Currently unused.
        TODO: read the window sizes from configs once the parameters exist.
    window_source_size : int, default 12
        Number of consecutive rows in each predictor window.
    window_target_size : int, default 2
        Number of rows in each target window, taken right after the source window.

    Returns
    -------
    train_predictor, train_target, valid_predictor, valid_target : numpy.ndarray
        Predictors have shape `(n_windows, window_source_size, n_columns)` and
        targets `(n_windows, window_target_size, 1)`. Training windows come in a
        random order drawn from NumPy's global random state; validation windows
        are in time order, starting every `window_target_size` rows.

    Raises
    ------
    ValueError
        If either data set has too few rows to hold a single window.
    """
    # np.asarray accepts DataFrames as well as arrays that were already converted.
    train_array = np.asarray(train_df)
    val_array = np.asarray(val_df)

    window_span = window_source_size + window_target_size
    n_train_starts = train_array.shape[0] - window_span
    n_valid_starts = val_array.shape[0] - window_span
    if n_train_starts <= 0 or n_valid_starts <= 0:
        raise ValueError(
            "each data set needs more than {} rows to build a window "
            "(got {} training and {} validation rows)".format(
                window_span, train_array.shape[0], val_array.shape[0]
            )
        )

    # Training set: every candidate start row exactly once, in random order.
    # NOTE(review): start row `n_rows - window_span` would also fit but is not
    # used (same for validation); kept as-is to preserve the existing output.
    train_starts = np.random.choice(n_train_starts, n_train_starts, replace=False)
    train_predictor, train_target = _slice_windows(
        train_array, train_starts, window_source_size, window_target_size
    )

    # Validation set: ordered starts, one step per target window so that
    # consecutive target windows do not overlap.
    valid_starts = np.arange(0, n_valid_starts, window_target_size)
    valid_predictor, valid_target = _slice_windows(
        val_array, valid_starts, window_source_size, window_target_size
    )

    return train_predictor, train_target, valid_predictor, valid_target