In [1]:
import numpy as np
import pandas as pd
import random
import time
import math
import os
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init
from dateutil import parser
from pathlib import Path
import json 
import shutil
import logging
logger = logging.getLogger(str(os.getpid()))

from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import correct_predictor_columns, correct_timestamps, resample_or_rolling_stats
from wattile.time_processing import add_processed_time_columns
PROJECT_DIRECTORY = Path().resolve().parent.parent

PROJECT_DIRECTORY = C:\Users\JKIM4\Anaconda3\envs\wattile\Lib\site-packages\wattile


In [2]:
import copy

# reading configs

In [3]:
"""
For this example, we will be using the default configs.
Check out the docs for an explaination of each config.
"""
##################################################################################
# choose the configs file to use as an input
##################################################################################
# main configs file
with open(PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r") as f:
    configs = json.load(f)
##################################################################################
# code testing configs file
# with open(PROJECT_DIRECTORY / "tests" / "fixtures" / "test_configs.json", "r") as f:
#     configs = json.load(f)
##################################################################################

exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
exp_dir.mkdir()

configs["data_input"]["exp_dir"] = str(PROJECT_DIRECTORY / exp_dir)
configs["data_input"]["data_dir"] = str(PROJECT_DIRECTORY / "data" / "Synthetic Site")

configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
   'Synthetic Weather Station Diffuse Horizontal Irradiance',
   'Synthetic Weather Station Direct Normal Irradiance',
   'Synthetic Weather Station Dry Bulb Temperature',
   'Synthetic Weather Station Global Horizontal Irradiance',
   'Synthetic Weather Station Relative Humidity',
   'Synthetic Weather Station Wind Speed'],
  'target_var': 'Synthetic Site Electricity Main Total Power',
  'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir'},
 'data_output': {'exp_dir': 'exp_dir',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'fea

# reading data

In [4]:
def get_dummy_data(start, end, iterval):
    data = pd.DataFrame(index=pd.date_range(start, end, freq=iterval))
    data["var_1"] = data.index.month * 1000000 + data.index.day * 10000 + data.index.hour * 100 + data.index.minute
    data["target_var"] = -1 * (data["var_1"])

    return data

In [5]:
####################################################################
# read "our" synthetic data
####################################################################
configs["target_feat_name"] = [configs["data_input"]["target_var"]]
data, congfigs = read_dataset_from_file(configs)
####################################################################
# create much simpler dummy data
####################################################################
target_var = configs["data_input"]["target_var"]
data = get_dummy_data(data.index[0], data.index[-1], "1min")
data = data.rename(columns={"target_var":target_var})

data

Unnamed: 0,var_1,Synthetic Site Electricity Main Total Power
2021-12-01 07:00:00+00:00,12010700,-12010700
2021-12-01 07:01:00+00:00,12010701,-12010701
2021-12-01 07:02:00+00:00,12010702,-12010702
2021-12-01 07:03:00+00:00,12010703,-12010703
2021-12-01 07:04:00+00:00,12010704,-12010704
...,...,...
2021-12-08 06:55:00+00:00,12080655,-12080655
2021-12-08 06:56:00+00:00,12080656,-12080656
2021-12-08 06:57:00+00:00,12080657,-12080657
2021-12-08 06:58:00+00:00,12080658,-12080658


# test setting

In [6]:
configs["learning_algorithm"]["arch_version"] = "alfa"

configs["data_processing"]["resample"]["bin_interval"] = "15min"

configs["data_processing"]["feat_timelag"]["lag_interval"] = "15min"
configs["data_processing"]["feat_timelag"]["lag_count"] = 4

configs["data_processing"]["input_output_window"]["window_width_source"] = "180min"
configs["data_processing"]["input_output_window"]["window_width_target"] = "60min"
configs["data_processing"]["input_output_window"]["window_width_futurecast"] = "30min"

configs["learning_algorithm"]["use_case"] = "prediction"

# data processing

In [7]:
# assert we have the correct columns and order them
# data = correct_predictor_columns(configs, data)

# sort and trim data specified time period
# data = correct_timestamps(configs, data)

# Add time-based features
data = add_processed_time_columns(data, configs)

# Add statistics features
data = resample_or_rolling_stats(data, configs)

data

Unnamed: 0,var_1_min,sin_HOD_min,cos_HOD_min,HOD_binary_reg_0_min,HOD_binary_reg_1_min,HOD_binary_reg_2_min,HOD_binary_reg_3_min,HOD_binary_reg_4_min,HOD_binary_reg_5_min,HOD_binary_reg_6_min,...,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean,Synthetic Site Electricity Main Total Power
2021-12-01 07:00:00+00:00,12010700.0,0.965926,-2.588190e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010700
2021-12-01 07:15:00+00:00,12010701.0,0.946930,-3.214395e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010715
2021-12-01 07:30:00+00:00,12010716.0,0.923880,-3.826834e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010730
2021-12-01 07:45:00+00:00,12010731.0,0.896873,-4.422887e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010745
2021-12-01 08:00:00+00:00,12010746.0,0.866025,-5.000000e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.705556,0.294444,0.0,0.0,0.0,-0.493776,0.869589,-12010800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 06:00:00+00:00,12080546.0,0.998135,-1.608123e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.788889,0.211111,0.0,0.0,0.0,-0.385663,0.922640,-12080600
2021-12-08 06:15:00+00:00,12080601.0,0.997859,-6.540313e-02,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,-12080615
2021-12-08 06:30:00+00:00,12080616.0,0.991445,-1.305262e-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,-12080630
2021-12-08 06:45:00+00:00,12080631.0,0.980785,-1.950903e-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,-12080645


# test

### set timestamp for casting

- this is the timestamp that will be the basis for calculating relative window sizes
- and representing what time "now" is

In [8]:
timestamp_cast = pd.to_datetime("2021-12-06 13:30:00+00:00")
timestamp_cast

Timestamp('2021-12-06 13:30:00+0000', tz='UTC')

- below is ground truth target measurement(s) we want to predict from now (=`timestamp_cast`)

In [9]:
timestamp_predict = timestamp_cast + pd.Timedelta(configs["data_processing"]["input_output_window"]["window_width_futurecast"])
groundtruth_target = data.loc[data.index==timestamp_predict, configs["data_input"]["target_var"]].iloc[0]
data.loc[data.index==timestamp_predict, configs["data_input"]["target_var"]]

2021-12-06 14:00:00+00:00   -12061400
Freq: 15T, Name: Synthetic Site Electricity Main Total Power, dtype: int64

- so this value below is what we want to predict with `timestamp_cast` in deployment scenario

In [10]:
groundtruth_target

-12061400

### calculate input window width

- check if the script is up-to-date

In [11]:
#########################################################################
# FOR ALFA
#########################################################################
def get_input_window_for_output_time_alfa(datetime):
    """Given the time for which we want to predict, return the time window of the required
    input.
    :param output_time: the time for which we want to predict
    :type output_time: datatime
    :return: earliest time input should include, latest time input should include.
    :rtype: dt.datetime, datetime
    """

    # set prediction time with pandas timedelta
    timestamp_cast = pd.to_datetime(datetime)  # current time needs to go in here

    # set parameters
    config_data_processing = configs["data_processing"]
    lag_interval = config_data_processing["feat_timelag"]["lag_interval"]
    lag_count = config_data_processing["feat_timelag"]["lag_count"]
    config_input_output_window = config_data_processing["input_output_window"]
    window_width_futurecast = config_input_output_window["window_width_futurecast"]

    # calculating offsets
    window_start_offset = pd.Timedelta(lag_interval) * lag_count + pd.Timedelta(
        window_width_futurecast
    )
    window_end_offset = pd.Timedelta(window_width_futurecast)

    # calculating start and end time windows for input data
    prediction_window_start_time = timestamp_cast - window_start_offset
    prediction_window_end_time = timestamp_cast - window_end_offset

    return prediction_window_start_time, prediction_window_end_time


#########################################################################
# FOR BRAVO
#########################################################################
def get_input_window_for_output_time_bravo(datetime):
    """Given the time for which we want to predict, return the time window of the required
    input.
    :param output_time: the time for which we want to predict
    :type output_time: datatime
    :return: earliest time input should include, latest time input should include.
    :rtype: dt.datetime, datetime
    """

    # set prediction time with pandas timedelta
    timestamp_cast = pd.to_datetime(datetime)  # current time needs to go in here

    # set parameters
    config_data_processing = configs["data_processing"]
    lag_interval = config_data_processing["feat_timelag"]["lag_interval"]
    lag_count = config_data_processing["feat_timelag"]["lag_count"]
    config_input_output_window = config_data_processing["input_output_window"]
    window_width_futurecast = config_input_output_window["window_width_futurecast"]

    # calculating offsets
    window_start_offset = pd.Timedelta(lag_interval) * lag_count + pd.Timedelta(window_width_futurecast)
    window_end_offset = pd.Timedelta(window_width_futurecast)

    # calculating start and end time windows for input data
    prediction_window_start_time = timestamp_cast - window_start_offset
    prediction_window_end_time = timestamp_cast - window_end_offset

    return prediction_window_start_time, prediction_window_end_time


#########################################################################
# FOR CHARLIE
#########################################################################
def get_input_window_for_output_time_charlie(datetime):
    """Given the time for which we want to predict, return the time window of the required
    input.

    :param output_time: the time for which we want to predict
    :type output_time: datatime
    :return: earliest time input should include, latest time input should include.
    :rtype: dt.datetime, datetime
    """

    # set prediction time with pandas timedelta
    timestamp_cast = pd.to_datetime(datetime)  # current time needs to go in here

    # set parameters
    config_data_processing = configs["data_processing"]
    config_input_output_window = config_data_processing["input_output_window"]
    window_width_source = config_input_output_window["window_width_source"]
    window_width_target = config_input_output_window["window_width_target"]
    window_width_futurecast = config_input_output_window["window_width_futurecast"]

    # calculating offsets
    window_start_offset = pd.Timedelta(window_width_source) + pd.Timedelta(
        window_width_futurecast
    )
    window_end_offset = pd.Timedelta("0min")

    # calculating start and end time windows for input data
    prediction_window_start_time = timestamp_cast - window_start_offset
    prediction_window_end_time = timestamp_cast - window_end_offset

    return prediction_window_start_time, prediction_window_end_time

In [12]:
if configs["learning_algorithm"]["arch_version"] == "alfa":
    prediction_window_start_time, prediction_window_end_time = get_input_window_for_output_time_alfa(timestamp_cast)
    
elif configs["learning_algorithm"]["arch_version"] == "bravo":
    prediction_window_start_time, prediction_window_end_time = get_input_window_for_output_time_bravo(timestamp_cast)
    
elif configs["learning_algorithm"]["arch_version"] == "charlie":
    prediction_window_start_time, prediction_window_end_time = get_input_window_for_output_time_charlie(timestamp_cast)

print("prediction_window_start_time = {} | prediction_window_end_time = {}".format(prediction_window_start_time, prediction_window_end_time))

prediction_window_start_time = 2021-12-06 12:00:00+00:00 | prediction_window_end_time = 2021-12-06 13:00:00+00:00


### trim data based on input window size

- trim data based on previous output and before timelag data processing = `data_test`
- also retain data without trimming to set as a reference = `data_ref`

In [13]:
data_test = data.loc[prediction_window_start_time:prediction_window_end_time, :]
data_ref = copy.deepcopy(data)
data_test

Unnamed: 0,var_1_min,sin_HOD_min,cos_HOD_min,HOD_binary_reg_0_min,HOD_binary_reg_1_min,HOD_binary_reg_2_min,HOD_binary_reg_3_min,HOD_binary_reg_4_min,HOD_binary_reg_5_min,HOD_binary_reg_6_min,...,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean,Synthetic Site Electricity Main Total Power
2021-12-06 12:00:00+00:00,12061146.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.538889,0.461111,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818,-12061200
2021-12-06 12:15:00+00:00,12061201.0,-0.06540313,-0.99999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.5,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818,-12061215
2021-12-06 12:30:00+00:00,12061216.0,-0.1305262,-0.997564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.5,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818,-12061230
2021-12-06 12:45:00+00:00,12061231.0,-0.1950903,-0.990866,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.5,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818,-12061245
2021-12-06 13:00:00+00:00,12061246.0,-0.258819,-0.979925,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.497222,0.502778,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818,-12061300


### final data processing for reflecting timelags

- check if the script is up-to-date

In [14]:
def timelag_predictors(data, configs):
    
    """
    Create lagged versions of predictor variables in a DataFrame.
    Used specifically for alfa learning methods.
    :param data: (DataFrame)
    :param configs: (Dict)
    :return: (DataFrame)
    """

    # reading configuration parameters
    lag_interval = configs["data_processing"]["feat_timelag"]["lag_interval"]
    lag_count = configs["data_processing"]["feat_timelag"]["lag_count"]
    window_width_futurecast = configs["data_processing"]["input_output_window"][
        "window_width_futurecast"
    ]
    target_var = configs["data_input"]["target_var"]

    # splitting predictors and target
    target = data[target_var]
    data = data.drop(target_var, axis=1)
    data_orig = data

    # padding predictors
    temp_holder = list()
    temp_holder.append(data_orig)
    for i in range(1, lag_count + 1):
        shifted = (
            data_orig.shift(freq=i * lag_interval)
            .astype("float32")
            .add_suffix("_lag{}".format(i))
        )
        temp_holder.append(shifted)
    temp_holder.reverse()
    data = pd.concat(temp_holder, axis=1)

    if configs["learning_algorithm"]["use_case"] != "prediction":
        data[target_var] = target.shift(freq="-" + window_width_futurecast)
    # -------------------------------------------------------------------------------
    else:
        data_groundtruth = copy.deepcopy(data)
        data_groundtruth[target_var] = target.shift(freq="-" + window_width_futurecast)
    # -------------------------------------------------------------------------------

    data = data.dropna(how="any")
    
    return data, data_groundtruth

def timelag_predictors_target(data, configs):
    
    """
    Create lagged versions of predictor and target variables in a DataFrame.
    Used specifically for bravo learning methods.
    :param data: (DataFrame)
    :param configs: (Dict)
    :return: (DataFrame)
    """

    # reading configuration parameters
    lag_interval = configs["data_processing"]["feat_timelag"]["lag_interval"]
    lag_count = configs["data_processing"]["feat_timelag"]["lag_count"]
    window_width_target = configs["data_processing"]["input_output_window"][
        "window_width_target"
    ]
    window_width_futurecast = configs["data_processing"]["input_output_window"][
        "window_width_futurecast"
    ]
    bin_interval = configs["data_processing"]["resample"]["bin_interval"]
    initial_num = (pd.Timedelta(window_width_target) // pd.Timedelta(bin_interval)) + 1
    target_var = configs["data_input"]["target_var"]
    target_temp = data[target_var].copy()

    # shift target for futurecast
    data[target_var] = target_temp.shift(freq="-" + window_width_futurecast)

    # split predictors and target
    target = data[target_var]
    data = data.drop(target_var, axis=1)
    data_orig = data

    # Pad the exogenous variables
    temp_holder = list()
    temp_holder.append(data_orig)
    for i in range(1, lag_count + 1):
        shifted = (
            data_orig.shift(freq=i * lag_interval)
            .astype("float32")
            .add_suffix("_lag{}".format(i))
        )
        temp_holder.append(shifted)
    temp_holder.reverse()
    data = pd.concat(temp_holder, axis=1)

    # Do fine padding for future predictions. Create a new df to preserve memory usage.
    local = pd.DataFrame()
    for i in range(0, initial_num):
        if i == 0:
            local["{}_lag_{}".format(target_var, i)] = target.shift(i)
        else:
            local["{}_lag_{}".format(target_var, i)] = target.shift(
                freq="-" + (i * bin_interval)
            )

    if configs["learning_algorithm"]["use_case"] != "prediction":
        data = pd.concat([data, local], axis=1)

    data = data.dropna(how="any")
    
    return data

def roll_predictors_target(data, configs):
    
    """
    Create rolling windows of predictor and target variables in a DataFrame.
    Used specifically for charlie learning methods.
    :param data: (DataFrame)
    :param configs: (Dict)
    :return: (Dict)
    """

    # setting configuration parameters
    window_width_source = configs["data_processing"]["input_output_window"][
        "window_width_source"
    ]
    window_width_futurecast = configs["data_processing"]["input_output_window"][
        "window_width_futurecast"
    ]
    window_width_target = configs["data_processing"]["input_output_window"][
        "window_width_target"
    ]
    bin_interval = configs["data_processing"]["resample"]["bin_interval"]
    target_var = configs["data_input"]["target_var"]

    # initialize lists
    data_predictor = []
    data_target = []

    # calculate number of rows based on window size defined by time
    window_source_size_count = pd.Timedelta(window_width_source) // pd.Timedelta(
        bin_interval
    )
    window_target_size_count = pd.Timedelta(window_width_target) // pd.Timedelta(
        bin_interval
    )
    window_futurecast_size_count = pd.Timedelta(
        window_width_futurecast
    ) // pd.Timedelta(bin_interval)

    # set aside timeindex
    timestamp = data.iloc[
        window_source_size_count : -(
            window_target_size_count + window_futurecast_size_count
        ),
        :,
    ].index

    # if use case is training/validation, 
    # remove data at the end to properly match predictor and target entries
    if configs["learning_algorithm"]["use_case"] != "prediction":
        data_shifted_predictor = data.iloc[
            : -(window_target_size_count + window_futurecast_size_count), :
        ].loc[:, data.columns != target_var]
    # if use case is prediction (which only requires predictors only at the end), 
    # do not remove data at the end
    else:
        data_shifted_predictor = data.loc[:, data.columns != target_var]
        
    # create 3D predictor data based on rolling window
    for window in data_shifted_predictor.rolling(
        window=window_width_source, closed="both"
    ):
        if window.shape[0] == window_source_size_count + 1:
            data_predictor.append(
                window.values.reshape(
                    (1, window_source_size_count + 1, data_shifted_predictor.shape[1])
                )
            )
            
    # reshape data dimension
    data_predictor = np.concatenate(np.array(data_predictor), axis=0)

    # if use case is training/validation, 
    # remove data at the end to properly match predictor and target entries
#     if configs["learning_algorithm"]["use_case"] != "prediction":
    data_shifted_target = data.iloc[
        (window_source_size_count + window_futurecast_size_count) :, :
    ][target_var]
    # if use case is prediction (which only requires predictors only at the end), 
    # do not remove data at the end
#     else:
#         data_shifted_target = data[target_var]
        
    # create 3D target data based on rolling window
    for window in data_shifted_target.rolling(
        window=window_width_target, closed="both"
    ):
        print("############################################")
        print(window)
        if window.shape[0] == window_target_size_count + 1:
            data_target.append(
                window.values.reshape((1, window_target_size_count + 1, 1))
            )
            
    # reshape data dimension
    data_target = np.concatenate(np.array(data_target), axis=0)

    # combine 3D predictor and target data into dictionary
    data = {}
    data["predictor"] = data_predictor
    data["target"] = data_target
    data["timestamp"] = timestamp
    
    return data

### timelag data processing for trimmed data (deployment scenario)

- `output_test` is data that will go into trained model
- and where target is not needed

In [15]:
if configs["learning_algorithm"]["arch_version"] == "alfa":
    print("ALFA ###########################################################")
    output_test, output_dummy = timelag_predictors(data_test, configs)
    
elif configs["learning_algorithm"]["arch_version"] == "bravo":
    print("BRAVO ###########################################################")
    output_test, output_dummy = timelag_predictors_target(data_test, configs)

elif configs["learning_algorithm"]["arch_version"] == "charlie":
    print("CHARLIE ###########################################################")
    output_test, output_dummy = roll_predictors_target(data_test, configs)
    
output_test

ALFA ###########################################################


Unnamed: 0,var_1_min_lag4,sin_HOD_min_lag4,cos_HOD_min_lag4,HOD_binary_reg_0_min_lag4,HOD_binary_reg_1_min_lag4,HOD_binary_reg_2_min_lag4,HOD_binary_reg_3_min_lag4,HOD_binary_reg_4_min_lag4,HOD_binary_reg_5_min_lag4,HOD_binary_reg_6_min_lag4,...,DOW_binary_reg_6_mean,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean
2021-12-06 13:00:00+00:00,12061146.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.497222,0.502778,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818


### timelag data processing for entire data (training scenario)

- `output_groundtruth` is data that will be used for training the model
- so predictors and target(s) in each row is sort of a groundtruth pair

In [16]:
if configs["learning_algorithm"]["arch_version"] == "alfa":
    print("ALFA ###########################################################")
    output_dummy, output_groundtruth = timelag_predictors(data_ref, configs)
    
elif configs["learning_algorithm"]["arch_version"] == "bravo":
    print("BRAVO ###########################################################")
    output_dummy, output_groundtruth = timelag_predictors_target(data_ref, configs)

elif configs["learning_algorithm"]["arch_version"] == "charlie":
    print("CHARLIE ###########################################################")
    output_dummy, output_groundtruth = roll_predictors_target(data_ref, configs)
    
output_groundtruth

ALFA ###########################################################


Unnamed: 0,var_1_min_lag4,sin_HOD_min_lag4,cos_HOD_min_lag4,HOD_binary_reg_0_min_lag4,HOD_binary_reg_1_min_lag4,HOD_binary_reg_2_min_lag4,HOD_binary_reg_3_min_lag4,HOD_binary_reg_4_min_lag4,HOD_binary_reg_5_min_lag4,HOD_binary_reg_6_min_lag4,...,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean,Synthetic Site Electricity Main Total Power
2021-12-01 07:00:00+00:00,,,,,,,,,,,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010730.0
2021-12-01 07:15:00+00:00,,,,,,,,,,,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010745.0
2021-12-01 07:30:00+00:00,,,,,,,,,,,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010800.0
2021-12-01 07:45:00+00:00,,,,,,,,,,,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,-12010815.0
2021-12-01 08:00:00+00:00,12010700.0,0.965926,-2.588190e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.705556,0.294444,0.0,0.0,0.0,-0.493776,0.869589,-12010830.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 07:00:00+00:00,12080546.0,0.998135,-1.608123e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,
2021-12-08 07:15:00+00:00,12080601.0,0.997859,-6.540313e-02,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2021-12-08 07:30:00+00:00,12080616.0,0.991445,-1.305262e-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2021-12-08 07:45:00+00:00,12080631.0,0.980785,-1.950903e-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,


### questions

- what is the predictor set that was used (for training) for predicting the `groundtruth_target` value?

In [17]:
groundtruth_target

-12061400

In [18]:
predictors_used_for_training = output_groundtruth.loc[output_groundtruth[target_var]==groundtruth_target, :]
predictors_used_for_training = predictors_used_for_training.loc[:, predictors_used_for_training.columns!=target_var]
predictors_used_for_training

Unnamed: 0,var_1_min_lag4,sin_HOD_min_lag4,cos_HOD_min_lag4,HOD_binary_reg_0_min_lag4,HOD_binary_reg_1_min_lag4,HOD_binary_reg_2_min_lag4,HOD_binary_reg_3_min_lag4,HOD_binary_reg_4_min_lag4,HOD_binary_reg_5_min_lag4,HOD_binary_reg_6_min_lag4,...,DOW_binary_reg_6_mean,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean
2021-12-06 13:30:00+00:00,12061216.0,-0.130526,-0.997564,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.458333,0.541667,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818


- what is the predictor set that resulted from current deployment workflow via window calculation?

In [19]:
predictors_used_for_deployment = output_test
predictors_used_for_deployment

Unnamed: 0,var_1_min_lag4,sin_HOD_min_lag4,cos_HOD_min_lag4,HOD_binary_reg_0_min_lag4,HOD_binary_reg_1_min_lag4,HOD_binary_reg_2_min_lag4,HOD_binary_reg_3_min_lag4,HOD_binary_reg_4_min_lag4,HOD_binary_reg_5_min_lag4,HOD_binary_reg_6_min_lag4,...,DOW_binary_reg_6_mean,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean
2021-12-06 13:00:00+00:00,12061146.0,-3.216245e-16,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.497222,0.502778,0.0,0.0,0.0,0.0,0.0,-0.417194,0.908818


- are these two predictor sets match with each other?
- aren't they supposed to match?

In [20]:
results = (predictors_used_for_deployment.values.squeeze() == predictors_used_for_training.values.squeeze())
results

array([False, False, False, ...,  True,  True,  True])

- how many TRUEs in there?

In [21]:
results.tolist().count(True)

887

- how many FALSEs in there?

In [22]:
results.tolist().count(False)

118