In [1]:
import numpy as np
import pandas as pd
import pathlib
import random
import os
from pathlib import Path
import json 
import shutil
import logging
import copy
# Notebook-level logger, named after the current process ID (as a string).
logger = logging.getLogger(str(os.getpid()))

from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import _resample_data, correct_predictor_columns, correct_timestamps, resample_or_rolling_stats, timelag_predictors, timelag_predictors_target, roll_predictors_target, input_data_split, prep_for_rnn, _preprocess_data
from wattile.time_processing import add_processed_time_columns
from wattile.models import ModelFactory
from wattile.entry_point import init_logging, create_input_dataframe, run_model
# Two directory levels above the current working directory. The cells below join
# this with "wattile", "notebooks" and "tests", so it is presumably expected to
# resolve to the repository root -- verify the working directory before running.
PROJECT_DIRECTORY = Path().resolve().parent.parent

PROJECT_DIRECTORY = C:\Users\JKIM4\Anaconda3\envs\wattile\Lib\site-packages\wattile


# reading configs

In [2]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
# main configs file
# (read explicitly as UTF-8 so parsing does not depend on the platform's default encoding)
with open(
    PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r", encoding="utf-8"
) as f:
    configs = json.load(f)

# start every run from an empty experiment directory
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
# parents=True: also works when the "notebooks" folder does not exist yet
exp_dir.mkdir(parents=True)

# point the configs at the experiment directory and at the synthetic test data
configs["data_output"]["exp_dir"] = str(exp_dir)
configs["data_input"]["data_dir"] = str(PROJECT_DIRECTORY / "tests" / "data" / "Synthetic Site")

configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
   'Synthetic Weather Station Diffuse Horizontal Irradiance',
   'Synthetic Weather Station Direct Normal Irradiance',
   'Synthetic Weather Station Dry Bulb Temperature',
   'Synthetic Weather Station Global Horizontal Irradiance',
   'Synthetic Weather Station Relative Humidity',
   'Synthetic Weather Station Wind Speed'],
  'target_var': 'Synthetic Site Electricity Main Total Power'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month_

# which data to use for testing?

In [3]:
# Selects the dataset used throughout the notebook:
#   "complete"   -> data is read with read_dataset_from_file(configs)
#   "incomplete" -> data is read from the data_edge_consideration.csv fixture
# The overridden preprocessing functions below also read this global to decide
# which columns to write to their temporary debug CSV files.
datatype = "incomplete" # complete/incomplete

# update configs if necessary

In [4]:
# Experiment-specific overrides of the default configs, grouped by config section.

# learning algorithm
configs["learning_algorithm"].update(
    {
        "quantiles": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
        "num_epochs": 50,
    }
)

# input data: no explicit predictor list
configs["data_input"]["predictor_columns"] = []

# data processing
processing_cfg = configs["data_processing"]
processing_cfg["resample"].update({"bin_closed": "left", "bin_label": "left"})
processing_cfg["feat_time"].update(
    {"day_of_week": ["binary_reg"], "hour_of_day": ["sincos"]}
)
processing_cfg["feat_timelag"]["lag_count"] = 12
processing_cfg["feat_stats"]["window_width"] = "15min"
processing_cfg["input_output_window"].update(
    {
        "window_width_source": "180min",
        "window_width_futurecast": "15min",
        "window_width_target": "45min",
    }
)

# the "incomplete" fixture names its target column "target"
if datatype == "incomplete":
    configs["data_input"]["target_var"] = "target"

configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': [],
  'target_var': 'target'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month_of_year': ['sincos'],
   'day_of_week': ['binary_reg'],
   'hour_of_day': ['sincos'],
   'holidays': False},
  'resample': {'bin_interval': '15min',
   'bin_closed': 'left',
   'bin_label': 'left'},
  'feat_stats': {'active': True, 'window_width': '15min'},
  'feat_timelag': {'lag_interval': '15min', 'lag_count': 12},
  'input_output_window': {'window_width_source': '180min',
   'window_width_futurecast': '15

# methods to update

In [5]:
def resample_or_rolling_stats(data, configs):
    """Resample the data and, when enabled, add rolling-window statistics.

    When ``configs["data_processing"]["feat_stats"]["active"]`` is truthy, the
    predictor columns are resampled into per-bin min/max/sum/count values, a
    rolling window of ``window_width`` is applied over those bins, and the
    resulting ``*_min``, ``*_max`` and ``*_mean`` columns are returned together
    with the target column resampled by ``_resample_data``.  Otherwise the whole
    frame is passed through ``_resample_data`` unchanged in structure.

    Debugging side effects (marked TEMPORARY below): depending on the notebook
    global ``datatype``, selected resampled columns are written to
    ``./output0_*.csv`` files and every rolling window of the max statistic is
    printed.

    :param data: frame holding the predictors and the target column; it is
        resampled by time, so a datetime-like index is assumed
    :type data: pd.DataFrame
    :param configs: configs
    :type configs: dict
    :return: resampled frame (with rolling statistics when enabled)
    :rtype: pd.DataFrame
    """
    # reading configuration parameters.
    resample_cfg = configs["data_processing"]["resample"]
    bin_interval = resample_cfg["bin_interval"]
    bin_closed = resample_cfg["bin_closed"]
    bin_label = resample_cfg["bin_label"]
    window_width = configs["data_processing"]["feat_stats"]["window_width"]
    # window_position is hard coded for now: a backward-looking window.
    window_position = "backward"  # forward, center, backward

    # ATTENTION
    # - before, 'bin_closed' was also applied to the 'closed' arg of 'rolling'
    # - after, the 'closed' arg of 'rolling' is fixed to 'right'
    # - reasoning: 'closed' in 'rolling' behaves differently than 'closed' in
    #   'resample'
    bin_closed_rolling = "right"

    if not configs["data_processing"]["feat_stats"]["active"]:
        # statistics disabled: plain resampling only
        return _resample_data(data, configs)

    # separate predictors and target
    target_var = configs["data_input"]["target_var"]
    target = data[target_var]
    X_data = data.drop(target_var, axis=1)

    # resampling for each statistic separately
    data_resampler = X_data.resample(
        rule=bin_interval, closed=bin_closed, label=bin_label
    )
    data_resample_min = data_resampler.min().add_suffix("_min")
    data_resample_max = data_resampler.max().add_suffix("_max")
    data_resample_sum = data_resampler.sum().add_suffix("_sum")
    data_resample_count = data_resampler.count().add_suffix("_count")

    # TEMPORARY: dump selected resampled columns for manual inspection
    if datatype == "incomplete":
        data_resample_min.loc[:, data_resample_min.columns.str.contains("var0")].to_csv("./output0_1_resampled_min.csv")
        data_resample_max.loc[:, data_resample_max.columns.str.contains("var0")].to_csv("./output0_2_resampled_max.csv")
        data_resample_sum.loc[:, data_resample_sum.columns.str.contains("var0")].to_csv("./output0_3_resampled_sum.csv")
        data_resample_count.loc[:, data_resample_count.columns.str.contains("var0")].to_csv("./output0_4_resampled_cnt.csv")
    elif datatype == "complete":
        data_resample_min.loc[:, data_resample_min.columns.str.contains("Synthetic Weather Station Dry Bulb Temperature")].to_csv("./output0_1_resampled_min.csv")
        data_resample_max.loc[:, data_resample_max.columns.str.contains("Synthetic Weather Station Dry Bulb Temperature")].to_csv("./output0_2_resampled_max.csv")

    # rolling-window settings depending on window_position
    if window_position == "backward":
        arg_center = False
    elif window_position == "center":
        arg_center = True
    elif window_position == "forward":
        # a forward-looking window is emulated by rolling over the reversed frames
        arg_center = False
        data_resample_min = data_resample_min[::-1]
        data_resample_max = data_resample_max[::-1]
        data_resample_sum = data_resample_sum[::-1]
        data_resample_count = data_resample_count[::-1]
        # NOTE: the former swap of 'bin_closed' here was dropped; the value was
        # never read again because the rolling windows use 'bin_closed_rolling'.

    # one set of arguments shared by every rolling window below
    rolling_kwargs = dict(
        window=window_width,
        min_periods=1,
        center=arg_center,
        closed=bin_closed_rolling,
    )

    # adding rolling window statistics: minimum and maximum
    mins = data_resample_min.rolling(**rolling_kwargs).min()
    maxs = data_resample_max.rolling(**rolling_kwargs).max()

    # TEMPORARY: print every rolling window of the max statistic
    for count, item in enumerate(data_resample_max.rolling(**rolling_kwargs)):
        print("-----------------------------------")
        print("WINDOW COUNT = {}:".format(count))
        if datatype == "incomplete":
            print("WINDOW ENTRY =  {}:".format(item["var0_max"]))
            print("WINDOW.max() = {}".format(item["var0_max"].max()))
        elif datatype == "complete":
            print("WINDOW ENTRY =  {}:".format(item["Synthetic Weather Station Dry Bulb Temperature_max"]))
            print("WINDOW.max() = {}".format(item["Synthetic Weather Station Dry Bulb Temperature_max"].max()))

    # adding rolling window statistics: sum
    sums = data_resample_sum.rolling(**rolling_kwargs).sum()

    # adding rolling window statistics: count
    # (this has to be sum for proper count calculation)
    counts = data_resample_count.rolling(**rolling_kwargs).sum()

    # adding rolling window statistics: mean = sum / count
    means = sums.copy()
    # swap only the trailing "_sum" suffix, so a column whose own name contains
    # "_sum" is not mangled
    means.columns = [
        "{}_mean".format(name[: -len("_sum")]) for name in sums.columns
    ]
    # silence the 0/0 warning for empty bins locally instead of changing
    # NumPy's global error state
    with np.errstate(invalid="ignore"):
        means.loc[:, :] = sums.values / counts.values

    # combining min, max and mean stats
    data = pd.concat([mins, maxs, means], axis=1)

    # restoring the original order for a forward-looking window
    if window_position == "forward":
        data = data[::-1]

    # adding resampled target back to the dataframe
    data[target_var] = _resample_data(target, configs)

    return data

def timelag_predictors(data, configs):
    """
    Create lagged versions of predictor variables in a DataFrame.
    Used specifically for alfa learning methods.

    Every predictor column is copied ``lag_count`` times, shifted forward in
    time by 1..``lag_count`` multiples of ``lag_interval`` (suffix ``_lag{i}``,
    cast to float32).  The target is shifted back by ``window_width_futurecast``
    and re-attached; rows containing any NaN are dropped.

    In the "prediction" use case the index is additionally shifted by one
    ``bin_interval`` for left-labeled bins, missing target values are filled
    with 0, and a debug CSV (``./output2_timelag_predictors.csv``) is written
    using the notebook global ``datatype`` to pick the columns.

    :param data: (DataFrame)
    :param configs: (Dict)
    :return: (DataFrame)
    """

    # reading configuration parameters
    bin_interval = configs["data_processing"]["resample"]["bin_interval"]
    bin_label = configs["data_processing"]["resample"]["bin_label"]
    lag_interval = configs["data_processing"]["feat_timelag"]["lag_interval"]
    lag_count = configs["data_processing"]["feat_timelag"]["lag_count"]
    window_width_futurecast = configs["data_processing"]["input_output_window"][
        "window_width_futurecast"
    ]
    target_var = configs["data_input"]["target_var"]

    # splitting predictors and target
    target = data[target_var]
    predictors = data.drop(target_var, axis=1)

    # padding predictors
    # The lag offset is computed as a Timedelta.  Multiplying the raw config
    # string by an int (the previous approach) repeats the string
    # ("15min15min...") and only works if pandas parses that as a compound
    # frequency.
    lag_step = pd.Timedelta(lag_interval)
    frames = [predictors]
    for i in range(1, lag_count + 1):
        frames.append(
            predictors.shift(freq=lag_step * i)
            .astype("float32")
            .add_suffix("_lag{}".format(i))
        )
    frames.reverse()  # oldest lag first, unshifted predictors last
    data = pd.concat(frames, axis=1)

    # ATTENTION
    # - before: the same target shift was applied to left- and right-labeled
    #   data and the timestamps were not shifted.
    # - now: the target shift is still the same for both labels (an extra
    #   bin_interval shift for left-labeled data was tried and is disabled);
    #   in the prediction use case the timestamps of left-labeled data are
    #   shifted forward by one bin_interval.
    # - reasoning: not recorded by the original author.
    label_is_valid = bin_label in ("left", "right")
    if configs["learning_algorithm"]["use_case"] != "prediction":
        if label_is_valid:
            data[target_var] = target.shift(
                freq=-pd.Timedelta(window_width_futurecast)
            )
    else:
        if bin_label == "left":
            data.index = data.index.shift(freq=bin_interval)

        # TEMPORARY: attach the (shifted) target; rows without a target value
        # get 0.  The fill must happen after the assignment, because the
        # assignment aligns the target on the predictor index.
        if label_is_valid:
            data[target_var] = target.shift(
                freq=-pd.Timedelta(window_width_futurecast)
            )
            data[target_var] = data[target_var].fillna(0)

        # TEMPORARY: dump selected columns for manual inspection
        if datatype == "incomplete":
            pattern = "var0_max|{}".format(target_var)
        elif datatype == "complete":
            pattern = "Synthetic Weather Station Dry Bulb Temperature_max|{}".format(target_var)
        else:
            pattern = None
        if pattern is None:
            debug_frame = data
        else:
            debug_frame = data.loc[:, data.columns.str.contains(pattern)]
        debug_frame.to_csv("./output2_timelag_predictors.csv")

    data = data.dropna(how="any")

    return data

def _preprocess_data(configs, data):
    """Run the preprocessing steps dictated by the configs.

    Order of operations: timestamp correction, time-based feature columns,
    resampling / rolling statistics, then the lag scheme selected by
    ``configs["learning_algorithm"]["arch_version"]`` ("alfa", "bravo" or
    "charlie"; any other value returns the frame without lag features).
    ``configs["input_dim"]`` is set to the column count minus one before the
    lag step.  Debug CSV files are written around the resampling step, with the
    columns chosen by the notebook global ``datatype`` (marked TEMPORARY).

    :param configs: configs
    :type configs: dict
    :param data: data
    :type data: pd.dataframe
    :return: data
    :rtype: pd.dataframe
    """

    def _dump_matching_columns(frame, pattern, path):
        # TEMPORARY debugging aid: write the columns matching `pattern` to `path`
        frame.loc[:, frame.columns.str.contains(pattern)].to_csv(path)

    # NOTE: the correct_predictor_columns(configs, data) check is deliberately
    # not applied in this notebook version.

    # sort and trim data to the specified time period, then add time-based features
    data = add_processed_time_columns(correct_timestamps(configs, data), configs)

    # TEMPORARY: snapshot before resampling
    if datatype == "incomplete":
        _dump_matching_columns(data, "var0", "./output0_before_resample_rolling.csv")
    elif datatype == "complete":
        _dump_matching_columns(
            data,
            "Synthetic Weather Station Dry Bulb Temperature",
            "./output0_before_resample_rolling.csv",
        )

    # add statistics features
    data = resample_or_rolling_stats(data, configs)

    # TEMPORARY: snapshot after resampling
    if datatype == "incomplete":
        _dump_matching_columns(
            data,
            "var0|{}".format(configs["data_input"]["target_var"]),
            "./output1_rolling_stats.csv",
        )
    elif datatype == "complete":
        _dump_matching_columns(
            data,
            "Synthetic Weather Station Dry Bulb Temperature",
            "./output1_rolling_stats.csv",
        )

    # record the feature count (every column except one) before lagging
    configs["input_dim"] = data.shape[1] - 1
    logger.info("Number of features: {}".format(configs["input_dim"]))
    logger.debug("Features: {}".format(data.columns.values))

    # add lag features according to the architecture version
    if configs["learning_algorithm"]["arch_version"] == "alfa":
        return timelag_predictors(data, configs)
    if configs["learning_algorithm"]["arch_version"] == "bravo":
        return timelag_predictors_target(data, configs)
    if configs["learning_algorithm"]["arch_version"] == "charlie":
        return roll_predictors_target(data, configs)
    return data

In [6]:
def get_input_window_for_output_time(datetime, model_configs=None):
    """Given the time for which we want to predict, return the time window of the required
    input.

    :param datetime: the time for which we want to predict; converted with
        ``pd.to_datetime``
    :type datetime: str or datetime-like
    :param model_configs: configs to read the lag settings from; when omitted,
        the notebook-level ``configs`` is used
    :type model_configs: dict, optional
    :return: earliest time input should include, latest time input should include.
    :rtype: the type returned by ``pd.to_datetime`` (a Timestamp for a scalar input)
    """
    if model_configs is None:
        # backward-compatible default: fall back to the notebook global
        model_configs = configs

    # set prediction time with pandas
    timestamp_cast = pd.to_datetime(datetime)  # current time needs to go in here

    # set parameters
    timelag_cfg = model_configs["data_processing"]["feat_timelag"]
    lag_interval = timelag_cfg["lag_interval"]
    lag_count = timelag_cfg["lag_count"]

    # ATTENTION
    # - before, 'window_offset' was 'pd.Timedelta(lag_interval) * lag_count'
    # - after, 'window_offset' is 'pd.Timedelta(lag_interval) * (lag_count + 1)'
    # - reasoning: this is to grab the correct amount of time/span based on the
    #   'feat_stats' parameters
    window_offset = pd.Timedelta(lag_interval) * (lag_count + 1)

    # start and end of the input window
    prediction_window_start_time = timestamp_cast - window_offset
    prediction_window_end_time = timestamp_cast

    return prediction_window_start_time, prediction_window_end_time

# train

In [7]:
################################################################
# create results folder
# (initialise logging with the experiment directory from the configs)
init_logging(local_results_dir=configs["data_output"]["exp_dir"])

################################################################
# read data
# "complete" uses the dataset described by the configs; "incomplete" uses a CSV
# fixture whose path is relative to the current working directory.

if datatype == "complete":
    data = read_dataset_from_file(configs)
elif datatype == "incomplete":
    data = pd.read_csv("../../tests/fixtures/data_edge_consideration.csv", index_col=0)
#     alternative fixture kept for reference:
#     data = pd.read_csv("../../tests/fixtures/data_edge_consideration_temp.csv", index_col=0)
    # the first CSV column holds the timestamps; convert the index to datetimes
    data.index = pd.to_datetime(data.index)

################################################################
# prepare data for training
# NOTE: this rebinds `data` from the raw frame to the preprocessed frame, and
# calls the notebook-local _preprocess_data defined above.
data = _preprocess_data(configs, data)

# if validating with external data, write data to h5 for future testing.
if (
    configs["learning_algorithm"]["use_case"] == "validation"
    and configs["learning_algorithm"]["test_method"] == "external"
):
    filepath = pathlib.Path(
        configs["data_input"]["data_dir"]
    ) / "{}_external_test.h5".format(configs["data_input"]["target_var"])
    data.to_hdf(filepath, key="df", mode="w")

# split only for the "train" use case; otherwise everything is passed on as the
# second (validation) frame and the training frame is left empty
if configs["learning_algorithm"]["use_case"] == "train":
    train_df, val_df = input_data_split(data, configs)

else:
    train_df, val_df = pd.DataFrame(), data
    
################################################################
# create model
model = ModelFactory.create_model(configs)

################################################################
# train model
results = model.train(train_df, val_df)

Logging to: C:\Users\JKIM4\Documents\GitHub\intelligentcampus-pred-analytics\notebooks\exp_dir\output.out, PID: 17660


  data = data[start_time:end_time]


-----------------------------------
WINDOW COUNT = 0:
WINDOW ENTRY =  timestamp
2019-01-01    2.368477
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 2.3684766222413938
-----------------------------------
WINDOW COUNT = 1:
WINDOW ENTRY =  timestamp
2019-01-01 00:15:00    2.515528
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 2.5155279925552767
-----------------------------------
WINDOW COUNT = 2:
WINDOW ENTRY =  timestamp
2019-01-01 00:30:00    2.756843
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 2.7568426370654704
-----------------------------------
WINDOW COUNT = 3:
WINDOW ENTRY =  timestamp
2019-01-01 00:45:00    3.049268
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 3.0492676955800517
-----------------------------------
WINDOW COUNT = 4:
WINDOW ENTRY =  timestamp
2019-01-01 01:00:00    3.206282
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 3.206282045975033
-----------------------------------
WINDOW COUNT = 5:
WINDOW ENTR

  mid_train_error_stats = mid_train_error_stats.append(
  mid_train_error_stats = mid_train_error_stats.append(
  mid_train_error_stats = mid_train_error_stats.append(


saving timeseries comparison in C:\Users\JKIM4\Documents\GitHub\intelligentcampus-pred-analytics\notebooks\exp_dir/Vis_TimeseriesComparisons.svg


In [8]:
# display the preprocessed frame (lagged predictors plus target) built in the training cell
data

Unnamed: 0_level_0,var0_min_lag12,var1_min_lag12,var2_min_lag12,var3_min_lag12,sin_HOD_min_lag12,cos_HOD_min_lag12,DOW_binary_reg_0_min_lag12,DOW_binary_reg_1_min_lag12,DOW_binary_reg_2_min_lag12,DOW_binary_reg_3_min_lag12,...,DOW_binary_reg_0_mean,DOW_binary_reg_1_mean,DOW_binary_reg_2_mean,DOW_binary_reg_3_mean,DOW_binary_reg_4_mean,DOW_binary_reg_5_mean,DOW_binary_reg_6_mean,sin_MOY_mean,cos_MOY_mean,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 03:00:00,1.886022,0.022628,0.265147,2.731213,0.011781,0.998068,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.017213,0.999852,7.648631
2019-01-01 03:15:00,2.200530,0.116313,0.247696,2.586154,0.067290,0.991876,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.017213,0.999852,7.810616
2019-01-01 03:30:00,2.391649,0.077990,0.187892,2.898776,0.139965,0.980842,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.017213,0.999852,8.012084
2019-01-01 03:45:00,2.637591,0.142546,0.132785,2.957217,0.204994,0.968020,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.017213,0.999852,8.538523
2019-01-01 04:00:00,2.807024,0.121482,0.223804,3.165427,0.261136,0.950380,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.017213,0.999852,8.911974
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-01-07 22:30:00,0.750924,0.451136,2.343159,-0.131404,-0.923434,0.383758,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120208,0.992749,1.816050
2019-01-07 22:45:00,0.692020,0.423232,2.193490,0.025649,-0.892258,0.451527,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120208,0.992749,1.967433
2019-01-07 23:00:00,0.669374,0.279260,2.167021,0.171936,-0.865115,0.501574,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120208,0.992749,2.045612
2019-01-07 23:15:00,0.669108,0.345039,2.019251,0.558454,-0.828142,0.560519,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120208,0.992749,2.317935


# predict

### load model config with use case prediction

In [9]:
# Switch the shared configs to the prediction use case; timelag_predictors
# branches on this value. The configs dict is mutated in place.
configs["learning_algorithm"]["use_case"] = "prediction"
configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': [],
  'target_var': 'target'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month_of_year': ['sincos'],
   'day_of_week': ['binary_reg'],
   'hour_of_day': ['sincos'],
   'holidays': False},
  'resample': {'bin_interval': '15min',
   'bin_closed': 'left',
   'bin_label': 'left'},
  'feat_stats': {'active': True, 'window_width': '15min'},
  'feat_timelag': {'lag_interval': '15min', 'lag_count': 12},
  'input_output_window': {'window_width_source': '180min',
   'window_width_futurecast': '15

### instantiate model

In [10]:
# create model
# (re-created from the configs, which now have use_case set to "prediction";
# this rebinds `model` from the training section)
model = ModelFactory.create_model(configs)

### determine read time span for prediction

In [11]:
# timestamp for which a prediction is requested; a different one is used for each dataset
if datatype == "incomplete":
    time_casting = "2019-01-05 01:00:00"
elif datatype == "complete":
    time_casting = "2021-12-05 01:00:00"

# 2019-01-30 23:45:00

In [12]:
# The notebook-local get_input_window_for_output_time is used here; the
# model-method call it replaces is kept for reference:
# prediction_window_start_time, prediction_window_end_time = model.get_input_window_for_output_time(time_casting)
input_start, input_end = get_input_window_for_output_time(time_casting)

In [13]:
# start of the input window: time_casting minus (lag_count + 1) lag intervals
input_start

Timestamp('2019-01-04 21:45:00')

In [14]:
# end of the input window: the prediction time itself
input_end

Timestamp('2019-01-05 01:00:00')

### read and prepare data

In [15]:
# Load the raw data for the selected dataset.
if datatype == "complete":
    data = read_dataset_from_file(configs)
elif datatype == "incomplete":
    data = pd.read_csv("../../tests/fixtures/data_edge_consideration.csv", index_col=0)
    # the fixture timestamps are parsed and localized to UTC
    data.index = pd.to_datetime(data.index).tz_localize('utc')

# ATTENTION
# - this part isn't part of the wattile code but is necessary for executing the
#   prediction
# - before, data was sliced by data[input_start:input_end]
# - after, data is sliced differently depending on 'bin_closed'
# - reasoning: this is assumed to be what the data acquisition system does for
#   the different 'bin_closed' settings
if datatype in ("complete", "incomplete"):
    if configs["data_processing"]["resample"]["bin_closed"] == "left":
        # left-closed bins: include the window start, exclude the window end
        data = data.loc[
            (data.index >= input_start.tz_localize('utc'))
            & (data.index < input_end.tz_localize('utc')),
            :,
        ]
    elif configs["data_processing"]["resample"]["bin_closed"] == "right":
        # right-closed bins: exclude the window start, include the window end
        data = data.loc[
            (data.index > input_start.tz_localize('utc'))
            & (data.index <= input_end.tz_localize('utc')),
            :,
        ]

data

Unnamed: 0_level_0,var0,var1,var2,var3,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-04 21:45:17+00:00,0.936737,0.380942,1.481719,1.397177,1.467386
2019-01-04 21:46:01+00:00,0.906392,0.446748,1.343842,1.142792,1.361162
2019-01-04 21:46:12+00:00,0.894471,0.271562,1.416621,1.158426,1.320218
2019-01-04 21:46:14+00:00,1.015981,0.293512,1.414766,0.850129,1.433782
2019-01-04 21:46:48+00:00,1.125003,0.437931,1.170511,1.409909,1.427087
...,...,...,...,...,...
2019-01-05 00:52:44+00:00,2.851168,0.428192,0.400793,3.368019,3.876519
2019-01-05 00:54:06+00:00,2.973314,0.286126,0.894512,3.970762,3.906376
2019-01-05 00:55:17+00:00,2.991580,0.360029,0.672291,4.201228,3.909430
2019-01-05 00:56:31+00:00,2.814700,0.332980,0.813674,4.006887,3.931459


### adjust model config start/end times

In [16]:
# restrict the configured time span to the prediction input window (ISO 8601 strings)
configs['data_input']['start_time'] = input_start.isoformat()
configs['data_input']['end_time'] = input_end.isoformat()

### load data

In [17]:
# predictor_grid = data
# predictor_data_frame = predictor_grid.to_dataframe()
# predictor_data_frame = predictor_data_frame.set_index('ts')

###  rename columns

In [18]:
# data[configs['data_input']['target_var']] = -999
# data

### prep data

In [19]:
# prepare data for prediction
# (same steps as in the training section; use_case is now "prediction", so the
# else branch at the bottom is the one that runs)
data = _preprocess_data(configs, data)

# if validating with external data, write data to h5 for future testing.
if (
    configs["learning_algorithm"]["use_case"] == "validation"
    and configs["learning_algorithm"]["test_method"] == "external"
):
    filepath = pathlib.Path(
        configs["data_input"]["data_dir"]
    ) / "{}_external_test.h5".format(configs["data_input"]["target_var"])
    data.to_hdf(filepath, key="df", mode="w")

if configs["learning_algorithm"]["use_case"] == "train":
    train_df, val_df = input_data_split(data, configs)

else:
    # no split: the whole preprocessed frame is handed on as val_df
    train_df, val_df = pd.DataFrame(), data


Indexing a timezone-aware DatetimeIndex with a timezone-naive datetime is deprecated and will raise KeyError in a future version. Use a timezone-aware object instead.



-----------------------------------
WINDOW COUNT = 0:
WINDOW ENTRY =  timestamp
2019-01-04 21:45:00+00:00    1.125003
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 1.1250030267612905
-----------------------------------
WINDOW COUNT = 1:
WINDOW ENTRY =  timestamp
2019-01-04 22:00:00+00:00    1.224334
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 1.224334014844361
-----------------------------------
WINDOW COUNT = 2:
WINDOW ENTRY =  timestamp
2019-01-04 22:15:00+00:00    1.251733
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 1.2517333413988472
-----------------------------------
WINDOW COUNT = 3:
WINDOW ENTRY =  timestamp
2019-01-04 22:30:00+00:00    1.464868
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 1.4648683548338073
-----------------------------------
WINDOW COUNT = 4:
WINDOW ENTRY =  timestamp
2019-01-04 22:45:00+00:00    1.517992
Freq: 15T, Name: var0_max, dtype: float64:
WINDOW.max() = 1.517992189995914
---------------------------

In [20]:
# display the preprocessed prediction input
data

Unnamed: 0_level_0,var0_min_lag12,var1_min_lag12,var2_min_lag12,var3_min_lag12,sin_HOD_min_lag12,cos_HOD_min_lag12,DOW_binary_reg_0_min_lag12,DOW_binary_reg_1_min_lag12,DOW_binary_reg_2_min_lag12,DOW_binary_reg_3_min_lag12,...,DOW_binary_reg_0_mean,DOW_binary_reg_1_mean,DOW_binary_reg_2_mean,DOW_binary_reg_3_mean,DOW_binary_reg_4_mean,DOW_binary_reg_5_mean,DOW_binary_reg_6_mean,sin_MOY_mean,cos_MOY_mean,target
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-05 01:00:00+00:00,0.798564,0.204846,1.084963,0.850129,-0.554542,0.832156,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.085965,0.996298,0.0


### predict with (trained) model

In [21]:
# run the prediction on the prepared frame
model.predict(val_df)