In [1]:
import numpy as np
import pandas as pd
import pathlib
import random
import time
import math
import os
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init
from dateutil import parser
from pathlib import Path
import json 
import shutil
import logging
import copy
logger = logging.getLogger(str(os.getpid()))

from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import _resample_data, correct_predictor_columns, correct_timestamps, resample_or_rolling_stats, timelag_predictors, timelag_predictors_target, roll_predictors_target, input_data_split, prep_for_rnn, _preprocess_data
from wattile.time_processing import add_processed_time_columns
from wattile.models import ModelFactory
from wattile.entry_point import init_logging, create_input_dataframe, run_model
PROJECT_DIRECTORY = Path().resolve().parent.parent

PROJECT_DIRECTORY = C:\Users\JKIM4\Anaconda3\envs\wattile\Lib\site-packages\wattile


# reading configs

In [2]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
# main configs file
with open(PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r") as f:
    configs = json.load(f)

# start every run from an empty experiment directory
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir1"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
# parents=True: also create the "notebooks" folder when it does not exist yet,
# instead of failing with FileNotFoundError
exp_dir.mkdir(parents=True)

# point the configs at the experiment directory and the synthetic test data
configs["data_output"]["exp_dir"] = str(exp_dir)
configs["data_input"]["data_dir"] = str(PROJECT_DIRECTORY / "tests" / "data" / "Synthetic Site")

configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
   'Synthetic Weather Station Diffuse Horizontal Irradiance',
   'Synthetic Weather Station Direct Normal Irradiance',
   'Synthetic Weather Station Dry Bulb Temperature',
   'Synthetic Weather Station Global Horizontal Irradiance',
   'Synthetic Weather Station Relative Humidity',
   'Synthetic Weather Station Wind Speed'],
  'target_var': 'Synthetic Site Electricity Main Total Power'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir1',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month

# update configs if necessary

In [3]:
# predict the deciles, and resample into right-closed bins that are labelled
# with their left edge
configs["learning_algorithm"]["quantiles"] = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
]
configs["data_processing"]["resample"].update(
    {"bin_closed": "right", "bin_label": "left"}
)
configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
   'Synthetic Weather Station Diffuse Horizontal Irradiance',
   'Synthetic Weather Station Direct Normal Irradiance',
   'Synthetic Weather Station Dry Bulb Temperature',
   'Synthetic Weather Station Global Horizontal Irradiance',
   'Synthetic Weather Station Relative Humidity',
   'Synthetic Weather Station Wind Speed'],
  'target_var': 'Synthetic Site Electricity Main Total Power'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir1',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month

# methods to update (notebook-local overrides of wattile functions, with temporary debug CSV output)

In [4]:
def resample_or_rolling_stats(data, configs):
    """Resample the data and, when enabled, add rolling min/max/mean features.

    When ``configs["data_processing"]["feat_stats"]["active"]`` is truthy, the
    predictors are first resampled into bins (min, max, sum and count per bin).
    A rolling window of width ``window_width`` is then applied to the binned
    values, and the rolling min, max and mean (rolling sum / rolling count) are
    returned together with the resampled target. Otherwise the whole frame is
    simply passed through ``_resample_data``.

    NOTE: this notebook copy also writes debugging CSV files (``output1`` to
    ``output3``) into the current working directory; those statements are
    marked TEMPORARY.

    :param data: predictors plus the target column (``.resample`` is called on
        it, so a datetime-like index is expected)
    :type data: pd.DataFrame
    :param configs: configs
    :type configs: dict
    :return: resampled data, with ``_min``/``_max``/``_mean`` predictor columns
        when the statistics features are active
    :rtype: pd.DataFrame
    """

    # reading configuration parameters.
    # window_position is hard coded for now; "backward" is a backward-looking
    # window.
    resample_configs = configs["data_processing"]["resample"]
    bin_interval = resample_configs["bin_interval"]
    bin_closed = resample_configs["bin_closed"]
    bin_label = resample_configs["bin_label"]
    window_width = configs["data_processing"]["feat_stats"]["window_width"]
    window_position = "backward"  # forward, center, backward

    if configs["data_processing"]["feat_stats"]["active"]:

        # separate predictors and target
        target = data[configs["data_input"]["target_var"]]
        X_data = data.drop(configs["data_input"]["target_var"], axis=1)

        # resampling for each statistic separately
        data_resampler = X_data.resample(
            rule=bin_interval, closed=bin_closed, label=bin_label
        )
        data_resample_min = data_resampler.min().add_suffix("_min")
        data_resample_max = data_resampler.max().add_suffix("_max")
        data_resample_sum = data_resampler.sum().add_suffix("_sum")
        data_resample_count = data_resampler.count().add_suffix("_count")

        ##############################################################################
        # TEMPORARY: dump the binned dry bulb minimum for inspection
        ##############################################################################
        data_resample_min.loc[:,data_resample_min.columns.str.contains("Synthetic Weather Station Dry Bulb Temperature_min")].to_csv("./output1_after_resample.csv")

        # setting configuration settings depending on window_position and bin_closed
        if window_position == "backward":
            arg_center = False
        elif window_position == "center":
            arg_center = True
        elif window_position == "forward":
            # a forward-looking window is computed as a backward-looking window
            # over the reversed series, so the closed side has to be mirrored
            arg_center = False
            data_resample_min = data_resample_min[::-1]
            data_resample_max = data_resample_max[::-1]
            data_resample_sum = data_resample_sum[::-1]
            data_resample_count = data_resample_count[::-1]
            if bin_closed == "left":
                bin_closed = "right"
            elif bin_closed == "right":
                bin_closed = "left"

        # the same rolling window is used for every statistic
        rolling_kwargs = dict(
            window=window_width,
            min_periods=1,
            center=arg_center,
            closed=bin_closed,
        )

        # adding rolling window statistics: minimum
        mins = data_resample_min.rolling(**rolling_kwargs).min()

        # adding rolling window statistics: maximum
        maxs = data_resample_max.rolling(**rolling_kwargs).max()

        # adding rolling window statistics: sum
        sums = data_resample_sum.rolling(**rolling_kwargs).sum()

        # adding rolling window statistics: count
        # this has to be sum for proper count calculation
        counts = data_resample_count.rolling(**rolling_kwargs).sum()

        # adding rolling window statistics: mean
        means = sums.copy()
        means.columns = means.columns.str.replace("_sum", "_mean")
        # 0 / 0 (empty windows) yields NaN; silence that warning for this
        # division only instead of changing numpy's global error state
        with np.errstate(invalid="ignore"):
            means.loc[:, :] = sums.values / counts.values

        # combining min, max and mean stats
        data = pd.concat([mins, maxs, means], axis=1)

        ##############################################################################
        # TEMPORARY: dump the rolling dry bulb minimum and the complete feature set
        ##############################################################################
        mins.loc[:,mins.columns.str.contains("Synthetic Weather Station Dry Bulb Temperature_min")].to_csv("./output2_after_reample_rolling.csv")
        data.to_csv("./output3_after_resample_rolling_completeset.csv")

        # reordering dataframe based on window_position
        if window_position == "forward":
            data = data[::-1]

        # adding resampled target back to the dataframe
        target = _resample_data(target, configs)
        data[configs["data_input"]["target_var"]] = target

    else:

        # resample data
        data = _resample_data(data, configs)

    return data

def timelag_predictors(data, configs):
    """
    Create lagged versions of predictor variables in a DataFrame.
    Used specifically for alfa learning methods.

    For every lag ``i`` in ``1..lag_count`` the predictors are shifted forward
    in time by ``i * lag_interval`` and suffixed with ``_lag{i}``. The output
    columns are ordered from the oldest lag to the unshifted predictors,
    followed by the target. Outside of the "prediction" use case the target is
    shifted back by ``window_width_futurecast``; for "prediction" a dummy
    target of 0 is used. Rows containing any NaN are dropped.

    NOTE: in the "prediction" use case this notebook copy also writes a
    debugging CSV (``./output5.csv``) to the current working directory.

    :param data: (DataFrame) predictors plus the target column
    :param configs: (Dict)
    :return: (DataFrame)
    """

    # reading configuration parameters
    lag_interval = configs["data_processing"]["feat_timelag"]["lag_interval"]
    lag_count = configs["data_processing"]["feat_timelag"]["lag_count"]
    window_width_futurecast = configs["data_processing"]["input_output_window"][
        "window_width_futurecast"
    ]
    target_var = configs["data_input"]["target_var"]

    # splitting predictors and target
    target = data[target_var]
    predictors = data.drop(target_var, axis=1)

    # Parse the interval once and multiply the offset itself. Repeating the
    # string (i * "15min" -> "15min15min") only works for aliases that carry a
    # number; a unit-only alias such as "D" would become the invalid "DD".
    lag_step = pd.tseries.frequencies.to_offset(lag_interval)

    # padding predictors
    frames = [predictors]
    for i in range(1, lag_count + 1):
        shifted = (
            predictors.shift(freq=lag_step * i)
            .astype("float32")
            .add_suffix("_lag{}".format(i))
        )
        frames.append(shifted)
    # oldest lag first, unshifted predictors last
    frames.reverse()
    data = pd.concat(frames, axis=1)

    if configs["learning_algorithm"]["use_case"] != "prediction":
        data[target_var] = target.shift(freq="-" + window_width_futurecast)
    else:
        data[target_var] = 0  # dummy
        ##############################################################################
        # TEMPORARY: dump the lagged dry bulb minimum columns for inspection
        ##############################################################################
        # .loc already returns a new frame, so no deep copy of `data` is needed
        temp = data.loc[:, data.columns.str.contains("Synthetic Weather Station Dry Bulb Temperature_min")]
        temp.to_csv("./output5.csv")

    data = data.dropna(how="any")

    return data

def _preprocess_data(configs, data):
    """Run the preprocessing pipeline described by the configs.

    Steps, in order: column check/ordering, timestamp sorting/trimming,
    time-based features, resampling / rolling statistics, and finally the
    lag/roll step selected by ``configs["learning_algorithm"]["arch_version"]``
    ("alfa", "bravo" or "charlie"; any other value skips that step).
    ``configs["input_dim"]`` is set as a side effect.

    NOTE: this notebook copy also writes two debugging CSV files
    (``output0`` and ``output4``) to the current working directory.

    :param configs: configs
    :type configs: dict
    :param data: data
    :type data: pd.dataframe
    :return: data
    :rtype: pd.dataframe
    """
    # assert we have the correct columns and order them, then sort and trim
    # the data to the specified time period
    prepared = correct_timestamps(configs, correct_predictor_columns(configs, data))

    # time-based features
    prepared = add_processed_time_columns(prepared, configs)

    ##############################################################################
    # TEMPORARY: dry bulb columns before resampling
    ##############################################################################
    dry_bulb_columns = prepared.columns.str.contains(
        "Synthetic Weather Station Dry Bulb Temperature"
    )
    prepared.loc[:, dry_bulb_columns].to_csv("./output0_before_resample_rolling.csv")

    # statistics features
    featured = resample_or_rolling_stats(prepared, configs)

    ##############################################################################
    # TEMPORARY: dry bulb minimum columns after resampling
    ##############################################################################
    dry_bulb_min_columns = featured.columns.str.contains(
        "Synthetic Weather Station Dry Bulb Temperature_min"
    )
    featured.loc[:, dry_bulb_min_columns].to_csv("./output4.csv")

    # number of columns minus one (presumably the target column)
    feature_count = featured.shape[1] - 1
    configs["input_dim"] = feature_count
    logger.info("Number of features: {}".format(feature_count))
    logger.debug("Features: {}".format(featured.columns.values))

    # lag features, depending on the architecture version
    arch_version = configs["learning_algorithm"]["arch_version"]
    if arch_version == "alfa":
        return timelag_predictors(featured, configs)
    if arch_version == "bravo":
        return timelag_predictors_target(featured, configs)
    if arch_version == "charlie":
        return roll_predictors_target(featured, configs)

    return featured

In [5]:
def get_input_window_for_output_time(datetime, model_configs=None):
    """Given the time for which we want to predict, return the time window of the required
    input.

    The window ends at the prediction time and starts ``lag_count`` times
    ``lag_interval`` before it.

    :param datetime: the time for which we want to predict; anything accepted
        by ``pd.to_datetime``
    :type datetime: str or datetime-like
    :param model_configs: configs holding
        ``["data_processing"]["feat_timelag"]``; defaults to the notebook-level
        ``configs``
    :type model_configs: dict, optional
    :return: earliest time input should include, latest time input should include.
    :rtype: pd.Timestamp, pd.Timestamp
    """
    if model_configs is None:
        # fall back to the configs loaded at the top of the notebook
        model_configs = configs

    # set prediction time with pandas
    timestamp_cast = pd.to_datetime(datetime)

    # total span covered by the lagged predictors
    feat_timelag = model_configs["data_processing"]["feat_timelag"]
    window_offset = pd.Timedelta(feat_timelag["lag_interval"]) * feat_timelag["lag_count"]

    # NOTE: a bin_label-dependent shift of the window (by one bin_interval for
    # left-labelled bins) was tried in this notebook and left disabled, so the
    # window does not depend on the resample settings.
    prediction_window_start_time = timestamp_cast - window_offset
    prediction_window_end_time = timestamp_cast

    return prediction_window_start_time, prediction_window_end_time

# train

In [8]:
################################################################
# create results folder
# (init_logging is given the experiment directory stored in the configs)
init_logging(local_results_dir=configs["data_output"]["exp_dir"])

################################################################
# read data

# data source 1: the dataset described by the configs (disabled)
# data = read_dataset_from_file(configs)

# data source 2: fixture file, addressed relative to the current working directory
# NOTE(review): index_col=0 without parse_dates leaves the index as plain
# strings — confirm a DatetimeIndex is not required before re-enabling the
# preprocessing below.
data = pd.read_csv("../../tests/fixtures/data_edge_consideration.csv", index_col=0)

# NOTE: everything below is commented out, so this cell currently neither
# preprocesses the data nor trains a model.

# ################################################################
# # prepare data for training
# # train_df, val_df = prep_for_rnn(configs, data)

# data = _preprocess_data(configs, data)

# # when validating with external data, write the data to h5 for future testing.
# if (
#     configs["learning_algorithm"]["use_case"] == "validation"
#     and configs["learning_algorithm"]["test_method"] == "external"
# ):
#     filepath = pathlib.Path(
#         configs["data_input"]["data_dir"]
#     ) / "{}_external_test.h5".format(configs["data_input"]["target_var"])
#     data.to_hdf(filepath, key="df", mode="w")

# if configs["learning_algorithm"]["use_case"] == "train":
#     train_df, val_df = input_data_split(data, configs)

# else:
#     train_df, val_df = pd.DataFrame(), data

# ################################################################
# # create model
# model = ModelFactory.create_model(configs)

# ################################################################
# # train model
# results = model.train(train_df, val_df)

Logging to: C:\Users\JKIM4\Documents\GitHub\intelligentcampus-pred-analytics\notebooks\exp_dir1\output.out, PID: 5856


# predict

### load model config with use case prediction

In [None]:
# switch the shared configs dict to the prediction use case; the preprocessing
# functions above branch on this value
configs["learning_algorithm"]["use_case"] = "prediction"
configs

### instantiate model

In [None]:
# create model
# NOTE(review): the training calls in the "train" cell are commented out and
# exp_dir is recreated empty at the top of the notebook — confirm a trained
# model is available before predicting.
model = ModelFactory.create_model(configs)

### determine read time span for prediction

In [None]:
# time for which a prediction is requested
# NOTE(review): this timestamp is timezone-naive, while the start/end times in
# the configs carry a -07:00 offset — confirm the timezone handling when the
# data is sliced with the window derived from it.
time_casting = "2021-12-07 07:00:00"

In [None]:
# the model's own method is disabled; the notebook-local function defined above
# is used instead to derive the input window for the requested time
# prediction_window_start_time, prediction_window_end_time = model.get_input_window_for_output_time(time_casting)
input_start, input_end = get_input_window_for_output_time(time_casting)

In [None]:
input_start

In [None]:
input_end

### read and prepare data

In [None]:
# read data
data = read_dataset_from_file(configs)
# keep only the input window; .copy() gives an independent frame so that the
# column assignment in a later cell does not write to a slice of the full
# dataset (SettingWithCopyWarning)
data = data[input_start:input_end].copy()
data

### adjust model config start/end times

In [None]:
# limit the configured time span to the prediction input window
configs['data_input'].update(
    start_time=input_start.isoformat(),
    end_time=input_end.isoformat(),
)

### load data

In [None]:
# predictor_grid = data
# predictor_data_frame = predictor_grid.to_dataframe()
# predictor_data_frame = predictor_data_frame.set_index('ts')

### add placeholder target column

In [None]:
# add a placeholder target column (-999): the preprocessing functions read the
# target column from the frame, even in the prediction use case
data[configs['data_input']['target_var']] = -999
data

### prep data

In [None]:
# prepare data for the model (here: the prediction input)
# train_df, val_df = prep_for_rnn(configs, data)

# NOTE: `data` is rebound to the preprocessed frame, so re-running this cell
# without re-running the read cell would preprocess already-preprocessed data.
data = _preprocess_data(configs, data)

# when validating with external data, write the data to h5 for future testing.
if (
    configs["learning_algorithm"]["use_case"] == "validation"
    and configs["learning_algorithm"]["test_method"] == "external"
):
    filepath = pathlib.Path(
        configs["data_input"]["data_dir"]
    ) / "{}_external_test.h5".format(configs["data_input"]["target_var"])
    data.to_hdf(filepath, key="df", mode="w")

# only the "train" use case splits the data; every other use case (including
# "prediction") passes the whole frame on as val_df with an empty train_df
if configs["learning_algorithm"]["use_case"] == "train":
    train_df, val_df = input_data_split(data, configs)

else:
    train_df, val_df = pd.DataFrame(), data

In [None]:
train_df

In [None]:
val_df

### predict with (trained) model

In [None]:
# run the prediction on the prepared input (nothing is trained in this cell)
model.predict(val_df)