In [1]:
import json
import shutil
from pathlib import Path

import pandas as pd

from wattile.buildings_processing import prep_for_rnn, correct_predictor_columns, correct_timestamps, resample_or_rolling_stats, timelag_predictors, timelag_predictors_target, roll_predictors_target, input_data_split
from wattile.data_reading import read_dataset_from_file
from wattile.entry_point import init_logging
from wattile.models import ModelFactory
from wattile.time_processing import add_processed_time_columns

# Project root: the directory two levels above the current working directory.
PROJECT_DIRECTORY = Path.cwd().resolve().parents[1]

PROJECT_DIRECTORY = C:\Users\JKIM4\Anaconda3\envs\wattile\Lib\site-packages\wattile


In [2]:
import os
import logging
# Logger named after the current process ID (as a string).
logger = logging.getLogger(f"{os.getpid()}")

# read configs

In [3]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
# Load the default configs shipped with the project. JSON is read as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
with open(
    PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r", encoding="utf-8"
) as f:
    configs = json.load(f)

# Start every run from an empty experiment directory so results from a previous
# run never mix with the new ones.
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
# parents=True: also create the "notebooks" directory if it does not exist yet.
exp_dir.mkdir(parents=True)

# Point the configs at the experiment directory and at the synthetic test dataset.
configs["data_output"]["exp_dir"] = str(exp_dir)
configs["data_input"]["data_dir"] = str(PROJECT_DIRECTORY / "tests" / "data" / "Synthetic Site")

configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
   'Synthetic Weather Station Diffuse Horizontal Irradiance',
   'Synthetic Weather Station Direct Normal Irradiance',
   'Synthetic Weather Station Dry Bulb Temperature',
   'Synthetic Weather Station Global Horizontal Irradiance',
   'Synthetic Weather Station Relative Humidity',
   'Synthetic Weather Station Wind Speed'],
  'target_var': 'Synthetic Site Electricity Main Total Power'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month_

# change configs for testing

In [4]:
# Empty the predictor list so the two column-selection functions defined below
# can be compared on a config without pre-defined predictors.
configs["data_input"]["predictor_columns"] = []
configs

{'data_input': {'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\tests\\data\\Synthetic Site',
  'data_config': 'Synthetic Site Config.json',
  'start_time': '2018-01-01T00:00:00-07:00',
  'end_time': '2022-01-01T00:00:00-07:00',
  'predictor_columns': [],
  'target_var': 'Synthetic Site Electricity Main Total Power'},
 'data_output': {'exp_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics\\notebooks\\exp_dir',
  'plot_comparison': True,
  'plot_comparison_portion_start': 0.0,
  'plot_comparison_portion_end': 1.0},
 'data_processing': {'feat_time': {'month_of_year': ['sincos'],
   'day_of_week': ['binary_reg', 'binary_fuzzy'],
   'hour_of_day': ['sincos', 'binary_reg', 'binary_fuzzy'],
   'holidays': False},
  'resample': {'bin_interval': '15min',
   'bin_closed': 'right',
   'bin_label': 'right'},
  'feat_stats': {'active': True, 'window_width': '15min'},
  'feat_timelag': {'lag_interval': '15min', 'lag_count': 24},
  'inpu

# read data

In [5]:
"""
First, we will read the raw data from the dataset.
Check out the docs for an in-depth explanation of the necessary dataset structure.
"""
# Read the raw dataset described by the configs.
# NOTE(review): presumably located via configs["data_input"]["data_dir"] and
# ["data_config"] set above — confirm against the wattile docs.
data = read_dataset_from_file(configs)
data

Unnamed: 0_level_0,Synthetic Weather Station Dew Point Temperature,Synthetic Weather Station Diffuse Horizontal Irradiance,Synthetic Weather Station Direct Normal Irradiance,Synthetic Weather Station Dry Bulb Temperature,Synthetic Weather Station Global Horizontal Irradiance,Synthetic Weather Station Relative Humidity,Synthetic Weather Station Wind Speed,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.930000,10.457981,104.800003
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,17.879999,11.265539,96.650002
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,17.809999,12.777752,96.199997
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,17.820000,12.694983,95.599998
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,17.820000,11.632407,100.650002
...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,24.680000,0.000000,109.664803
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,24.930000,0.000000,107.002800
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,25.290001,3.545647,106.480400
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,25.920000,1.386941,110.419998


# process data

### method testing

In [6]:
def correct_predictor_columns_original(configs, data):
    """Assert we have the correct columns and order them.

    Keeps exactly the columns named in ``configs["data_input"]["predictor_columns"]``
    followed by ``configs["data_input"]["target_var"]``, in that order. With an
    empty predictor list, only the target column is kept.

    :param configs: configs
    :type configs: dict
    :param data: data
    :type data: pandas.DataFrame
    :raises ConfigsError: if data doesn't contain needed columns
    :return: data with correct columns
    :rtype: pandas.DataFrame
    """
    data_input = configs["data_input"]
    keep_cols = list(data_input["predictor_columns"]) + [data_input["target_var"]]

    # raise error if missing columns
    missing_columns = set(keep_cols).difference(data.columns)
    if missing_columns:
        # Imported here because the notebook never brings ConfigsError into
        # scope; the bare name would raise NameError instead of this error.
        # NOTE(review): assumes ConfigsError lives in wattile.error — confirm.
        from wattile.error import ConfigsError

        raise ConfigsError(f"data is missing predictor_columns: {missing_columns}")

    # remove extra columns
    extra_columns = set(data.columns).difference(keep_cols)
    if extra_columns:
        data = data[keep_cols]
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the logged message.
        logger.info(
            "Removed columns from data that are not specified in "
            f"configs['predictor_columns']: {extra_columns}"
        )

    # sort columns
    return data.reindex(keep_cols, axis="columns")

In [7]:
def correct_predictor_columns(configs, data):
    """Assert we have the correct columns and order them.

    When ``configs["data_input"]["predictor_columns"]`` is non-empty, keeps
    exactly those columns followed by ``configs["data_input"]["target_var"]``,
    in that order. When it is empty, nothing is validated or removed: every
    column of ``data`` is kept in its existing order (the target column is not
    checked in that case).

    :param configs: configs
    :type configs: dict
    :param data: data
    :type data: pandas.DataFrame
    :raises ConfigsError: if data doesn't contain needed columns
    :return: data with correct columns
    :rtype: pandas.DataFrame
    """
    predictor_columns = configs["data_input"]["predictor_columns"]

    if not predictor_columns:
        # not validating pre-defined predictor list
        return data.reindex(list(data.columns), axis="columns")

    keep_cols = list(predictor_columns) + [configs["data_input"]["target_var"]]

    # raise error if missing columns
    missing_columns = set(keep_cols).difference(data.columns)
    if missing_columns:
        # Imported here because the notebook never brings ConfigsError into
        # scope; the bare name would raise NameError instead of this error.
        # NOTE(review): assumes ConfigsError lives in wattile.error — confirm.
        from wattile.error import ConfigsError

        raise ConfigsError(f"data is missing predictor_columns: {missing_columns}")

    # remove extra columns
    extra_columns = set(data.columns).difference(keep_cols)
    if extra_columns:
        data = data[keep_cols]
        # Adjacent literals instead of a backslash continuation, which would
        # embed the source indentation in the logged message.
        logger.info(
            "Removed columns from data that are not specified in "
            f"configs['predictor_columns']: {extra_columns}"
        )

    # sort columns
    return data.reindex(keep_cols, axis="columns")

### apply processing

In [8]:
"""Preprocess data as dictated by the configs.
:param configs: configs
:type configs: dict
:param data: data
:type data: pd.dataframe
:return: data
:rtype: pd.dataframe
"""
# Assert we have the correct columns and order them, using the original
# implementation defined above.
data = correct_predictor_columns_original(configs, data)
# Alternative under test (swap with the line above to compare the two):
# data = correct_predictor_columns(configs, data)

data

Unnamed: 0_level_0,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1
2021-12-01 07:00:00+00:00,104.800003
2021-12-01 07:01:00+00:00,96.650002
2021-12-01 07:02:00+00:00,96.199997
2021-12-01 07:03:00+00:00,95.599998
2021-12-01 07:04:00+00:00,100.650002
...,...
2021-12-08 06:55:00+00:00,109.664803
2021-12-08 06:56:00+00:00,107.002800
2021-12-08 06:57:00+00:00,106.480400
2021-12-08 06:58:00+00:00,110.419998


In [9]:
# Sort and trim the data to the specified time period.
# NOTE(review): presumably configs["data_input"]["start_time"] / ["end_time"] — confirm.
data = correct_timestamps(configs, data)

data

Unnamed: 0_level_0,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1
2021-12-01 07:00:00+00:00,104.800003
2021-12-01 07:01:00+00:00,96.650002
2021-12-01 07:02:00+00:00,96.199997
2021-12-01 07:03:00+00:00,95.599998
2021-12-01 07:04:00+00:00,100.650002
...,...
2021-12-08 06:55:00+00:00,109.664803
2021-12-08 06:56:00+00:00,107.002800
2021-12-08 06:57:00+00:00,106.480400
2021-12-08 06:58:00+00:00,110.419998


In [10]:
# Add time-based features.
# Note the argument order is (data, configs) here, unlike correct_timestamps(configs, data).
data = add_processed_time_columns(data, configs)

data

Unnamed: 0_level_0,Synthetic Site Electricity Main Total Power,sin_HOD,cos_HOD,HOD_binary_reg_0,HOD_binary_reg_1,HOD_binary_reg_2,HOD_binary_reg_3,HOD_binary_reg_4,HOD_binary_reg_5,HOD_binary_reg_6,...,DOW_binary_reg_6,DOW_binary_fuzzy_0,DOW_binary_fuzzy_1,DOW_binary_fuzzy_2,DOW_binary_fuzzy_3,DOW_binary_fuzzy_4,DOW_binary_fuzzy_5,DOW_binary_fuzzy_6,sin_MOY,cos_MOY
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 07:00:00+00:00,104.800003,0.965926,-0.258819,0,0,0,0,0,0,0,...,0,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589
2021-12-01 07:01:00+00:00,96.650002,0.964787,-0.263031,0,0,0,0,0,0,0,...,0,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589
2021-12-01 07:02:00+00:00,96.199997,0.963630,-0.267238,0,0,0,0,0,0,0,...,0,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589
2021-12-01 07:03:00+00:00,95.599998,0.962455,-0.271440,0,0,0,0,0,0,0,...,0,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589
2021-12-01 07:04:00+00:00,100.650002,0.961262,-0.275637,0,0,0,0,0,0,0,...,0,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,109.664803,0.971342,-0.237686,0,0,0,0,0,0,1,...,0,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640
2021-12-08 06:56:00+00:00,107.002800,0.970296,-0.241922,0,0,0,0,0,0,1,...,0,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640
2021-12-08 06:57:00+00:00,106.480400,0.969231,-0.246153,0,0,0,0,0,0,1,...,0,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640
2021-12-08 06:58:00+00:00,110.419998,0.968148,-0.250380,0,0,0,0,0,0,1,...,0,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640


In [11]:
# Add statistics features.
# NOTE(review): presumably driven by configs["data_processing"]["resample"] and
# ["feat_stats"] — confirm against the wattile docs.
data = resample_or_rolling_stats(data, configs)

data

Unnamed: 0_level_0,sin_HOD_min,cos_HOD_min,HOD_binary_reg_0_min,HOD_binary_reg_1_min,HOD_binary_reg_2_min,HOD_binary_reg_3_min,HOD_binary_reg_4_min,HOD_binary_reg_5_min,HOD_binary_reg_6_min,HOD_binary_reg_7_min,...,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 07:00:00+00:00,0.965926,-2.588190e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,104.800003
2021-12-01 07:15:00+00:00,0.946930,-3.214395e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,103.650002
2021-12-01 07:30:00+00:00,0.923880,-3.826834e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,98.050003
2021-12-01 07:45:00+00:00,0.896873,-4.422887e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.708333,0.291667,0.0,0.0,0.0,-0.493776,0.869589,98.349998
2021-12-01 08:00:00+00:00,0.866025,-5.000000e-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.705556,0.294444,0.0,0.0,0.0,-0.493776,0.869589,102.949997
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 06:00:00+00:00,0.998135,-1.608123e-16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.788889,0.211111,0.0,0.0,0.0,-0.385663,0.922640,104.668396
2021-12-08 06:15:00+00:00,0.997859,-6.540313e-02,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,102.709602
2021-12-08 06:30:00+00:00,0.991445,-1.305262e-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,105.972198
2021-12-08 06:45:00+00:00,0.980785,-1.950903e-01,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,112.989998


In [12]:
# Record the feature count: one less than the number of columns
# (presumably every column except the target — confirm).
configs["input_dim"] = data.shape[1] - 1
logger.info("Number of features: {}".format(configs["input_dim"]))
logger.debug("Features: {}".format(data.columns.values))

# Add lag features with the routine matching the configured architecture
# version; any other version leaves `data` unchanged.
arch_version = configs["learning_algorithm"]["arch_version"]
for version, add_lags in (
    ("alfa", timelag_predictors),
    ("bravo", timelag_predictors_target),
    ("charlie", roll_predictors_target),
):
    if arch_version == version:
        data = add_lags(data, configs)
        break

data

Unnamed: 0_level_0,sin_HOD_min_lag24,cos_HOD_min_lag24,HOD_binary_reg_0_min_lag24,HOD_binary_reg_1_min_lag24,HOD_binary_reg_2_min_lag24,HOD_binary_reg_3_min_lag24,HOD_binary_reg_4_min_lag24,HOD_binary_reg_5_min_lag24,HOD_binary_reg_6_min_lag24,HOD_binary_reg_7_min_lag24,...,DOW_binary_fuzzy_0_mean,DOW_binary_fuzzy_1_mean,DOW_binary_fuzzy_2_mean,DOW_binary_fuzzy_3_mean,DOW_binary_fuzzy_4_mean,DOW_binary_fuzzy_5_mean,DOW_binary_fuzzy_6_mean,sin_MOY_mean,cos_MOY_mean,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 13:00:00+00:00,0.965926,-0.258819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.497222,0.502778,0.0,0.0,0.0,-0.493776,0.869589,99.349998
2021-12-01 13:15:00+00:00,0.946930,-0.321439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.458333,0.541667,0.0,0.0,0.0,-0.493776,0.869589,99.800003
2021-12-01 13:30:00+00:00,0.923880,-0.382683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.458333,0.541667,0.0,0.0,0.0,-0.493776,0.869589,98.900002
2021-12-01 13:45:00+00:00,0.896873,-0.442289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.458333,0.541667,0.0,0.0,0.0,-0.493776,0.869589,101.500000
2021-12-01 14:00:00+00:00,0.866025,-0.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.455556,0.544444,0.0,0.0,0.0,-0.493776,0.869589,98.504036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 06:00:00+00:00,-0.061049,0.998135,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.788889,0.211111,0.0,0.0,0.0,-0.385663,0.922640,104.668396
2021-12-08 06:15:00+00:00,0.004363,0.997859,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,102.709602
2021-12-08 06:30:00+00:00,0.069756,0.991445,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,105.972198
2021-12-08 06:45:00+00:00,0.134851,0.980785,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.750000,0.250000,0.0,0.0,0.0,-0.385663,0.922640,112.989998


In [13]:
# If validating with external data, write data to h5 for future testing.
learning_configs = configs["learning_algorithm"]
if (
    learning_configs["use_case"] == "validation"
    and learning_configs["test_method"] == "external"
):
    # `Path` is the name imported at the top of the notebook; the `pathlib`
    # module itself is never imported.
    filepath = Path(
        configs["data_input"]["data_dir"]
    ) / "{}_external_test.h5".format(configs["data_input"]["target_var"])
    data.to_hdf(filepath, key="df", mode="w")

if learning_configs["use_case"] == "train":
    train_df, val_df = input_data_split(data, configs)
else:
    # Outside training there is no split: the training frame is empty and the
    # whole processed frame is used for validation/prediction.
    # NOTE(review): the second element was missing in the notebook (the line
    # could not be unpacked); `data` mirrors wattile's prep_for_rnn — confirm.
    train_df, val_df = pd.DataFrame(), data

In [14]:
"""
Finally, we are ready to train our model!
"""
# Set up logging with the experiment directory as the local results directory.
init_logging(local_results_dir=configs["data_output"]["exp_dir"])
# Build the model described by the configs, then train it on the frames
# produced by the split cell above.
model = ModelFactory.create_model(configs)
model.train(train_df, val_df)

Logging to: C:\Users\JKIM4\Documents\GitHub\intelligentcampus-pred-analytics\notebooks\exp_dir\output.out, PID: 20556
saving timeseries comparison in C:\Users\JKIM4\Documents\GitHub\intelligentcampus-pred-analytics\notebooks\exp_dir/Vis_TimeseriesComparisons.svg
