In [1]:
import json 
import os
from pathlib import Path
import logging
from intelcamp.entry_point import create_input_dataframe, run_model
import pandas as pd
from pandas.tseries.frequencies import to_offset

import sys
import importlib
from  intelcamp import data_preprocessing
import json
import intelcamp.buildings_processing as bp
import pathlib
from  intelcamp import util

import numpy as np

# Notebook-wide logger; its name is the current process ID rendered as a string.
logger = logging.getLogger(str(os.getpid()))

In [2]:
def input_data_split(data, configs):
    """
    Split a data set into a training set and a validation (val) set.

    Only the "Random" method (``configs["train_val_split"]``) is implemented.
    Every row of ``data`` gets a label in a mask: 0 = train, 1 = validation,
    2 = test. When the test share in ``configs["data_split"]`` is 0, every
    non-training row becomes validation.

    Side effects: creates ``configs["data_dir"]`` and ``configs["exp_dir"]`` if
    missing, seeds NumPy's global RNG with ``configs["random_seed"]``, and writes
    two HDF files into ``exp_dir``: ``internal_test.h5`` (the test rows) and
    ``mask.h5`` (the mask, indexed like ``data``).

    :param data: (DataFrame) frame to split; with the splicer active its index
        must support subtraction and floor-division by a ``pd.Timedelta``.
    :param configs: (Dict) keys read: "data_split" ("train:val:test" in percent),
        "exp_dir", "data_dir", "train_val_split", "splicer" ("active", "time"),
        "random_seed", "train_size_factor".
    :return: (train_df, val_df), both with the original index replaced by a
        fresh 0..n-1 index.
    :raises ValueError: if ``configs["train_val_split"]`` is not "Random".
    """
    split_parts = configs["data_split"].split(":")
    train_ratio = int(split_parts[0]) / 100
    val_ratio = int(split_parts[1]) / 100
    test_ratio = int(split_parts[2]) / 100

    file_prefix = Path(configs["exp_dir"])

    # Reject unsupported methods before touching the file system. The original
    # code raised `ConfigsError`, a name that is neither imported nor defined in
    # this notebook, so the intended error surfaced as a NameError instead.
    if configs['train_val_split'] != 'Random':
        raise ValueError("{} is not a supported form of data splitting".format(configs['train_val_split']))

    Path(configs["data_dir"]).mkdir(parents=True, exist_ok=True)
    # Both output files are written into exp_dir, so it has to exist as well.
    file_prefix.mkdir(parents=True, exist_ok=True)
    mask_file = os.path.join(file_prefix, "mask.h5")
    logger.info("Creating random training mask and writing to file")

    np.random.seed(seed=configs["random_seed"])
    # Start with every row labelled as test (2); train/val labels overwrite it.
    msk = np.zeros(data.shape[0]) + 2

    # If you want to group datasets together into sequential chunks
    if configs["splicer"]["active"]:
        # Chunk id of every row: whole splicer periods elapsed since the first row.
        splicer = ((data.index - data.index[0]) // pd.Timedelta(configs["splicer"]["time"])).values
        # NOTE(review): chunk ids run 0..splicer[-1], so np.arange(num_chunks)
        # never offers the last chunk for train/val selection. Kept as-is to
        # preserve existing seeded splits; confirm whether this is intended.
        num_chunks = splicer[-1]
        # Round the training chunk count down to a multiple of train_size_factor.
        num_train_chunks = (train_ratio * num_chunks) - ((train_ratio * num_chunks) % configs["train_size_factor"])
        train_chunks = np.random.choice(np.arange(num_chunks), replace=False, size=int(num_train_chunks))
        msk[np.isin(splicer, train_chunks)] = 0

        # Set indices for validation and test set
        remaining_chunks = np.setdiff1d(np.arange(num_chunks), train_chunks)
        if test_ratio == 0:
            msk[msk != 0] = 1
        else:
            num_val_chunks = int((val_ratio / (1 - train_ratio)) * remaining_chunks.shape[0])
            val_chunks = np.random.choice(remaining_chunks, replace=False, size=num_val_chunks)
            msk[np.isin(splicer, val_chunks)] = 1

    # If you DONT want to group data into sequential chunks
    else:
        data_size = data.shape[0]
        # Round the training row count down to a multiple of train_size_factor.
        num_train_rows = (train_ratio * data_size) - ((train_ratio * data_size) % configs["train_size_factor"])
        train_indices = np.random.choice(np.arange(data_size), replace=False, size=int(num_train_rows))
        msk[train_indices] = 0

        # Set indices for validation and test set
        remaining_indices = np.where(msk != 0)[0]
        if test_ratio == 0:
            msk[remaining_indices] = 1
        else:
            num_val = int((val_ratio / (1 - train_ratio)) * remaining_indices.shape[0])
            val_indices = np.random.choice(remaining_indices, replace=False, size=num_val)
            msk[val_indices] = 1

    n_rows = msk.shape[0]
    logger.info("Train: {}, validation: {}, test: {}".format((msk == 0).sum()/n_rows, (msk == 1).sum()/n_rows, (msk == 2).sum()/n_rows))

    # Save test rows to file for later use (written with their original index)
    data[msk == 2].to_hdf(os.path.join(file_prefix, "internal_test.h5"), key='df', mode='w')

    # Still save the mask to file, indexed like `data`, to preserve the timeseries index
    mask = pd.DataFrame({'msk': msk}, index=data.index)
    mask.to_hdf(mask_file, key='df', mode='w')

    # Assign dataframes and get rid of the datetime index
    train_df = data[msk == 0].reset_index(drop=True)
    val_df = data[msk == 1].reset_index(drop=True)

    return train_df, val_df

### reading configs file

In [3]:
# Parse the experiment configuration JSON into a dict and show it as the cell output.
config_path = Path("intelcamp/configs.json")
with config_path.open("r") as config_file:
    configs = json.load(config_file)

configs

{'building': 'Synthetic Site',
 'target_var': 'Synthetic Site Electricity Main Total Power',
 'start_year': 2018,
 'start_month': 1,
 'start_day': 1,
 'end_year': 2021,
 'end_month': 12,
 'end_day': 31,
 'data_time_interval_mins': 1,
 'weather_include': [],
 'arch_version': 4,
 'exp_id': '7-9-21',
 'arch_type': 'RNN',
 'arch_type_variant': 'lstm',
 'preprocess': False,
 'fetch_n_parse': False,
 'transformation_method': 'minmaxscale',
 'train_batch_size': 26,
 'val_batch_size': 1,
 'convert_csvs': False,
 'exp_dir': 'exp_dir',
 'data_dir': 'data',
 'resample_freq': 15,
 'sequence_freq_min': 15,
 'splicer': {'active': False, 'time': '12hr'},
 'rolling_window': {'active': True, 'type': 'binned', 'minutes': 15},
 'window': 24,
 'EC_future_gap_min': 0,
 'DOW': ['binary_reg'],
 'MOY': ['sincos'],
 'HOD': ['sincos'],
 'Holidays': False,
 'S2S_stagger': {'initial_num': 72, 'decay': 0, 'secondary_num': 0},
 'train_size_factor': 1,
 'train_val_split': 'Random',
 'data_split': '80:10:10',
 'rando

### switching use_case parameter for prediction workflow testing

In [4]:
# Override the use case for this session. Later cells branch on this value:
# bp.clean_data() only runs when it is "training", and "validation" (together with
# test_method == "internal") makes the data-loading cell read internal_test.h5.
configs["use_case"] = 'validation'

### going through create_input_dataframe function

In [5]:
# Experiment directory as a path object; the data-loading cell reads internal_test.h5 from it.
# NOTE(review): this relies on intelcamp.util exposing a `Path` attribute — presumably
# pathlib.Path re-exported. The notebook already imports Path directly; confirm and simplify.
local_results_dir = util.Path(configs["exp_dir"])

In [6]:
# Preprocess if needed
if configs['preprocess']:
    # data_preprocessing.main hands back two frames plus a configs dict that replaces the current one
    train_df, val_df, configs = data_preprocessing.main(configs)

    # Save the frames returned by data_preprocessing (i.e. train_df and val_df) as CSV
    # NOTE(review): val_df is written to a file named "STM_Test_Data.csv" — confirm the name is intended.
    train_df.to_csv('./data/STM_Train_Data.csv')
    val_df.to_csv('./data/STM_Test_Data.csv')
else:
    # The preprocessing module defines the target_feat_name list and sends it back;
    # when preprocessing is skipped, set it here to a one-element list holding target_var.
    configs['target_feat_name'] = [configs['target_var']]

In [7]:
# Get the dataset
# Internal validation run: read <exp_dir>/internal_test.h5 (input_data_split writes a
# file of that name into exp_dir). Any other combination loads through bp.get_full_data.
# NOTE(review): configs["test_method"] is indexed directly, so a configs dict without
# that key raises KeyError whenever use_case is "validation".
if configs["use_case"] == "validation" and configs["test_method"] == "internal":
    data = pd.read_hdf(os.path.join(local_results_dir, "internal_test.h5"))
else:
    data = bp.get_full_data(configs)

In [8]:
# An empty 'weather_include' list keeps every column; a non-empty list narrows
# the frame to exactly those predictors plus the target variable.
if not configs['weather_include']:
    logger.info("all available predictor variables and target variable ({}) are included".format(configs['target_var']))
else:
    keep_cols = configs['weather_include'] + [configs['target_var']]
    data = data[keep_cols]
    logger.info("columns specified in the configs.json are only included")

In [9]:
data

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature,SRRL BMS Diffuse Horizontal Irradiance,SRRL BMS Direct Normal Irradiance,SRRL BMS Dry Bulb Temperature,SRRL BMS Global Horizontal Irradiance,SRRL BMS Relative Humidity,SRRL BMS Wind Speed at 19',Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.930000,10.457981,104.800003
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,17.879999,11.265539,96.650002
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,17.809999,12.777752,96.199997
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,17.820000,12.694983,95.599998
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,17.820000,11.632407,100.650002
...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,24.680000,0.000000,109.664803
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,24.930000,0.000000,107.002800
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,25.290001,3.545647,106.480400
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,25.920000,1.386941,110.419998


In [10]:
data.isna().sum().sum()

23

### replace with incomplete data: copying from original dataframe

In [11]:
# Work on an independent copy so the steps that blank out values do not modify `data`.
# Disabled alternative (kept as found): take the copy from a date slice instead of the full frame.
# data_temp = data.loc["2021-12-01":"2021-12-01" :,].copy()
data_temp = data.copy()
data_temp

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature,SRRL BMS Diffuse Horizontal Irradiance,SRRL BMS Direct Normal Irradiance,SRRL BMS Dry Bulb Temperature,SRRL BMS Global Horizontal Irradiance,SRRL BMS Relative Humidity,SRRL BMS Wind Speed at 19',Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.930000,10.457981,104.800003
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,17.879999,11.265539,96.650002
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,17.809999,12.777752,96.199997
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,17.820000,12.694983,95.599998
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,17.820000,11.632407,100.650002
...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,24.680000,0.000000,109.664803
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,24.930000,0.000000,107.002800
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,25.290001,3.545647,106.480400
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,25.920000,1.386941,110.419998


### replace with incomplete data: giving some columns a larger measurement interval

In [12]:
# Parallel lists consumed together with zip() by the resampling loop:
# entry k of each list describes one column to coarsen.
# Columns to resample.
list_cols = ['SRRL BMS Relative Humidity', 'SRRL BMS Wind Speed at 19\'', 'Synthetic Site Electricity Main Total Power']
# Resampling interval for each column, in minutes.
list_interval_mins = [5, 12, 10]
# Offset added to each column's resampled timestamps, in minutes.
list_timeshift_mins = [0, 3, 7]

In [13]:
# Coarsen each selected column: average it over `timestep`-minute bins, move the
# bin timestamps forward by `loffset` minutes, and write the result back.
# Assigning the resampled series aligns it on data_temp's index, so rows whose
# timestamp is not a shifted bin label become NaN in that column.
# NOTE: this cell is not idempotent — re-running it resamples the already sparse columns.
for col, timestep, loffset in zip(list_cols, list_interval_mins, list_timeshift_mins):

    print("resampling and shifting column = {} with resampling timestep of {} and time-shift of {}".format(col, timestep, loffset))

    # "min" is the minute alias; the single-letter "T" alias is deprecated since pandas 2.2.
    resample_rule = "{}min".format(timestep)
    shift = to_offset("{}min".format(loffset))

    df_temp = data_temp[col].resample(resample_rule).mean()
    df_temp.index = df_temp.index + shift
    data_temp[col] = df_temp

resampling and shifting column = SRRL BMS Relative Humidity with resampling timestep of 5 and time-shift of 0
resampling and shifting column = SRRL BMS Wind Speed at 19' with resampling timestep of 12 and time-shift of 3
resampling and shifting column = Synthetic Site Electricity Main Total Power with resampling timestep of 10 and time-shift of 7


In [14]:
data_temp.head(20)

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature,SRRL BMS Diffuse Horizontal Irradiance,SRRL BMS Direct Normal Irradiance,SRRL BMS Dry Bulb Temperature,SRRL BMS Global Horizontal Irradiance,SRRL BMS Relative Humidity,SRRL BMS Wind Speed at 19',Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.52585,-0.005199,56.408001,-1.810825,17.852,,
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174,-1.841866,,,
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.45752,55.886002,-1.873579,,,
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,,11.421756,
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,,,
2021-12-01 07:05:00+00:00,15.032702,-1.280413,0.644685,55.526001,-1.856863,18.468,,
2021-12-01 07:06:00+00:00,14.988927,-1.259775,0.649885,55.346001,-1.839231,,,
2021-12-01 07:07:00+00:00,15.09557,-1.256177,0.629089,55.220001,-1.841384,,,99.54
2021-12-01 07:08:00+00:00,15.158759,-1.238111,0.441922,55.112,-1.832426,,,
2021-12-01 07:09:00+00:00,15.601736,-1.246259,0.150773,55.166,-1.813277,,,


In [15]:
data_temp.isna().sum().sum()

26392

### replace with incomplete data: putting NaNs in random places

In [16]:
# Share of rows and, separately, of columns drawn by the sampling cell; every sampled
# (row, column) pair is then blanked out. With 0.0 nothing is sampled, so no NaNs are added.
fraction = 0.0

In [17]:
# Draw the row labels and the column names to blank out, each without replacement and
# with its own fixed random_state (1 for rows, 2 for columns) so the draw is repeatable.
list_index_random = data_temp.sample(frac=fraction, replace=False, random_state=1).index.tolist()
# Columns are sampled by wrapping the column index in a one-column DataFrame and reading that column back.
list_column_random = pd.DataFrame(data_temp.columns).sample(frac=fraction, replace=False, random_state=2).iloc[:,0].tolist()

In [18]:
list_column_random

[]

In [19]:
# Blank out every sampled (row, column) combination in data_temp.
# With fraction = 0.0 both lists are empty and the loop body never runs.
for ind in list_index_random:
    for col in list_column_random:
        print("replacing value in index = {} and column = {} to blank".format(ind, col))
        # np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0.
        data_temp.loc[data_temp.index == ind, data_temp.columns == col] = np.nan

In [20]:
data_temp

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature,SRRL BMS Diffuse Horizontal Irradiance,SRRL BMS Direct Normal Irradiance,SRRL BMS Dry Bulb Temperature,SRRL BMS Global Horizontal Irradiance,SRRL BMS Relative Humidity,SRRL BMS Wind Speed at 19',Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.852,,
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,,,
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,,,
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,,11.421756,
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,,,
...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,25.384,,
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,,,
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,,,107.30626
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,,,


In [21]:
data_temp.isna().sum().sum()

26392

### copying incomplete data back to the original dataframe

In [22]:
# Replace `data` with a copy of the modified frame so the remaining cells run on the incomplete data.
data = data_temp.copy()

### cleaning 1: it will not execute for prediction

In [23]:
# Do some preprocessing, but only if the dataset needs it
# (bp.clean_data runs only for the "training" use case; this notebook sets use_case to
# 'validation', so the call is skipped in the recorded run).
if configs["use_case"] == "training":
    data = bp.clean_data(data, configs)

In [24]:
data

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature,SRRL BMS Diffuse Horizontal Irradiance,SRRL BMS Direct Normal Irradiance,SRRL BMS Dry Bulb Temperature,SRRL BMS Global Horizontal Irradiance,SRRL BMS Relative Humidity,SRRL BMS Wind Speed at 19',Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.852,,
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,,,
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,,,
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,,11.421756,
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,,,
...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,25.384,,
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,,,
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,,,107.30626
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,,,


In [25]:
data.isna().sum().sum()

26392

### time-based feature extraction

In [26]:
# Add time-based features
# In the recorded run this appended sin_HOD/cos_HOD, DOW_binary_reg_0..6 and
# sin_MOY/cos_MOY columns (see the frame displayed by the next cell).
data = bp.time_dummies(data, configs)

In [27]:
data

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature,SRRL BMS Diffuse Horizontal Irradiance,SRRL BMS Direct Normal Irradiance,SRRL BMS Dry Bulb Temperature,SRRL BMS Global Horizontal Irradiance,SRRL BMS Relative Humidity,SRRL BMS Wind Speed at 19',Synthetic Site Electricity Main Total Power,sin_HOD,cos_HOD,DOW_binary_reg_0,DOW_binary_reg_1,DOW_binary_reg_2,DOW_binary_reg_3,DOW_binary_reg_4,DOW_binary_reg_5,DOW_binary_reg_6,sin_MOY,cos_MOY
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.852,,,0.965926,-0.258819,0,0,1,0,0,0,0,-0.493776,0.869589
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,,,,0.964787,-0.263031,0,0,1,0,0,0,0,-0.493776,0.869589
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,,,,0.963630,-0.267238,0,0,1,0,0,0,0,-0.493776,0.869589
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,,11.421756,,0.962455,-0.271440,0,0,1,0,0,0,0,-0.493776,0.869589
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,,,,0.961262,-0.275637,0,0,1,0,0,0,0,-0.493776,0.869589
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,25.384,,,0.971342,-0.237686,0,0,1,0,0,0,0,-0.385663,0.922640
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,,,,0.970296,-0.241922,0,0,1,0,0,0,0,-0.385663,0.922640
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,,,107.30626,0.969231,-0.246153,0,0,1,0,0,0,0,-0.385663,0.922640
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,,,,0.968148,-0.250380,0,0,1,0,0,0,0,-0.385663,0.922640


In [28]:
data.isna().sum().sum()

26392

### statistics-based feature extraction

In [29]:
# Add statistics features
# Only runs when configs["rolling_window"]["active"] is truthy. In the recorded run the
# result has 15-minute timestamps, suffixed feature columns (e.g. *_min, *_max) and
# no remaining NaNs (see the outputs of the following cells).
if configs["rolling_window"]["active"]:
    data = bp.rolling_stats(data, configs)

In [30]:
data

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature_min,SRRL BMS Diffuse Horizontal Irradiance_min,SRRL BMS Direct Normal Irradiance_min,SRRL BMS Dry Bulb Temperature_min,SRRL BMS Global Horizontal Irradiance_min,SRRL BMS Relative Humidity_min,SRRL BMS Wind Speed at 19'_min,sin_HOD_min,cos_HOD_min,DOW_binary_reg_0_min,...,DOW_binary_reg_0_max,DOW_binary_reg_1_max,DOW_binary_reg_2_max,DOW_binary_reg_3_max,DOW_binary_reg_4_max,DOW_binary_reg_5_max,DOW_binary_reg_6_max,sin_MOY_max,cos_MOY_max,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 07:00:00+00:00,14.782966,-1.525850,-0.540707,55.112000,-1.873579,17.852,11.421756,0.948324,-0.317305,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,99.540000
2021-12-01 07:15:00+00:00,15.474850,-1.388764,-1.663717,54.608002,-1.918852,18.922,9.984669,0.925541,-0.378649,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,99.620001
2021-12-01 07:30:00+00:00,16.633362,-1.642929,-2.147240,53.978001,-2.029242,19.872,6.986714,0.898794,-0.438371,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,99.455000
2021-12-01 07:45:00+00:00,16.413343,-1.447732,-0.426325,52.807999,-1.979830,21.134,5.593063,0.868199,-0.496217,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,100.657500
2021-12-01 08:00:00+00:00,16.649153,-1.512877,-0.613491,52.034000,-2.002243,21.676,3.771957,0.833886,-0.551937,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,100.135001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 05:45:00+00:00,11.961447,-1.673728,-1.211398,40.976601,-1.908751,25.408,0.000000,0.997859,0.004363,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.435450
2021-12-08 06:00:00+00:00,11.080105,-1.489343,-0.504314,41.646198,-1.936419,24.888,0.000000,0.998135,-0.061049,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.112099
2021-12-08 06:15:00+00:00,10.851442,-1.571301,-0.655083,42.024200,-1.834018,23.926,0.000000,0.992005,-0.126199,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.059710
2021-12-08 06:30:00+00:00,10.827111,-1.430832,-0.826657,40.960400,-1.865315,24.442,3.595607,0.981627,-0.190809,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.386578


In [31]:
data.isna().sum().sum()

0

In [32]:
# Drop every row that still contains at least one NaN, then display the result.
# In the recorded run the NaN count just before this cell was already 0.
data = data.dropna(how="any")
data

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature_min,SRRL BMS Diffuse Horizontal Irradiance_min,SRRL BMS Direct Normal Irradiance_min,SRRL BMS Dry Bulb Temperature_min,SRRL BMS Global Horizontal Irradiance_min,SRRL BMS Relative Humidity_min,SRRL BMS Wind Speed at 19'_min,sin_HOD_min,cos_HOD_min,DOW_binary_reg_0_min,...,DOW_binary_reg_0_max,DOW_binary_reg_1_max,DOW_binary_reg_2_max,DOW_binary_reg_3_max,DOW_binary_reg_4_max,DOW_binary_reg_5_max,DOW_binary_reg_6_max,sin_MOY_max,cos_MOY_max,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 07:00:00+00:00,14.782966,-1.525850,-0.540707,55.112000,-1.873579,17.852,11.421756,0.948324,-0.317305,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,99.540000
2021-12-01 07:15:00+00:00,15.474850,-1.388764,-1.663717,54.608002,-1.918852,18.922,9.984669,0.925541,-0.378649,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,99.620001
2021-12-01 07:30:00+00:00,16.633362,-1.642929,-2.147240,53.978001,-2.029242,19.872,6.986714,0.898794,-0.438371,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,99.455000
2021-12-01 07:45:00+00:00,16.413343,-1.447732,-0.426325,52.807999,-1.979830,21.134,5.593063,0.868199,-0.496217,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,100.657500
2021-12-01 08:00:00+00:00,16.649153,-1.512877,-0.613491,52.034000,-2.002243,21.676,3.771957,0.833886,-0.551937,0,...,0,0,1,0,0,0,0,-0.493776,0.869589,100.135001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 05:45:00+00:00,11.961447,-1.673728,-1.211398,40.976601,-1.908751,25.408,0.000000,0.997859,0.004363,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.435450
2021-12-08 06:00:00+00:00,11.080105,-1.489343,-0.504314,41.646198,-1.936419,24.888,0.000000,0.998135,-0.061049,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.112099
2021-12-08 06:15:00+00:00,10.851442,-1.571301,-0.655083,42.024200,-1.834018,23.926,0.000000,0.992005,-0.126199,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.059710
2021-12-08 06:30:00+00:00,10.827111,-1.430832,-0.826657,40.960400,-1.865315,24.442,3.595607,0.981627,-0.190809,0,...,0,0,1,0,0,0,0,-0.385663,0.922640,108.386578


### time-lag feature extraction

In [33]:
# Add lag features
# All columns but one count as model inputs — presumably the excluded one is the
# target column; confirm against bp.pad_full_data.
configs['input_dim'] = data.shape[1] - 1
logger.info("Number of features: {}".format(configs['input_dim']))
logger.debug("Features: {}".format(data.columns.values))
# Choose the padding routine by architecture version.
# NOTE(review): any arch_version other than 4 or 5 falls through and leaves `data`
# without lag features, with no warning.
if configs["arch_version"] == 4:
    data = bp.pad_full_data(data, configs)
elif configs["arch_version"] == 5:
    data = bp.pad_full_data_s2s(data, configs)

In [34]:
data

Unnamed: 0_level_0,SRRL BMS Dew Point Temperature_min_lag24,SRRL BMS Diffuse Horizontal Irradiance_min_lag24,SRRL BMS Direct Normal Irradiance_min_lag24,SRRL BMS Dry Bulb Temperature_min_lag24,SRRL BMS Global Horizontal Irradiance_min_lag24,SRRL BMS Relative Humidity_min_lag24,SRRL BMS Wind Speed at 19'_min_lag24,sin_HOD_min_lag24,cos_HOD_min_lag24,DOW_binary_reg_0_min_lag24,...,DOW_binary_reg_0_max,DOW_binary_reg_1_max,DOW_binary_reg_2_max,DOW_binary_reg_3_max,DOW_binary_reg_4_max,DOW_binary_reg_5_max,DOW_binary_reg_6_max,sin_MOY_max,cos_MOY_max,Synthetic Site Electricity Main Total Power
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-12-01 13:00:00+00:00,14.782966,-1.525850,-0.540707,55.112000,-1.873579,17.851999,11.421756,0.948324,-0.317305,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,100.645001
2021-12-01 13:15:00+00:00,15.474850,-1.388764,-1.663717,54.608002,-1.918852,18.921999,9.984669,0.925541,-0.378649,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,100.140001
2021-12-01 13:30:00+00:00,16.633362,-1.642929,-2.147240,53.978001,-2.029242,19.872000,6.986714,0.898794,-0.438371,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,100.130001
2021-12-01 13:45:00+00:00,16.413343,-1.447732,-0.426325,52.807999,-1.979830,21.133999,5.593062,0.868199,-0.496217,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,100.426001
2021-12-01 14:00:00+00:00,16.649153,-1.512877,-0.613491,52.034000,-2.002243,21.676001,3.771957,0.833886,-0.551937,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.493776,0.869589,99.733766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-08 05:45:00+00:00,16.623901,-0.804934,-0.644684,47.695999,-1.170910,26.174000,17.456253,-0.065403,0.997859,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,108.435450
2021-12-08 06:00:00+00:00,16.985941,-0.961478,-0.831852,47.714001,-1.326790,26.430000,21.383869,0.000000,0.998135,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,108.112099
2021-12-08 06:15:00+00:00,16.848026,-1.045253,-0.634286,47.588001,-1.507714,26.780001,17.719288,0.065403,0.992005,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,108.059710
2021-12-08 06:30:00+00:00,16.982306,-1.165292,-0.499114,47.192001,-1.503043,26.993999,16.742464,0.130526,0.981627,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.385663,0.922640,108.386578


In [35]:
data.isna().sum().sum()

0

### cleaning 2: it will not execute for prediction

In [37]:
# if configs["use_case"] == "training":
#     data = data.loc[:, (data != 0).any(axis=0)]

In [38]:
# data

### input_data_split

In [39]:
# NOTE(review): disabled cell. The condition tests use_case == "train", whereas the
# active cleaning cell in this notebook tests "training" — reconcile the spelling
# before re-enabling.
# if configs["use_case"] == "train" or configs["use_case"] == "prediction":
#     # split data into training/validation/testing sets
#     train_df, val_df = input_data_split(data, configs)