In [1]:
import json 
import os
from pathlib import Path
import logging
from intelcamp.entry_point import create_input_dataframe, run_model
import pandas as pd

# Notebook-wide logger; its name is the current process ID rendered as a string.
logger = logging.getLogger(f"{os.getpid()}")

In [2]:
# Load the experiment configuration that drives the rest of the notebook.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying
# on the platform's locale default.
with open("intelcamp/configs.json", "r", encoding="utf-8") as f:
    configs = json.load(f)

# Display the loaded configuration for reference.
configs

{'building': 'Synthetic Site',
 'target_var': 'Synthetic Site Electricity Main Total Power',
 'start_year': 2018,
 'start_month': 1,
 'start_day': 1,
 'end_year': 2021,
 'end_month': 12,
 'end_day': 31,
 'data_time_interval_mins': 1,
 'weather_include': [],
 'arch_version': 4,
 'exp_id': '7-9-21',
 'arch_type': 'RNN',
 'arch_type_variant': 'lstm',
 'preprocess': False,
 'fetch_n_parse': False,
 'transformation_method': 'minmaxscale',
 'train_batch_size': 26,
 'val_batch_size': 1,
 'convert_csvs': False,
 'exp_dir': 'exp_dir',
 'data_dir': 'data',
 'resample_freq': 15,
 'sequence_freq_min': 15,
 'splicer': {'active': False, 'time': '12hr'},
 'rolling_window': {'active': True, 'type': 'binned', 'minutes': 15},
 'window': 24,
 'EC_future_gap_min': 0,
 'DOW': ['binary_reg'],
 'MOY': ['sincos'],
 'HOD': ['sincos'],
 'Holidays': False,
 'S2S_stagger': {'initial_num': 72, 'decay': 0, 'secondary_num': 0},
 'train_size_factor': 1,
 'train_val_split': 'Random',
 'data_split': '80:10:10',
 'rando

In [3]:
"""
Fetches all data for a requested building based on the information reflected in the input data summary json file.

:param configs: (Dictionary)
:return: (DataFrame)
"""

# This cell reads `configs` and `logger` from the cells above and produces:
#   df_inputdata  - one row per data file listed in the input data summary JSON
#   df_datapoints - predictor + target datapoint metadata
#   data_full     - measurements indexed by a UTC 'Timestamp'
# It also stores a column-name -> datapoint-id mapping in
# configs['skyspark_datapoint_info'].

# NOTE(review): ConfigsError is raised below but never imported in this notebook,
# so every raise would surface as a NameError. Import the project's own class in
# the imports cell once its module path is confirmed; until then this fallback
# keeps the error path meaningful.
if "ConfigsError" not in globals():
    class ConfigsError(Exception):
        """Raised when the configs select no usable input data."""

# Assuming there is only one JSON file in the folder summarizing the input data.
configs_file_inputdata = Path(configs['data_dir']) / configs['building'] / f"{configs['building']} Config.json"
logger.info("Pre-process: reading input data summary json file from {}".format(configs_file_inputdata))
# Pin UTF-8 (the JSON default) so non-ASCII text such as unit symbols does not
# depend on the platform's locale encoding.
with open(configs_file_inputdata, "r", encoding="utf-8") as read_file:
    configs_input = json.load(read_file)

# Converting the file listing into a dataframe.
df_inputdata = pd.DataFrame(configs_input['files'])

# Collecting datapoint information (predictors first, then targets).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_datapoints = pd.concat(
    [pd.DataFrame(configs_input['predictors']), pd.DataFrame(configs_input['targets'])],
    ignore_index=True,
)
dict_datapoints = dict(zip(df_datapoints['column'], df_datapoints['id']))
configs['skyspark_datapoint_info'] = dict_datapoints

# Converting the date time columns into tz-aware (UTC) pandas datetimes.
# exact=False lets the pattern match inside the raw string.
df_inputdata['start'] = pd.to_datetime(df_inputdata['start'], format="t:%Y-%m-%dT%H:%M:%S%z", exact=False, utc=True)
df_inputdata['end'] = pd.to_datetime(df_inputdata['end'], format="t:%Y-%m-%dT%H:%M:%S%z", exact=False, utc=True)

# Creating threshold dates from the configs.
timestamp_start = pd.Timestamp(configs['start_year'], configs['start_month'], configs['start_day'], 0)
if configs['end_month'] == 12 and configs['end_day'] == 31:
    timestamp_end = pd.Timestamp(configs['end_year'] + 1, 1, 1, 0)
else:
    # Same day one month later. DateOffset rolls December over into the next
    # year and clamps to the month end (e.g. Jan 31 -> Feb 28), where the
    # previous `end_month + 1` arithmetic raised ValueError.
    timestamp_end = (
        pd.Timestamp(configs['end_year'], configs['end_month'], configs['end_day'], 0)
        + pd.DateOffset(months=1)
    )

# Filtering input files based on the user specified date period. The comparison
# is done on UTC calendar days: file timestamps are normalized to midnight and
# the (naive) thresholds are localized to UTC, which avoids comparing
# datetime.date objects against Timestamps (a TypeError in pandas 2.x).
start_day_utc = df_inputdata['start'].dt.normalize()
end_day_utc = df_inputdata['end'].dt.normalize()
in_period = (start_day_utc >= timestamp_start.tz_localize("UTC")) & (end_day_utc <= timestamp_end.tz_localize("UTC"))
# .copy() so the column assignment below writes to an independent frame rather
# than a slice (avoids SettingWithCopyWarning).
df_inputdata = df_inputdata.loc[in_period, :].copy()
df_inputdata['path'] = configs['data_dir'] + "/" + configs['building'] + "/" + df_inputdata['filename']

if df_inputdata.empty:
    logger.info("Pre-process: measurements during the specified time period ({} to {}) are empty.".format(timestamp_start, timestamp_end))

    raise ConfigsError("No datapoints found in dataset for specified timeframe.")

# Read every listed file. Frames are collected in lists and concatenated once
# afterwards instead of growing a DataFrame inside the loop.
predictor_frames = []
target_frames = []
for datatype in df_inputdata['contentType'].unique():

    for filepath in df_inputdata.loc[df_inputdata['contentType'] == datatype, 'path']:

        if datatype == "predictors":
            logger.info("Pre-process: reading predictor file = {}".format(filepath.split(configs['data_dir'])[1]))
            try:
                predictor_frames.append(pd.read_csv(filepath))
            except Exception:
                # Best effort: skip unreadable files, but keep the traceback in the log.
                logger.info("Pre-process: error in read_csv with predictor file {}. not reading..".format(filepath.split(configs['data_dir'])[1]), exc_info=True)
                continue
        elif datatype == "targets":
            logger.info("Pre-process: reading target file = {}".format(filepath.split(configs['data_dir'])[1]))
            try:
                # Only the timestamp and the configured target column are kept.
                target_frames.append(pd.read_csv(filepath)[['Timestamp', configs["target_var"]]])
            except Exception:
                # Best effort: skip unreadable files (or files missing the target column).
                logger.info("Pre-process: error in read_csv with target file {}. not reading..".format(filepath.split(configs['data_dir'])[1]), exc_info=True)
                continue
        else:
            logger.info("Pre-process: input file not properly differentiated between Predictors and Targets")

# pd.concat rejects an empty list, so fall back to an empty frame.
data_full_p = pd.concat(predictor_frames) if predictor_frames else pd.DataFrame()
data_full_t = pd.concat(target_frames) if target_frames else pd.DataFrame()

if data_full_p.empty:
    logger.info("Pre-process: predictor dataframe is empty. Exiting process...")

    raise ConfigsError("No datapoints found in dataset for specified timeframe.")

elif data_full_t.empty and configs["use_case"] != "prediction":
    logger.info("Pre-process: target dataframe is empty. Exiting process...")

    raise ConfigsError("No datapoints found in dataset for specified timeframe.")

if configs["use_case"] == "prediction":
    # No measured target in prediction mode: fill the target column with a -999 placeholder.
    data_full = data_full_p
    data_full[configs["target_var"]] = -999

else:
    # Outer join keeps timestamps that appear in only one of the two frames.
    data_full = pd.merge(data_full_p, data_full_t, how='outer', on='Timestamp')

data_full['Timestamp'] = pd.to_datetime(data_full['Timestamp'], format="%Y-%m-%dT%H:%M:%S%z", exact=False, utc=True)
data_full = data_full.set_index('Timestamp')

In [4]:
# Inspect the combined predictor + target datapoint metadata built from the input data summary JSON.
df_datapoints

Unnamed: 0,site,column,id,description,unit,pv
0,Synthetic Site,SRRL BMS Dew Point Temperature,r:p:dash:r:278f8943-6a199bd7 SRRL BMS Dew Poin...,SRRL BMS Dew Point Temperature,Â°F,False
1,Synthetic Site,SRRL BMS Diffuse Horizontal Irradiance,r:p:dash:r:278f79c0-03da5abc SRRL BMS Diffuse ...,SRRL BMS Diffuse Horizontal Irradiance,W/mÂ²_irr,False
2,Synthetic Site,SRRL BMS Direct Normal Irradiance,r:p:dash:r:278f79c0-4fe536b4 SRRL BMS Direct N...,SRRL BMS Direct Normal Irradiance,W/mÂ²_irr,False
3,Synthetic Site,SRRL BMS Dry Bulb Temperature,r:p:dash:r:278f79c0-8a8f26b0 SRRL BMS Dry Bulb...,SRRL BMS Dry Bulb Temperature,Â°F,False
4,Synthetic Site,SRRL BMS Global Horizontal Irradiance,r:p:dash:r:278f79c0-8d722bdb SRRL BMS Global H...,SRRL BMS Global Horizontal Irradiance,W/mÂ²_irr,False
5,Synthetic Site,SRRL BMS Relative Humidity,r:p:dash:r:278f79c0-e7db4305 SRRL BMS Relative...,SRRL BMS Relative Humidity,%RH,False
6,Synthetic Site,SRRL BMS Wind Speed at 19',r:p:dash:r:278f79c0-c1f48e5c SRRL BMS Wind Spe...,SRRL BMS Wind Speed at 19',mph,False
7,Synthetic Site,Synthetic Site Electricity Main Total Power,r:p:dash:r:294fd256-a17bb5c7 Synthetic Site El...,Synthetic Site Electricity Main Total Power,kW,False


In [5]:
# Re-display configs; the data-loading cell added a 'skyspark_datapoint_info' entry (column name -> datapoint id) to it.
configs

{'building': 'Synthetic Site',
 'target_var': 'Synthetic Site Electricity Main Total Power',
 'start_year': 2018,
 'start_month': 1,
 'start_day': 1,
 'end_year': 2021,
 'end_month': 12,
 'end_day': 31,
 'data_time_interval_mins': 1,
 'weather_include': [],
 'arch_version': 4,
 'exp_id': '7-9-21',
 'arch_type': 'RNN',
 'arch_type_variant': 'lstm',
 'preprocess': False,
 'fetch_n_parse': False,
 'transformation_method': 'minmaxscale',
 'train_batch_size': 26,
 'val_batch_size': 1,
 'convert_csvs': False,
 'exp_dir': 'exp_dir',
 'data_dir': 'data',
 'resample_freq': 15,
 'sequence_freq_min': 15,
 'splicer': {'active': False, 'time': '12hr'},
 'rolling_window': {'active': True, 'type': 'binned', 'minutes': 15},
 'window': 24,
 'EC_future_gap_min': 0,
 'DOW': ['binary_reg'],
 'MOY': ['sincos'],
 'HOD': ['sincos'],
 'Holidays': False,
 'S2S_stagger': {'initial_num': 72, 'decay': 0, 'secondary_num': 0},
 'train_size_factor': 1,
 'train_val_split': 'Random',
 'data_split': '80:10:10',
 'rando