In [1]:
import json 
from pathlib import Path
import shutil

from wattile.data_reading import _get_dataset_config

# Absolute path two directory levels above the current working directory.
# NOTE(review): presumably the repository root when the notebook is run from
# its own folder — confirm before running from a different location.
PROJECT_DIRECTORY = Path().resolve().parent.parent

# read configs

In [2]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
##################################################################################
# choose the configs file to use as an input
##################################################################################
# main configs file
# An explicit encoding keeps the JSON read independent of the platform default.
with open(
    PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r", encoding="utf-8"
) as f:
    configs = json.load(f)
##################################################################################
# code testing configs file
# with open(
#     PROJECT_DIRECTORY / "tests" / "fixtures" / "test_configs.json", "r", encoding="utf-8"
# ) as f:
#     configs = json.load(f)
##################################################################################

# Point the configs at the bundled synthetic dataset. The path is joined with
# pathlib so the separators are consistent on every platform (string
# concatenation with "/" produced mixed "\\" and "/" separators on Windows).
configs["data_dir"] = str(PROJECT_DIRECTORY / "data" / "Synthetic Site")

configs

{'target_var': 'Synthetic Site Electricity Main Total Power',
 'start_time': '2018-01-01T00:00:00-07:00',
 'end_time': '2022-01-01T00:00:00-07:00',
 'predictor_columns': ['Synthetic Weather Station Dew Point Temperature',
  'Synthetic Weather Station Diffuse Horizontal Irradiance',
  'Synthetic Weather Station Direct Normal Irradiance',
  'Synthetic Weather Station Dry Bulb Temperature',
  'Synthetic Weather Station Global Horizontal Irradiance',
  'Synthetic Weather Station Relative Humidity',
  'Synthetic Weather Station Wind Speed'],
 'arch_version': 'alfa',
 'exp_id': '7-9-21',
 'arch_type': 'RNN',
 'arch_type_variant': 'lstm',
 'transformation_method': 'minmaxscale',
 'train_batch_size': 26,
 'val_batch_size': 1,
 'convert_csvs': False,
 'exp_dir': 'exp_dir',
 'data_dir': 'C:\\Users\\JKIM4\\Documents\\GitHub\\intelligentcampus-pred-analytics/data/Synthetic Site',
 'data_config': 'Synthetic Site Config.json',
 'resample_freq': 15,
 'resample_interval': '15min',
 'sequential_splicer

# Testing

In [3]:
import datetime as dt
import pandas as pd
import os
import logging
# Logger whose name is the current process ID (as a string).
logger = logging.getLogger(str(os.getpid()))

In [4]:
# Call wattile's `_get_dataset_config` with the loaded configs and show the result.
# NOTE(review): judging by the displayed output, each row describes one data
# file (filename, contentType, start, end, path) — confirm against the helper.
df_inputdata = _get_dataset_config(configs)
df_inputdata

Unnamed: 0,filename,contentType,start,end,path
0,Synthetic Site Targets 2021-12-01.csv,targets,2021-12-01 07:00:00+00:00,2021-12-02 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
1,Synthetic Site Targets 2021-12-02.csv,targets,2021-12-02 07:00:00+00:00,2021-12-03 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
2,Synthetic Site Targets 2021-12-03.csv,targets,2021-12-03 07:00:00+00:00,2021-12-04 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
3,Synthetic Site Targets 2021-12-04.csv,targets,2021-12-04 07:00:00+00:00,2021-12-05 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
4,Synthetic Site Targets 2021-12-05.csv,targets,2021-12-05 07:00:00+00:00,2021-12-06 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
5,Synthetic Site Targets 2021-12-06.csv,targets,2021-12-06 07:00:00+00:00,2021-12-07 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
6,Synthetic Site Targets 2021-12-07.csv,targets,2021-12-07 07:00:00+00:00,2021-12-08 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
7,Synthetic Site Predictors 2021-12-01.csv,predictors,2021-12-01 07:00:00+00:00,2021-12-02 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
8,Synthetic Site Predictors 2021-12-02.csv,predictors,2021-12-02 07:00:00+00:00,2021-12-03 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
9,Synthetic Site Predictors 2021-12-03.csv,predictors,2021-12-03 07:00:00+00:00,2021-12-04 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...


In [5]:

# ConfigsError is raised below but is not imported by any of the cells above;
# without this import the empty-data branch fails with NameError instead.
from wattile.error import ConfigsError

# only read from files whose timespan intersects with the configs
# the extra will be removed in `prep_for_rnn`
timestamp_start = dt.datetime.fromisoformat(configs["start_time"])
timestamp_end = dt.datetime.fromisoformat(configs["end_time"])

# Interval-overlap test: a file is kept when it starts no later than the
# configured end and ends no earlier than the configured start.
df_inputdata = df_inputdata.loc[
    (df_inputdata.start <= timestamp_end) & (df_inputdata.end >= timestamp_start), :
]

# Stop here when no file covers the requested window.
if df_inputdata.empty:
    logger.info(
        "Pre-process: measurements during the specified time period "
        f"({timestamp_start} to {timestamp_end}) are empty."
    )

    raise ConfigsError("No datapoints found in dataset for specified timeframe.")


In [6]:
# Display the file table after the time-window filter applied in the previous cell.
df_inputdata

Unnamed: 0,filename,contentType,start,end,path
0,Synthetic Site Targets 2021-12-01.csv,targets,2021-12-01 07:00:00+00:00,2021-12-02 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
1,Synthetic Site Targets 2021-12-02.csv,targets,2021-12-02 07:00:00+00:00,2021-12-03 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
2,Synthetic Site Targets 2021-12-03.csv,targets,2021-12-03 07:00:00+00:00,2021-12-04 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
3,Synthetic Site Targets 2021-12-04.csv,targets,2021-12-04 07:00:00+00:00,2021-12-05 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
4,Synthetic Site Targets 2021-12-05.csv,targets,2021-12-05 07:00:00+00:00,2021-12-06 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
5,Synthetic Site Targets 2021-12-06.csv,targets,2021-12-06 07:00:00+00:00,2021-12-07 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
6,Synthetic Site Targets 2021-12-07.csv,targets,2021-12-07 07:00:00+00:00,2021-12-08 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
7,Synthetic Site Predictors 2021-12-01.csv,predictors,2021-12-01 07:00:00+00:00,2021-12-02 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
8,Synthetic Site Predictors 2021-12-02.csv,predictors,2021-12-02 07:00:00+00:00,2021-12-03 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
9,Synthetic Site Predictors 2021-12-03.csv,predictors,2021-12-03 07:00:00+00:00,2021-12-04 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...


In [7]:
# Keep only the rows whose contentType is "predictors".
# This selects rows of the file table; no file contents are read here.
predictor_data_info = df_inputdata.loc[df_inputdata["contentType"] == "predictors"]
predictor_data_info

Unnamed: 0,filename,contentType,start,end,path
7,Synthetic Site Predictors 2021-12-01.csv,predictors,2021-12-01 07:00:00+00:00,2021-12-02 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
8,Synthetic Site Predictors 2021-12-02.csv,predictors,2021-12-02 07:00:00+00:00,2021-12-03 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
9,Synthetic Site Predictors 2021-12-03.csv,predictors,2021-12-03 07:00:00+00:00,2021-12-04 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
10,Synthetic Site Predictors 2021-12-04.csv,predictors,2021-12-04 07:00:00+00:00,2021-12-05 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
11,Synthetic Site Predictors 2021-12-05.csv,predictors,2021-12-05 07:00:00+00:00,2021-12-06 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
12,Synthetic Site Predictors 2021-12-06.csv,predictors,2021-12-06 07:00:00+00:00,2021-12-07 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...
13,Synthetic Site Predictors 2021-12-07.csv,predictors,2021-12-07 07:00:00+00:00,2021-12-08 07:00:00+00:00,C:\Users\JKIM4\Documents\GitHub\intelligentcam...


In [8]:
def _concat_data_from_files(filepaths, needed_columns):
    """Read CSV files and concatenate them into one frame indexed by time.

    Each file must contain a "Timestamp" column. Only the text before the
    first space of each timestamp value is parsed; the result is converted to
    UTC and used as the index.

    A file that cannot be read (for example a missing file or a missing
    column) is skipped with a warning instead of aborting the whole load.

    :param filepaths: paths of the CSV files to read, in order
    :type filepaths: Iterable[str | Path]
    :param needed_columns: column names to keep in addition to "Timestamp";
        an empty list keeps every column of each file
    :type needed_columns: list[str]
    :return: concatenated data indexed by "Timestamp" (UTC); an empty
        DataFrame when no rows could be read
    :rtype: pd.DataFrame
    """
    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies the data on every pass.
    frames = []

    # A distinct loop name avoids shadowing the `filepaths` parameter.
    for filepath in filepaths:
        try:
            data = pd.read_csv(Path(filepath))
            if len(needed_columns) > 0:
                data = data[["Timestamp"] + list(needed_columns)]
        except Exception:
            # Deliberate best-effort: one unreadable file must not stop the load.
            logger.warning(f"Could not read {filepath}. skipping...")
        else:
            frames.append(data)
            logger.info(f"Read {filepath} and added to data ...")

    if not frames:
        return pd.DataFrame()

    full_data = pd.concat(frames)

    if not full_data.empty:
        # Drop anything after the first space so only the ISO part is parsed.
        # `n` must be passed by keyword: pandas 2.x rejects it positionally.
        full_data["Timestamp"] = full_data["Timestamp"].str.split(" ", n=1).str[0]
        full_data["Timestamp"] = pd.to_datetime(
            full_data["Timestamp"], format="%Y-%m-%dT%H:%M:%S%z", exact=False, utc=True
        )

        full_data = full_data.set_index("Timestamp")

    return full_data

In [9]:
# Replace the configured predictor list with an empty list. With no needed
# columns, `_concat_data_from_files` reads every column of each file.
configs["predictor_columns"] = []

In [10]:
# Select the predictor file records, then load and concatenate the files they point to.
predictor_data_info = df_inputdata.loc[df_inputdata["contentType"] == "predictors"]
data_full_p = _concat_data_from_files(
    filepaths=predictor_data_info["path"],
    needed_columns=configs["predictor_columns"],
)

data_full_p

Unnamed: 0_level_0,Synthetic Weather Station Dew Point Temperature,Synthetic Weather Station Diffuse Horizontal Irradiance,Synthetic Weather Station Direct Normal Irradiance,Synthetic Weather Station Dry Bulb Temperature,Synthetic Weather Station Global Horizontal Irradiance,Synthetic Weather Station Relative Humidity,Synthetic Weather Station Wind Speed
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-12-01 07:00:00+00:00,15.413733,-1.525850,-0.005199,56.408001,-1.810825,17.930000,10.457981
2021-12-01 07:01:00+00:00,15.183906,-1.496226,0.207964,56.174000,-1.841866,17.879999,11.265539
2021-12-01 07:02:00+00:00,14.891221,-1.441191,0.457520,55.886002,-1.873579,17.809999,12.777752
2021-12-01 07:03:00+00:00,14.836164,-1.371754,0.473117,55.796001,-1.860084,17.820000,12.694983
2021-12-01 07:04:00+00:00,14.782966,-1.317349,0.545903,55.723999,-1.843271,17.820000,11.632407
...,...,...,...,...,...,...,...
2021-12-08 06:55:00+00:00,11.210565,-1.487212,-0.254757,42.285198,-1.814939,24.680000,0.000000
2021-12-08 06:56:00+00:00,11.190062,-1.445182,-0.233960,41.997200,-1.852901,24.930000,0.000000
2021-12-08 06:57:00+00:00,11.223961,-1.396302,-0.145575,41.669601,-1.841248,25.290001,3.545647
2021-12-08 06:58:00+00:00,11.337669,-1.335073,0.171570,41.180000,-1.866031,25.920000,1.386941
