In [None]:
import json 
from pathlib import Path
import shutil

from wattile.entry_point import init_logging, create_input_dataframe, run_model
from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import correct_predictor_columns, correct_timestamps, rolling_stats, input_data_split
from wattile.time_processing import add_processed_time_columns
from wattile.entry_point import run_model


# Repository root, taken as two levels above the kernel's current working directory.
# The cells below build the tests/, notebooks/ and data/ paths from it, so the notebook
# must be started from a folder that sits two levels below the project root.
PROJECT_DIRECTORY = Path().resolve().parent.parent

# read configs

In [None]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
##################################################################################
# choose the configs file to use as an input
##################################################################################
# # main configs file
# with open(PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r") as f:
#     configs = json.load(f)
##################################################################################
# code testing configs file
# JSON is read as UTF-8 explicitly so the result does not depend on the OS locale.
with open(
    PROJECT_DIRECTORY / "tests" / "fixtures" / "test_configs.json",
    "r",
    encoding="utf-8",
) as f:
    configs = json.load(f)
##################################################################################

# Start every run from an empty experiment directory.
# WARNING: this deletes everything previously written to notebooks/exp_dir.
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
# parents=True: do not fail if the notebooks/ folder itself is missing.
exp_dir.mkdir(parents=True)

# Point the loaded configs at this checkout's experiment and data directories.
configs["exp_dir"] = str(exp_dir)
configs["data_dir"] = str(PROJECT_DIRECTORY / "data")

configs

# read input data

In [None]:
# Which dataset the data-building cell below prepares. The values that cell checks for are:
#   "complete example data", "incomplete small example data1", "incomplete small example data2"
datatype = "complete example data"
# Only read when datatype is "complete example data": when True, the data-building cell
# degrades the data (irregular intervals, injected NaNs, random timestamps).
incompleteness = True
# Columns to keep in `data_test`; an empty list keeps every column.
# col_test = ['Synthetic Weather Station Direct Normal Irradiance']
col_test = []

In [None]:
import pandas as pd
from pandas.tseries.frequencies import to_offset
import numpy as np
import plotly.graph_objects as go

In [None]:
# Build `data_test`, the frame displayed at the end of this cell, from the dataset
# selected by `datatype`.
# NOTE: only the "complete example data" branch defines `data`, which is what the
# pre-processing cells further down operate on.
if datatype == "complete example data":
    # Firstly, we read the raw data from the dataset.
    # Check out the docs for an in-depth explanation of the necessary dataset structure.
    data = read_dataset_from_file(configs)

    # Work on a copy so the raw `data` stays untouched for the pre-processing cells.
    data_temp = data.copy()

    if incompleteness:
        # adding irregular measurement intervals:
        # each listed column is resampled to its own time step and shifted in time;
        # assigning it back aligns on the original index, so rows without a matching
        # timestamp end up as NaN.
        list_cols = ['Synthetic Weather Station Dew Point Temperature', 'Synthetic Weather Station Diffuse Horizontal Irradiance', 'Synthetic Weather Station Global Horizontal Irradiance']
        list_interval_mins = [3, 5, 7]
        list_timeshift_mins = [0, 3, 7]

        for col, timestep, timeshift in zip(list_cols, list_interval_mins, list_timeshift_mins):

            print("resampling and shifting column = {} with resampling timestep of {} and time-shift of {}".format(col, timestep, timeshift))

            # "min" is the supported minute alias; the old "T" alias is deprecated in pandas >= 2.2
            resampled = data_temp[col].resample("{}min".format(timestep)).mean()
            resampled.index = resampled.index + to_offset("{}min".format(timeshift))
            data_temp[col] = resampled

        # adding NaNs in random places:
        # blank every (row, column) combination of a random 10% of rows and 10% of columns
        fraction = 0.1
        list_index_random = data_temp.sample(frac=fraction, replace=False, random_state=1).index.tolist()
        list_column_random = pd.DataFrame(data_temp.columns).sample(frac=fraction, replace=False, random_state=2).iloc[:, 0].tolist()

        # one masked assignment instead of a Python loop over every (row, column) pair;
        # np.nan is used because the np.NAN alias no longer exists in NumPy 2.0
        data_temp.loc[
            data_temp.index.isin(list_index_random),
            data_temp.columns.isin(list_column_random),
        ] = np.nan

        # adding irregular/random timestamps
        def random_dates(start, end, n):
            """Return `n` random timestamps (1-second resolution) drawn from [start, end).

            NOTE(review): the returned index is timezone-naive even when `start`/`end`
            are timezone-aware — confirm that is acceptable for the intended test.
            """
            start_u = start.value // 10**9  # nanoseconds since epoch -> seconds
            end_u = end.value // 10**9

            return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

        np.random.seed(seed=1)  # fixed seed so the random timestamps are reproducible
        datetime_random = random_dates(data_temp.index[0], data_temp.index[-1], data_temp.shape[0])
        data_temp.index = datetime_random.sort_values()

    # `data_test` is defined whether or not the data was degraded above
    # (it used to be left undefined when `incompleteness` was False).
    if col_test:
        data_test = data_temp.loc[:, data_temp.columns.isin(col_test)]
    else:
        data_test = data_temp.copy()

elif datatype == "incomplete small example data1":

    # Tiny hand-written example: var1 is sampled at irregular times, var2 every 5 minutes.
    small_timestamps = [
        "01:00:00",
        "01:01:53",
        "01:03:17",
        "01:04:02",
        "01:04:59",
        "01:05:00",
        "01:06:22",
        "01:09:46",
        "01:10:00",
        "01:11:22",
        "01:13:44",
        "01:14:26",
        "01:15:00",
    ]
    data_test = pd.DataFrame(
        {
            "var1": [np.nan, 1.5, 2.2, 0.9, 3.6, np.nan, 3.3, 2.3, np.nan, 1.3, 4.3, 4.1, np.nan],
            "var2": [1.0, np.nan, np.nan, np.nan, np.nan, 2.0, np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 4.0],
        },
        index=pd.Index(pd.to_datetime(small_timestamps), name="ts"),
    ).astype(float)

elif datatype == "incomplete small example data2":
    # Same fixture the test-suite uses; located via PROJECT_DIRECTORY like the configs file.
    data_test = pd.read_csv(
        PROJECT_DIRECTORY / "tests" / "fixtures" / "rolling_stats_input.csv",
        index_col=0,
    )
    # non-numeric entries become NaN
    for col in ["var1", "var2"]:
        data_test[col] = pd.to_numeric(data_test[col], errors="coerce").astype(float)
    data_test.index = pd.to_datetime(data_test.index, exact=False, utc=True)
    data_test = data_test[["var1", "var2"]]

else:
    # Fail with a clear message instead of a NameError on `data_test` below.
    raise ValueError(
        "Unknown datatype {!r}; expected 'complete example data', "
        "'incomplete small example data1' or 'incomplete small example data2'".format(datatype)
    )

data_test

# pre-process data including feature extraction

In [None]:
# Assert we have the correct predictor columns and order them.
# NOTE: this cell and the ones below rebind `data`, so re-run from the data-loading
# cell to start again from the raw dataset.
data = correct_predictor_columns(configs, data)

In [None]:
# Sort the data and trim it to the time period specified in the configs.
data = correct_timestamps(configs, data)

In [None]:
# Add time-based features (computed by wattile.time_processing.add_processed_time_columns
# from `data` and `configs`).
data = add_processed_time_columns(data, configs)

In [None]:
# Apply wattile.buildings_processing.rolling_stats to the data.
# Presumably this adds rolling-window statistics configured through `configs` — confirm in the docs.
data = rolling_stats(data, configs)

In [None]:
data

In [None]:
# Split the processed data into a training set and a validation set.
# How the split is made is decided inside input_data_split (presumably from `configs`).
train_df, val_df = input_data_split(data, configs)

In [None]:
data.shape

In [None]:
train_df.shape

In [None]:
val_df.shape

# test 2D-3D data conversion for S2S

### original version

In [None]:
"""
def generate_windows(data):
    
    # an important procedure to convert 2-dimensional data into 3-dimensional for modeling
    
    x_train = []
    y_usage_train = []
    x_test = []
    y_usage_test = []

    # for training data
    idxs = np.random.choice(
        train_source.shape[0] - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),
        train_source.shape[0] - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),
        replace=False,
    )

    for idx in idxs:
        x_train.append(
            train_source[idx : idx + WINDOW_SOURCE_SIZE].reshape(
                (1, WINDOW_SOURCE_SIZE, train_source.shape[1])
            )
        )
        y_usage_train.append(
            train_source[
                idx
                + WINDOW_SOURCE_SIZE : idx
                + WINDOW_SOURCE_SIZE
                + WINDOW_TARGET_SIZE,
                -1,
            ].reshape((1, WINDOW_TARGET_SIZE, 1))
        )

    x_train = np.concatenate(x_train, axis=0)  # make them arrays and not lists
    y_usage_train = np.concatenate(y_usage_train, axis=0)

    # for testing data
    idxs = np.arange(
        0,
        len(test_source) - (WINDOW_SOURCE_SIZE + WINDOW_TARGET_SIZE),
        WINDOW_TARGET_SIZE,
    )

    for idx in idxs:
        x_test.append(
            test_source[idx : idx + WINDOW_SOURCE_SIZE].reshape(
                (1, WINDOW_SOURCE_SIZE, test_source.shape[1])
            )
        )
        y_usage_test.append(
            test_source[
                idx
                + WINDOW_SOURCE_SIZE : idx
                + WINDOW_SOURCE_SIZE
                + WINDOW_TARGET_SIZE,
                -1,
            ].reshape((1, WINDOW_TARGET_SIZE, 1))
        )

    x_test = np.concatenate(x_test, axis=0)  # make them arrays and not lists
    y_usage_test = np.concatenate(y_usage_test, axis=0)

    return x_train, y_usage_train, x_test, y_usage_test
"""

### updated version

In [None]:
# Number of consecutive rows that make up one predictor (source) window.
window_source_size = 12
# Number of rows, taken right after the source window, that make up one target window.
window_target_size = 2

In [None]:
# Convert the train/validation frames to plain NumPy arrays for positional slicing.
# np.asarray (instead of `.values`) keeps this cell idempotent: re-running it on the
# already-converted arrays is a no-op rather than an AttributeError.
train_df = np.asarray(train_df)
val_df = np.asarray(val_df)

In [None]:
# Containers for the per-window arrays; the loops below fill them and they are
# concatenated into 3-D arrays afterwards.
train_predictor = []
train_target = []
valid_predictor = []
valid_target = []

# for training data: every start row whose source + target window fits inside
# `train_df`, in random order. The "+ 1" includes the last such start row, which the
# previous bound left out.
idxs = np.random.permutation(
    train_df.shape[0] - (window_source_size + window_target_size) + 1
)

In [None]:
idxs

In [None]:
# Cut one (predictor, target) window pair out of `train_df` for every start row in `idxs`.
n_train_columns = train_df.shape[1]
for idx in idxs:
    source_end = idx + window_source_size
    target_end = source_end + window_target_size

    # predictor: `window_source_size` consecutive rows, all columns
    predictor_window = train_df[idx:source_end]
    train_predictor.append(
        predictor_window.reshape((1, window_source_size, n_train_columns))
    )

    # target: the following `window_target_size` rows of the last column only
    target_window = train_df[source_end:target_end, -1]
    train_target.append(target_window.reshape((1, window_target_size, 1)))

In [None]:
type(train_predictor)

In [None]:
type(train_target)

In [None]:
# Join the per-window pieces along axis 0, so each name now holds one 3-D array.
# NOTE: this rebinds the list names to arrays, so the fill loop above cannot be re-run
# (arrays have no `.append`) until the lists are reset to empty again.
train_predictor = np.concatenate(train_predictor, axis=0)  # make them arrays and not lists
train_target = np.concatenate(train_target, axis=0)

In [None]:
type(train_predictor)

In [None]:
type(train_target)

In [None]:
# for validation data
# Start rows advance by `window_target_size`, so consecutive target windows sit back to
# back without overlapping. The "+ 1" makes the exclusive stop include the last start
# row whose source + target window still fits inside `val_df`.
idxs = np.arange(
    0,
    len(val_df) - (window_source_size + window_target_size) + 1,
    window_target_size,
)

In [None]:
idxs

In [None]:
# Cut one (predictor, target) window pair out of `val_df` for every start row in `idxs`.
n_valid_columns = val_df.shape[1]
for idx in idxs:
    source_end = idx + window_source_size
    target_end = source_end + window_target_size

    # predictor: `window_source_size` consecutive rows, all columns
    predictor_window = val_df[idx:source_end]
    valid_predictor.append(
        predictor_window.reshape((1, window_source_size, n_valid_columns))
    )

    # target: the following `window_target_size` rows of the last column only
    target_window = val_df[source_end:target_end, -1]
    valid_target.append(target_window.reshape((1, window_target_size, 1)))

In [None]:
type(valid_predictor)

In [None]:
type(valid_target)

In [None]:
# Join the per-window pieces along axis 0, so each name now holds one 3-D array.
# NOTE: this rebinds the list names to arrays, so the fill loop above cannot be re-run
# (arrays have no `.append`) until the lists are reset to empty again.
valid_predictor = np.concatenate(valid_predictor, axis=0)  # make them arrays and not lists
valid_target = np.concatenate(valid_target, axis=0)

In [None]:
type(valid_predictor)

In [None]:
type(valid_target)

# cleaned version

In [None]:
def _make_s2s_windows(values, window_starts, window_source_size, window_target_size):
    """Slice a 2-D array into 3-D predictor/target windows, one pair per start row."""
    window_starts = np.asarray(window_starts, dtype=int)

    # row numbers of every window, built by broadcasting: (n_windows, window_length)
    source_rows = window_starts[:, None] + np.arange(window_source_size)
    target_rows = window_starts[:, None] + window_source_size + np.arange(window_target_size)

    predictor = values[source_rows]  # (n_windows, window_source_size, n_columns)
    target = values[target_rows, -1][:, :, None]  # (n_windows, window_target_size, 1)
    return predictor, target


def pad_full_data_s2s(train_df, val_df, configs, window_source_size=12, window_target_size=2):
    """Convert 2-D train/validation data into 3-D rolling windows.

    Each sample pairs `window_source_size` consecutive rows (all columns) as the
    predictor with the next `window_target_size` rows of the last column as the target.
    Training windows start at every row where a full window fits and are returned in
    random order (drawn from NumPy's global random state, so `np.random.seed` makes the
    order reproducible). Validation windows are kept in order and start every
    `window_target_size` rows.

    Parameters
    ----------
    train_df, val_df : pandas.DataFrame or 2-D array-like
        Rows are consecutive samples; the last column is used as the target.
    configs : dict
        Currently unused; kept so existing calls keep working.
        TODO: read the window sizes from configs once the parameter names are settled.
    window_source_size : int, default 12
        Number of rows in each predictor window.
    window_target_size : int, default 2
        Number of rows in each target window.

    Returns
    -------
    train_predictor : ndarray, shape (n_train_windows, window_source_size, n_columns)
    train_target : ndarray, shape (n_train_windows, window_target_size, 1)
    valid_predictor : ndarray, shape (n_valid_windows, window_source_size, n_columns)
    valid_target : ndarray, shape (n_valid_windows, window_target_size, 1)

    Raises
    ------
    ValueError
        If a window size is smaller than 1, an input is not 2-D, or an input has fewer
        rows than `window_source_size + window_target_size`.
    """
    if window_source_size < 1 or window_target_size < 1:
        raise ValueError("window_source_size and window_target_size must both be >= 1")

    # np.asarray accepts DataFrames as well as arrays that were already converted
    train_values = np.asarray(train_df)
    val_values = np.asarray(val_df)
    if train_values.ndim != 2 or val_values.ndim != 2:
        raise ValueError("train_df and val_df must be 2-dimensional")

    window_size = window_source_size + window_target_size
    # "+ 1": a window starting at row (n_rows - window_size) still fits completely;
    # the earlier bound dropped that last window
    n_train_starts = train_values.shape[0] - window_size + 1
    n_valid_starts = val_values.shape[0] - window_size + 1
    if n_train_starts < 1 or n_valid_starts < 1:
        raise ValueError(
            "train_df and val_df each need at least {} rows "
            "(window_source_size + window_target_size); got {} and {}".format(
                window_size, train_values.shape[0], val_values.shape[0]
            )
        )

    # create rolling window data for the training set, in shuffled order
    train_starts = np.random.permutation(n_train_starts)
    train_predictor, train_target = _make_s2s_windows(
        train_values, train_starts, window_source_size, window_target_size
    )

    # create rolling window data for the validation set: ordered, and stepping by the
    # target size so consecutive target windows do not overlap
    valid_starts = np.arange(0, n_valid_starts, window_target_size)
    valid_predictor, valid_target = _make_s2s_windows(
        val_values, valid_starts, window_source_size, window_target_size
    )

    return train_predictor, train_target, valid_predictor, valid_target