In [None]:
import json 
from pathlib import Path
import shutil

from wattile.entry_point import init_logging, create_input_dataframe, run_model
from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import prep_for_rnn, rolling_stats, pad_full_data, input_data_split
from wattile.entry_point import run_model

# Two levels above the current working directory.
# NOTE(review): presumably the repository root when the kernel is started from the
# notebook's own folder — confirm; every path below is built from this value.
PROJECT_DIRECTORY = Path().resolve().parent.parent

# read configs

In [None]:

"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
# Load the default configs. JSON is read as UTF-8 explicitly so the result does not
# depend on the platform's default text encoding.
with open(PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r", encoding="utf-8") as f:
    configs = json.load(f)

# Start every run from an empty experiment directory so stale outputs are never reused.
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
# parents=True: also create "notebooks" if it is missing instead of failing.
exp_dir.mkdir(parents=True)

# Point the configs at this notebook's experiment and data directories.
configs["exp_dir"] = str(exp_dir)
configs["data_dir"] = str(PROJECT_DIRECTORY / "data")

configs

# read data

In [None]:
# Which dataset the loading cell below builds. Values it handles:
#   "complete example data", "incomplete small example data1", "incomplete small example data2"
datatype = "incomplete small example data2"
# Only consulted for "complete example data": when True, irregular intervals, NaNs and
# random timestamps are injected into the data.
incompleteness = True
# Optional subset of columns to keep for "complete example data"; an empty list keeps all columns.
# col_test = ['Synthetic Weather Station Direct Normal Irradiance']
col_test = []

In [None]:
import pandas as pd
from pandas.tseries.frequencies import to_offset
import numpy as np
import plotly.graph_objects as go

In [None]:
# Build `data_test`, the frame every later cell resamples and plots.
if datatype == "complete example data":
    # Read the raw data from the dataset.
    # Check out the docs for an in-depth explanation of the necessary dataset structure.
    data = read_dataset_from_file(configs)

    data_temp = data.copy()

    if incompleteness:

        # adding irregular measurement intervals
        list_cols = [
            'Synthetic Weather Station Dew Point Temperature',
            'Synthetic Weather Station Diffuse Horizontal Irradiance',
            'Synthetic Weather Station Global Horizontal Irradiance',
        ]
        list_interval_mins = [3, 5, 7]
        list_timeshift_mins = [0, 3, 7]

        for col, timestep, timeshift in zip(list_cols, list_interval_mins, list_timeshift_mins):

            print("resampling and shifting column = {} with resampling timestep of {} and time-shift of {}".format(col, timestep, timeshift))

            # "min" is used instead of the "T" alias, which is deprecated in pandas >= 2.2
            df_temp = data_temp[col].resample("{}min".format(timestep)).mean()
            df_temp.index = df_temp.index + to_offset("{}min".format(timeshift))
            # assignment aligns on the index: timestamps missing from df_temp become NaN
            data_temp[col] = df_temp

        # adding NaNs in random places: a random 10% of rows x a random 10% of columns
        fraction = 0.1
        list_index_random = data_temp.sample(frac=fraction, replace=False, random_state=1).index
        list_column_random = pd.DataFrame(data_temp.columns).sample(frac=fraction, replace=False, random_state=2).iloc[:, 0].tolist()
        # one masked assignment instead of a Python loop over every (row, column) pair
        data_temp.loc[
            data_temp.index.isin(list_index_random),
            data_temp.columns.isin(list_column_random),
        ] = np.nan

        # adding irregular/random timestamps
        def random_dates(start, end, n):
            """Return ``n`` random timestamps (second resolution) drawn from [start, end)."""
            start_u = start.value // 10**9
            end_u = end.value // 10**9

            return pd.to_datetime(np.random.randint(start_u, end_u, n), unit='s')

        np.random.seed(seed=1)
        datetime_random = random_dates(
            data_temp.index[0], data_temp.index[-1], data_temp.shape[0]
        ).sort_values()
        data_temp.index = datetime_random

    # Bind data_test whether or not incompleteness was injected; previously it was
    # left undefined when `incompleteness` was False.
    if col_test:
        data_test = data_temp.loc[:, data_temp.columns.isin(col_test)]
    else:
        data_test = data_temp.copy()

elif datatype == "incomplete small example data1":

    # Hand-written toy series: var1 is sampled irregularly, var2 every five minutes.
    data_test = pd.DataFrame(
        {
            'var1': [np.nan, 1.5, 2.2, 0.9, 3.6, np.nan, 3.3, 2.3, np.nan, 1.3, 4.3, 4.1, np.nan],
            'var2': [1.0, np.nan, np.nan, np.nan, np.nan, 2.0, np.nan, np.nan, 3.0, np.nan, np.nan, np.nan, 4.0],
        },
        index=pd.to_datetime(
            [
                "01:00:00",
                "01:01:53",
                "01:03:17",
                "01:04:02",
                "01:04:59",
                "01:05:00",
                "01:06:22",
                "01:09:46",
                "01:10:00",
                "01:11:22",
                "01:13:44",
                "01:14:26",
                "01:15:00",
            ]
        ).rename('ts'),
    ).astype(float)

elif datatype == "incomplete small example data2":
    # Same location as "../../tests/fixtures/...", but anchored on PROJECT_DIRECTORY
    # like every other path in this notebook.
    data_test = pd.read_csv(
        PROJECT_DIRECTORY / "tests" / "fixtures" / "rolling_stats_input.csv",
        index_col=0,
    )
    for col in ['var1', 'var2']:
        # unparsable entries become NaN
        data_test[col] = pd.to_numeric(data_test[col], errors='coerce').astype(float)
    data_test.index = pd.to_datetime(data_test.index, exact=False, utc=True)
    data_test = data_test[['var1', 'var2']]

else:
    # fail here with a clear message rather than with a NameError on `data_test` below
    raise ValueError("unknown datatype: {!r}".format(datatype))

data_test

# setting resampling configuration

The defaults are:
- right-labeled window resampling
- right-closed window
- backward-looking rolling window (this happens in the `rolling_stats` method)

In [None]:
# Resampling settings used by the test code below.
#   "interval": a setting intended to live in the configs
#   "label_on": hard-coded for now
configs["resample"] = {
    "interval": "1min",
    "label_on": "right",
}

# Empty figure that the following cells add traces to.
fig = go.Figure()

In [None]:
# Raw observations: one marker trace per column of `data_test`.
# `list_color` is also read by `resample_data` so raw and resampled points of the
# same column share a colour.
list_color = ['rgb(241,163,64)', 'rgb(153,142,195)']
for i_clr, col in enumerate(data_test.columns):

    fig.add_trace(go.Scatter(
        x=data_test.index.values,
        y=data_test[col].values,
        mode='markers',
        marker=dict(
            size=15,
            # wrap around the palette so frames with more than two columns
            # no longer raise IndexError
            color=list_color[i_clr % len(list_color)],
        ),
        name="raw: {}".format(col),
    ))

# test code

In [None]:
def resample_data(data, configs, method, symbol):
    """Resample ``data`` with one strategy and add the result to the shared figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to resample. It is resampled with ``asfreq`` / ``resample``, so it is
        expected to carry a datetime-like index.
    configs : dict
        Must provide ``configs["resample"]["interval"]``, the target frequency.
    method : str
        One of ``"asfreq_none"``, ``"asfreq_bfill"``, ``"asfreq_ffill"``,
        ``"resample_left-labeled_first"`` or ``"resample_right-labeled_last"``.
    symbol : str
        Plotly marker symbol used for the traces of this strategy.

    Returns
    -------
    pandas.DataFrame
        The resampled frame.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.

    Notes
    -----
    Adds one trace per column to the notebook-level ``fig`` and reads the
    notebook-level ``list_color`` palette.
    """
    interval = configs["resample"]["interval"]

    # one entry per supported strategy; each takes the frame and returns the resampled frame
    resamplers = {
        "asfreq_none": lambda df: df.asfreq(freq=interval, method=None),
        "asfreq_bfill": lambda df: df.asfreq(freq=interval, method="bfill"),
        "asfreq_ffill": lambda df: df.asfreq(freq=interval, method="ffill"),
        "resample_left-labeled_first": lambda df: df.resample(
            rule=interval, label="left", closed="left"
        ).first(),
        "resample_right-labeled_last": lambda df: df.resample(
            rule=interval, label="right", closed="right"
        ).last(),
    }

    if method not in resamplers:
        # previously an unknown method surfaced later as an UnboundLocalError
        raise ValueError(
            "unknown resampling method {!r}; expected one of {}".format(method, sorted(resamplers))
        )

    # resample the frame that was passed in (the argument used to be ignored in
    # favour of the global `data_test`)
    data_resampled = resamplers[method](data)

    for i_col, col in enumerate(data_resampled.columns):
        fig.add_trace(go.Scatter(
            x=data_resampled.index.values,
            y=data_resampled[col].values,
            mode="markers",
            marker=dict(
                symbol=symbol,
                # wrap around the palette so extra columns do not raise IndexError
                color=list_color[i_col % len(list_color)],
                size=14,
                line=dict(
                    width=2,
                ),
            ),
            name="resample ({}): {}".format(method, col),
        ))

    return data_resampled

# trying different resampling methods with test code

In [None]:
# Overlay every resampling strategy on the figure, each with its own marker symbol.
# `data_resampled` is rebound on every pass, so only the last strategy's result remains.
for method_name, marker_symbol in [
    ("asfreq_none", "x-dot"),
    ("asfreq_bfill", "cross-dot"),
    ("asfreq_ffill", "triangle-up-dot"),
    ("resample_left-labeled_first", "star-dot"),
    ("resample_right-labeled_last", "hash-dot"),
]:
    data_resampled = resample_data(data_test, configs, method_name, marker_symbol)

In [None]:
# Horizontal legend centred below the plot area.
legend_style = dict(
    orientation="h",
    yanchor="top",
    y=-0.15,
    xanchor="center",
    x=0.5,
    font=dict(size=10, color="black"),
)

# Fixed canvas; the bottom margin leaves room for the legend.
fig.update_layout(
    width=900,
    height=600,
    margin=dict(l=0, r=0, t=0, b=150),
    legend=legend_style,
)

# NOTE(review): on a date axis plotly reads dtick in milliseconds, so 1000*60 should
# give one gridline per minute — confirm against the rendered figure.
fig.update_xaxes(dtick=1000 * 60, showgrid=True, gridwidth=2)

fig.update_yaxes(range=[-0.1, 5], showgrid=False)

# Save a standalone HTML copy next to the notebook, then render inline.
fig.write_html("./testing_data_resampling.html")

fig.show()

# cleaned version for wattile implementation

- the `rolling_stats` method reads a `True`/`False` setting (`configs["feat_stats"]["active"]`) that decides whether rolling statistics are computed
- the resampling tested in this notebook is added in the `else` branch below
- so this work is basically an update of the `rolling_stats` method

In [None]:
def _resample_data(data, configs):

    # reading configuration parameters.
    # resample_label_on are hard coded for now. default is right labeled and right-closed window.
    resample_interval = configs["resample_interval"]
    resample_label_on = "right"  # left, right

    # resample data
    if resample_label_on == "left":
        data = data.resample(
            rule=resample_interval, label=resample_label_on, closed="left"
        ).first()
    elif resample_label_on == "right":
        data = data.resample(
            rule=resample_interval, label=resample_label_on, closed="right"
        ).last()

    return data

In [None]:
def rolling_stats(data, configs):
    """Resample ``data`` and, if enabled, replace predictors by rolling-window statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Predictors plus the target column named by ``configs["target_var"]``.
    configs : dict
        Keys read here: ``"resample_interval"``, ``"target_var"``,
        ``"feat_stats"["active"]`` and ``"feat_stats"["window_width"]``.

    Returns
    -------
    pandas.DataFrame
        If ``configs["feat_stats"]["active"]`` is truthy: one ``<col>_min``,
        ``<col>_max`` and ``<col>_mean`` column per predictor, plus the resampled
        target. Otherwise: ``data`` resampled by ``_resample_data``.

    Notes
    -----
    Resampling is right-labeled and right-closed, and the rolling window is
    right-closed and backward-looking; these are hard-coded for now.
    """
    # reading configuration parameters.
    resample_interval = configs["resample_interval"]
    resample_label_on = "right"  # left, right
    window_width = configs["feat_stats"]["window_width"]
    window_closing = "right"  # left, right
    window_position = "backward"  # forward, center, backward

    if configs["feat_stats"]["active"]:

        # separate predictors and target
        target_var = configs["target_var"]
        target = data[target_var]
        X_data = data.drop(target_var, axis=1)

        # resampling for each statistic separately
        data_resampler = X_data.resample(
            rule=resample_interval, closed=window_closing, label=resample_label_on
        )
        data_resample_min = data_resampler.min().add_suffix("_min")
        data_resample_max = data_resampler.max().add_suffix("_max")
        data_resample_sum = data_resampler.sum().add_suffix("_sum")
        data_resample_count = data_resampler.count().add_suffix("_count")

        # setting configuration settings depending on window_position and window_closing
        if window_position == "backward":
            arg_center = False
        elif window_position == "center":
            arg_center = True
        elif window_position == "forward":
            # a forward window is computed as a backward window over reversed data,
            # which also flips the closed side
            arg_center = False
            data_resample_min = data_resample_min[::-1]
            data_resample_max = data_resample_max[::-1]
            data_resample_sum = data_resample_sum[::-1]
            data_resample_count = data_resample_count[::-1]
            if window_closing == "left":
                window_closing = "right"
            elif window_closing == "right":
                window_closing = "left"

        # identical window settings for every statistic
        rolling_kwargs = dict(
            window=window_width, min_periods=1, center=arg_center, closed=window_closing
        )

        mins = data_resample_min.rolling(**rolling_kwargs).min()
        maxs = data_resample_max.rolling(**rolling_kwargs).max()
        sums = data_resample_sum.rolling(**rolling_kwargs).sum()
        # per-bin counts have to be summed to get the number of samples in the window
        counts = data_resample_count.rolling(**rolling_kwargs).sum()

        # mean = windowed sum / windowed count. The invalid-value warning (0 / 0 for
        # empty windows) is silenced only around this division, so NumPy's global
        # error state is left as it was.
        with np.errstate(invalid="ignore"):
            mean_values = sums.values / counts.values
        means = pd.DataFrame(
            mean_values,
            index=sums.index,
            # swap only the trailing "_sum" suffix; a "_sum" elsewhere in a column
            # name is left alone
            columns=[name[: -len("_sum")] + "_mean" for name in sums.columns],
        )

        # combining min, max and mean stats
        data = pd.concat([mins, maxs, means], axis=1)

        # restoring chronological order after the reversed (forward) computation
        if window_position == "forward":
            data = data[::-1]

        # adding resampled target back to the dataframe
        target = _resample_data(target, configs)
        data[target_var] = target

    else:

        # resample data
        data = _resample_data(data, configs)

    return data

In [None]:
# Read the dataset described by `configs` again (this rebinds `data`) and display it.
data = read_dataset_from_file(configs)
data

In [None]:
# Pass the configs and the raw data to wattile's `prep_for_rnn`; the cell displays its return value.
# NOTE(review): `prep_for_rnn` is imported from wattile.buildings_processing, so it presumably
# does not call the notebook-local `rolling_stats` defined above — confirm.
prep_for_rnn(configs, data)