In [None]:
import numpy as np
import pandas as pd
import random
import time
import math
import os
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init
from dateutil import parser
from pathlib import Path
import json 
import shutil
import logging
# Logger whose name is the current process ID rendered as a string.
logger = logging.getLogger(f"{os.getpid()}")

from wattile.data_reading import read_dataset_from_file
from wattile.buildings_processing import correct_predictor_columns, correct_timestamps, resample_or_rolling_stats, timelag_predictors, timelag_predictors_target, roll_predictors_target
from wattile.time_processing import add_processed_time_columns
# Directory two levels above the current working directory; the cells below
# build every project path (configs, data, notebooks) from it.
PROJECT_DIRECTORY = Path.cwd().resolve().parent.parent

# reading configs

In [None]:
"""
For this example, we will be using the default configs.
Check out the docs for an explanation of each config.
"""
##################################################################################
# choose the configs file to use as an input
##################################################################################
# main configs file
with open(PROJECT_DIRECTORY / "wattile" / "configs" / "configs.json", "r") as f:
    configs = json.load(f)
##################################################################################
# code testing configs file
# with open(PROJECT_DIRECTORY / "tests" / "fixtures" / "test_configs.json", "r") as f:
#     configs = json.load(f)
##################################################################################

# Start every run from an empty experiment directory.
exp_dir = PROJECT_DIRECTORY / "notebooks" / "exp_dir"
if exp_dir.exists():
    shutil.rmtree(exp_dir)
# parents=True: also create "notebooks" when it is missing instead of raising
# FileNotFoundError.
exp_dir.mkdir(parents=True)

# exp_dir is already built from PROJECT_DIRECTORY, so it is stored as-is.
configs["data_input"]["exp_dir"] = str(exp_dir)
##################################################################################
configs["data_input"]["data_dir"] = str(PROJECT_DIRECTORY / "data" / "Synthetic Site")
##################################################################################
# configs["data_input"]["data_dir"] = str(PROJECT_DIRECTORY / ".." / "intelligentcampus-feature-eng" / "data" / "Cafe")
# configs["data_input"]["data_config"] = "Cafe Config.json"
# configs["data_input"]["predictor_columns"] = [
#     "SRRL BMS Atmospheric Electric Field",
#     "SRRL BMS Barometric Pressure",
#     "SRRL BMS Dew Point Temperature",
#     "SRRL BMS Diffuse Horizontal Irradiance",
#     "SRRL BMS Direct Normal Irradiance",
#     "SRRL BMS Dry Bulb Temperature",
#     "SRRL BMS DWIR",
# #     "SRRL BMS Global 40Â° South Irradiance",
# #     "SRRL BMS Global 90Â° South Irradiance",
# #     "SRRL BMS Global Horizontal Irradiance",
# #     "SRRL BMS Global Illuminance",
#     "SRRL BMS Global Normal Irradiance",
# #     "SRRL BMS Global UV Index",
#     "SRRL BMS Opaque Cloud Cover",
# #     "SRRL BMS Peak Wind Speed at 19'",
# #     "SRRL BMS Peak Wind Speed at 6'",
# #     "SRRL BMS Rainfall",
#     "SRRL BMS Relative Humidity",
# #     "SRRL BMS Snow Depth",
#     "SRRL BMS Total Cloud Cover",
#     "SRRL BMS UWIR",
#     "SRRL BMS UWSW",
#     "SRRL BMS Wet Bulb Temperature",
# #     "SRRL BMS Wind Direction at 19'",
# #     "SRRL BMS Wind Direction at 6'",
# #     "SRRL BMS Wind Speed at 19'",
# #     "SRRL BMS Wind Speed at 6'",
# ]
# configs["data_input"]["target_var"] = "Cafe Whole Building Real Power Total"
##################################################################################

# Display the resulting configuration.
configs

# reading data

In [None]:
# Store the configured target variable as a one-element list under
# "target_feat_name", then read the dataset with read_dataset_from_file.
configs.update(target_feat_name=[configs["data_input"]["target_var"]])
data = read_dataset_from_file(configs)
data

# plot setting

In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.colors import n_colors
from plotly.validators.scatter.marker import SymbolValidator
import plotly.io as pio

# Directory appended to PATH before importing pyvips (presumably so the libvips
# binaries can be found — confirm). It can be overridden through the VIPS_BIN_DIR
# environment variable instead of editing this user-specific default.
VIPS_BIN_DIR = os.environ.get(
    "VIPS_BIN_DIR",
    r"C:/Users/JKIM4/Downloads/vips-dev-w64-all-8.11.0/vips-dev-8.11/bin",
)
# Only Windows uses the ';'-separated PATH entry this setup relies on. Elsewhere a
# lower-case 'path' variable normally does not exist, so the unconditional
# os.environ['path'] += ... raised KeyError.
if os.name == "nt":
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + VIPS_BIN_DIR
import pyvips
import copy
import random

# Seed Python's `random` module so the notebook's calls to it are repeatable.
random.seed(1)
# Directory the figures are exported to, relative to the working directory.
path = "../../tests/fixtures"
# Colours the plotting cells sample from.
colorscale = ["rgb(238,221,136)","rgb(68,187,153)","rgb(153,221,255)","rgb(238,136,102)","rgb(119,170,221)","rgb(170,170,0)"]

# data processing 1

In [None]:
# Add time-based features
# NOTE: `data` is rebound to the returned frame, so re-running this cell passes the
# already-processed frame back into add_processed_time_columns.
data = add_processed_time_columns(data, configs)

data

### feat_time parameters

In [None]:
# For each time-feature family (HOD, DOW, MOY), plot a hand-picked subset of the
# feature columns over a fixed date window and export the figure to `path` as
# SVG and PNG.

# One dash pattern per trace; a family may list at most len(list_line) columns.
list_line = ["solid", "dot", "dash", "longdash", "dashdot", "longdashdot"]

# One colour per dash pattern, sampled at evenly spaced positions of `colorscale`.
# The count is called `num_styles` rather than `n_colors` so the `n_colors`
# function imported from plotly.colors is not shadowed by an int.
num_styles = len(list_line)
list_colors = px.colors.sample_colorscale(
    colorscale, [pos / (num_styles - 1) for pos in range(num_styles)]
)
# NOTE(review): the previous random.shuffle(copy.deepcopy(list_colors)) shuffled a
# throw-away copy and never changed list_colors, so it was removed. Colours stay
# in colorscale order; if a shuffled order was intended, shuffle list_colors itself.

# Start and end timestamps of the window plotted for each feature family.
dict_daterange = {
    "HOD": ["2019-12-02 00:00:00+00:00", "2019-12-03 00:00:00+00:00"],
    "DOW": ["2019-12-02 00:00:00+00:00", "2019-12-10 00:00:00+00:00"],
    "MOY": ["2019-01-01 00:00:00+00:00", "2020-01-01 00:00:00+00:00"],
}

# Columns drawn for each feature family.
dict_feat_include = {
    "HOD": [
        "sin_HOD",
        "cos_HOD",
        "HOD_binary_reg_10",
        "HOD_binary_reg_20",
        "HOD_binary_fuzzy_13",
        "HOD_binary_fuzzy_23",
    ],
    "DOW": [
        "DOW_binary_reg_0",
        "DOW_binary_reg_2",
        "DOW_binary_reg_4",
        "DOW_binary_fuzzy_1",
        "DOW_binary_fuzzy_3",
        "DOW_binary_fuzzy_5",
    ],
    "MOY": [
        "sin_MOY",
        "cos_MOY",
    ],
}

# X-axis tick spacing. Plotly date axes take milliseconds, or "M<n>" for months;
# "D1" is a log-axis value, so one day is now given explicitly in milliseconds.
dict_dtick = {
    "HOD": 1000 * 60 * 60 * 3,    # 3 hours
    "DOW": 1000 * 60 * 60 * 24,   # 1 day
    "MOY": "M3",                  # 3 months
}

for feat, (range_start, range_end) in dict_daterange.items():
    fig = go.Figure()

    # Rows by timestamp window, columns by explicit name. The earlier
    # "name contains feat" mask is implied by the column list.
    df_temp = data.loc[range_start:range_end, dict_feat_include[feat]]

    # DOW features are averaged into 60-minute bins before plotting.
    if feat == "DOW":
        df_temp = df_temp.resample("60min").mean()

    # Indexing (rather than zip) keeps the IndexError if a family lists more
    # columns than there are styles.
    for idx, col in enumerate(df_temp.columns):
        fig.add_trace(go.Scatter(
            mode="lines",
            x=df_temp.index.values,
            y=df_temp[col].values,
            name=col,
            line=dict(
                color=list_colors[idx],
                dash=list_line[idx],
            ),
        ))

    fig.update_layout(
        width=800,
        height=250,
        margin=dict(l=0, r=0, t=30, b=0),
        plot_bgcolor="rgb(255,255,255)",
        # horizontal legend centred above the plot area
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1,
            xanchor="center",
            x=0.5,
        ),
    )

    fig.update_xaxes(
        dtick=dict_dtick[feat],
        showgrid=True,
        gridwidth=0.1,
        gridcolor="grey",
    )

    # Export as SVG, then rasterise the SVG to PNG with pyvips.
    # https://stackoverflow.com/questions/51450134/how-to-convert-svg-to-png-or-jpeg-in-python
    filename = "example_feat_time_{}".format(feat)
    svg_file = path + "/" + filename + ".svg"
    pio.write_image(fig, svg_file)
    image = pyvips.Image.thumbnail(svg_file, 3000)
    image.write_to_file(path + "/" + filename + ".png")

    fig.show()

### resample parameters

In [None]:
# Settings for the resampling example: fixed target variable, 3-minute bins
# closed and labelled on the right, and feat_stats switched off.
configs["data_input"].update(target_var="Synthetic Site Electricity Main Total Power")

configs["data_processing"]["resample"].update(
    bin_interval="3min",
    bin_closed="right",
    bin_label="right",
)

configs["data_processing"]["feat_stats"].update(active=False)

In [None]:
# Load the fixture CSV with its first column as the index, convert that index
# to datetimes, and keep every column except "var2".
fixture_csv = "../../tests/fixtures/rolling_stats_input_w_target.csv"
df_test = pd.read_csv(fixture_csv, index_col=0)
df_test.index = pd.to_datetime(df_test.index)
df_test = df_test.drop(columns="var2", errors="ignore")
df_test

In [None]:
# Run resample_or_rolling_stats with the current configs, then append the binning
# settings to every column name so each series is labelled with them.
df_resample = resample_or_rolling_stats(df_test, configs)
resample_cfg = configs["data_processing"]["resample"]
bin_interval = resample_cfg["bin_interval"]
bin_closed = resample_cfg["bin_closed"]
bin_label = resample_cfg["bin_label"]
df_resample = df_resample.add_suffix(
    f"_resampled | {bin_interval} interval | {bin_closed} closed | {bin_label} label"
)
df_resample

In [None]:
# Keep only the "var1" series: exact column name in the input frame, substring
# match in the processed frame.
df_test = df_test.loc[:, df_test.columns.isin(["var1"])]
df_resample = df_resample.filter(like="var1")

In [None]:
# Scatter the "var1" input samples together with their resampled values and
# export the figure to `path` as SVG and PNG.

# One marker symbol per trace (input series first, then resampled series).
list_symbol = ["circle-open-dot", "square-open-dot"]

# One colour per symbol, sampled at evenly spaced positions of `colorscale`.
# The count is called `num_styles` rather than `n_colors` so the `n_colors`
# function imported from plotly.colors is not shadowed by an int.
num_styles = len(list_symbol)
list_colors = px.colors.sample_colorscale(
    colorscale, [pos / (num_styles - 1) for pos in range(num_styles)]
)
# NOTE(review): the previous random.shuffle(copy.deepcopy(list_colors)) shuffled a
# throw-away copy and never changed list_colors, so it was removed. Colours stay
# in colorscale order; if a shuffled order was intended, shuffle list_colors itself.

fig = go.Figure()

# (frame, marker size) in drawing order; the style index keeps counting across
# both frames so every trace gets its own symbol and colour.
i_label = 0
for frame, marker_size in ((df_test, 15), (df_resample, 10)):
    for col in frame.columns:
        fig.add_trace(go.Scatter(
            mode="markers",
            x=frame.index.values,
            y=frame[col].values,
            name=col,
            marker=dict(
                symbol=list_symbol[i_label],
                size=marker_size,
                line_width=2,
            ),
            # NOTE(review): with mode="markers" no connecting line is drawn, so this
            # colour may not be visible — confirm whether a marker colour was intended.
            line=dict(
                color=list_colors[i_label],
            ),
        ))
        i_label += 1

fig.update_layout(
    width=800,
    height=250,
    margin=dict(l=0, r=0, t=30, b=0),
    plot_bgcolor="rgb(255,255,255)",
    # horizontal legend centred above the plot area
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5,
    ),
)

fig.update_xaxes(
    range=["2022-07-12 00:59:00+06:00", "2022-07-12 01:16:00+06:00"],
    dtick=1000 * 60,  # one tick per minute (date-axis dtick is in milliseconds)
    showgrid=True,
    gridwidth=0.1,
    gridcolor="grey",
)

# Export as SVG, then rasterise the SVG to PNG with pyvips.
# https://stackoverflow.com/questions/51450134/how-to-convert-svg-to-png-or-jpeg-in-python
filename = "example_resample_{}-closed_{}-label".format(
    configs["data_processing"]["resample"]["bin_closed"],
    configs["data_processing"]["resample"]["bin_label"],
)
svg_file = path + "/" + filename + ".svg"
pio.write_image(fig, svg_file)
image = pyvips.Image.thumbnail(svg_file, 3000)
image.write_to_file(path + "/" + filename + ".png")

fig.show()

### feat_stats

In [None]:
# Settings for the feat_stats example: same 3-minute right-closed, right-labelled
# bins as before, now with feat_stats switched on over a 3-minute window.
configs["data_input"].update(target_var="Synthetic Site Electricity Main Total Power")

configs["data_processing"]["resample"].update(
    bin_interval="3min",
    bin_closed="right",
    bin_label="right",
)

configs["data_processing"]["feat_stats"].update(
    active=True,
    window_width="3min",
)

In [None]:
# Reload the fixture CSV with its first column as the index, convert that index
# to datetimes, and keep every column except "var2".
fixture_csv = "../../tests/fixtures/rolling_stats_input_w_target.csv"
df_test = pd.read_csv(fixture_csv, index_col=0)
df_test.index = pd.to_datetime(df_test.index)
df_test = df_test.drop(columns="var2", errors="ignore")
df_test

In [None]:
# Run resample_or_rolling_stats with feat_stats active, then append the window
# width to every column name so each series is labelled with it.
df_resample = resample_or_rolling_stats(df_test, configs)
stats_window = configs["data_processing"]["feat_stats"]["window_width"]
df_resample = df_resample.add_suffix(f" | {stats_window} window")
df_resample

In [None]:
# Keep only the "var1" series: exact column name in the input frame, substring
# match in the processed frame.
df_test = df_test.loc[:, df_test.columns.isin(["var1"])]
df_resample = df_resample.filter(like="var1")

In [None]:
# Scatter the "var1" input samples together with the columns produced for it by
# the feat_stats step, and export the figure to `path` as SVG and PNG.

# One marker symbol per trace; the two frames together may hold at most
# len(list_symbol) columns.
list_symbol = ["circle-open-dot", "square-open-dot", "diamond-open-dot", "x-open-dot"]

# One colour per symbol, sampled at evenly spaced positions of `colorscale`.
# The count is called `num_styles` rather than `n_colors` so the `n_colors`
# function imported from plotly.colors is not shadowed by an int.
num_styles = len(list_symbol)
list_colors = px.colors.sample_colorscale(
    colorscale, [pos / (num_styles - 1) for pos in range(num_styles)]
)
# NOTE(review): the previous random.shuffle(copy.deepcopy(list_colors)) shuffled a
# throw-away copy and never changed list_colors, so it was removed. Colours stay
# in colorscale order; if a shuffled order was intended, shuffle list_colors itself.

fig = go.Figure()

# (frame, marker size) in drawing order; the style index keeps counting across
# both frames so every trace gets its own symbol and colour.
i_label = 0
for frame, marker_size in ((df_test, 15), (df_resample, 10)):
    for col in frame.columns:
        fig.add_trace(go.Scatter(
            mode="markers",
            x=frame.index.values,
            y=frame[col].values,
            name=col,
            marker=dict(
                symbol=list_symbol[i_label],
                size=marker_size,
                line_width=2,
            ),
            # NOTE(review): with mode="markers" no connecting line is drawn, so this
            # colour may not be visible — confirm whether a marker colour was intended.
            line=dict(
                color=list_colors[i_label],
            ),
        ))
        i_label += 1

fig.update_layout(
    width=800,
    height=250,
    margin=dict(l=0, r=0, t=30, b=0),
    plot_bgcolor="rgb(255,255,255)",
    # horizontal legend centred above the plot area
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5,
    ),
)

fig.update_xaxes(
    range=["2022-07-12 00:59:00+06:00", "2022-07-12 01:16:00+06:00"],
    dtick=1000 * 60,  # one tick per minute (date-axis dtick is in milliseconds)
    showgrid=True,
    gridwidth=0.1,
    gridcolor="grey",
)

# Export as SVG, then rasterise the SVG to PNG with pyvips.
# https://stackoverflow.com/questions/51450134/how-to-convert-svg-to-png-or-jpeg-in-python
filename = "example_feat_stats"
svg_file = path + "/" + filename + ".svg"
pio.write_image(fig, svg_file)
image = pyvips.Image.thumbnail(svg_file, 3000)
image.write_to_file(path + "/" + filename + ".png")

fig.show()

### feat_timelag

In [None]:
# Settings for the time-lag example: 5 lags spaced 60 minutes apart.
configs["data_processing"]["feat_timelag"].update(
    lag_interval="60min",
    lag_count=5,
)

In [None]:
# Window of `data` used for the time-lag example.
timestamp_start = "2021-12-02 00:00:00+00:00"
timestamp_end = "2021-12-03 00:00:00+00:00"

# Predictor column that is shown as "var1" in the example.
var = "Synthetic Weather Station Dry Bulb Temperature"

# Select the predictor and the target by name, in a fixed order, and rename only
# the predictor. Selecting with a boolean mask and then overwriting `.columns`
# positionally would mislabel the two series whenever the target column comes
# before the predictor in `data`.
target_var = configs["data_input"]["target_var"]
data_temp = data.loc[timestamp_start:timestamp_end, [var, target_var]].rename(
    columns={var: "var1"}
)

In [None]:
# Run timelag_predictors on the selected window, keep the columns whose name
# contains "var1", and reverse the column order.
data_feat_stats = (
    timelag_predictors(data_temp, configs)
    .filter(like="var1")
    .iloc[:, ::-1]
)
data_feat_stats

In [None]:
# Draw every column of `data_feat_stats` as a line and export the figure to
# `path` as SVG and PNG.

# One dash pattern per trace; at most len(list_line) columns can be drawn.
list_line = ["solid", "dot", "dash", "longdash", "dashdot", "longdashdot"]

# One colour per dash pattern, sampled at evenly spaced positions of `colorscale`.
# The count is called `num_styles` rather than `n_colors` so the `n_colors`
# function imported from plotly.colors is not shadowed by an int.
num_styles = len(list_line)
list_colors = px.colors.sample_colorscale(
    colorscale, [pos / (num_styles - 1) for pos in range(num_styles)]
)
# NOTE(review): the previous random.shuffle(copy.deepcopy(list_colors)) shuffled a
# throw-away copy and never changed list_colors, so it was removed. Colours stay
# in colorscale order; if a shuffled order was intended, shuffle list_colors itself.

fig = go.Figure()

# Indexing (rather than zip) keeps the IndexError if there are more columns
# than styles.
for idx, col in enumerate(data_feat_stats.columns):
    fig.add_trace(go.Scatter(
        mode="lines",
        x=data_feat_stats.index.values,
        y=data_feat_stats[col].values,
        name=col,
        line=dict(
            color=list_colors[idx],
            dash=list_line[idx],
        ),
    ))

fig.update_layout(
    width=800,
    height=250,
    margin=dict(l=0, r=0, t=30, b=0),
    plot_bgcolor="rgb(255,255,255)",
    # horizontal legend centred above the plot area
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1,
        xanchor="center",
        x=0.5,
    ),
)

fig.update_xaxes(
    showgrid=True,
    gridwidth=0.1,
    gridcolor="grey",
)

# Export as SVG, then rasterise the SVG to PNG with pyvips.
# https://stackoverflow.com/questions/51450134/how-to-convert-svg-to-png-or-jpeg-in-python
filename = "example_feat_timelag"
svg_file = path + "/" + filename + ".svg"
pio.write_image(fig, svg_file)
image = pyvips.Image.thumbnail(svg_file, 3000)
image.write_to_file(path + "/" + filename + ".png")

fig.show()

### input_data_split

In [None]:
# Number of samples used by the split example in the next cell.
data_size = 345

print(f"data_size = {data_size}")

In [None]:
# Build `msk`, an array holding one split label per sample:
# 0 = training, 1 = validation, 2 = test (the initial fill value).

# set configuration parameters
np.random.seed(seed=configs["data_processing"]["random_seed"])
active_sequential = configs["data_processing"]["sequential_splicer"]["active"]
train_ratio = 0.85
val_ratio = 0.1
test_ratio = 0.05
window_witdh = configs["data_processing"]["sequential_splicer"]["window_width"]
# the training-set size is rounded down to a multiple of this factor
train_size_factor = 3

print("active_sequential = {}".format(active_sequential))

# split data based on random sequential chunks
if active_sequential:
    # NOTE(review): `timestamp` was never assigned in this notebook, so this branch
    # raised NameError on a fresh kernel. The index of `data` is assumed to be the
    # intended source; the mask then has one entry per row of `data` rather than
    # `data_size` entries — confirm.
    timestamp = data.index

    # chunk id of each sample: number of whole windows since the first timestamp
    splicer = ((timestamp - timestamp[0]) // pd.Timedelta(window_witdh)).values
    num_chunks = splicer[-1]
    num_train_chunks = (train_ratio * num_chunks) - (
        (train_ratio * num_chunks) % train_size_factor
    )
    if num_train_chunks == 0:
        raise Exception(
            "Total number of data chunks is zero. train_size_factor value might be too "
            "large compared to the data size. Exiting.."
        )

    # set indices for training set
    msk = np.zeros(timestamp.shape[0]) + 2
    train_chunks = np.random.choice(
        np.arange(num_chunks), replace=False, size=int(num_train_chunks)
    )
    # label every sample whose chunk was drawn for training
    msk[np.isin(splicer, train_chunks)] = 0

    # set indices for validation and test set
    remaining_chunks = np.setdiff1d(np.arange(num_chunks), train_chunks)
    if test_ratio == 0:
        # no test set: everything that is not training becomes validation
        msk[msk != 0] = 1
    else:
        # share of the non-training chunks that goes to validation
        num_val_chunks = int(
            (val_ratio / (1 - train_ratio)) * remaining_chunks.shape[0]
        )
        val_chunks = np.random.choice(
            remaining_chunks, replace=False, size=num_val_chunks
        )
        msk[np.isin(splicer, val_chunks)] = 1

# split data based on random timestamp sampling
else:
    # set indices for training set
    num_ones = (train_ratio * data_size) - (
        (train_ratio * data_size) % train_size_factor
    )
    msk = np.zeros(data_size) + 2
    indices = np.random.choice(
        np.arange(data_size), replace=False, size=int(num_ones)
    )
    msk[indices] = 0

    # set indices for validation and test set
    remaining_indices = np.where(msk != 0)[0]
    if test_ratio == 0:
        # no test set: everything that is not training becomes validation
        msk[remaining_indices] = 1
    else:
        # share of the non-training samples that goes to validation
        num_val = int((val_ratio / (1 - train_ratio)) * remaining_indices.shape[0])
        val_indices = np.random.choice(
            remaining_indices, replace=False, size=num_val
        )
        msk[val_indices] = 1

In [None]:
# Inspect the split mask built in the previous cell.
msk

In [None]:
# Inspect the training ratio set in the split cell.
train_ratio

In [None]:
# Inspect the sample count used by the split cell.
data_size

In [None]:
# Training-set size before rounding.
train_ratio * data_size

In [None]:
# Remainder of the unrounded training-set size modulo train_size_factor; the split
# cell subtracts it so the size becomes a multiple of train_size_factor.
(train_ratio * data_size) % train_size_factor

In [None]:
# Inspect the rounding factor set in the split cell.
train_size_factor

In [None]:
# Training-set size rounded down to a multiple of train_size_factor (the same
# expression the split cell assigns to num_ones).
(train_ratio * data_size) - (train_ratio * data_size) % train_size_factor