In [62]:
import os
import math
import shutil
import pandas as pd
import numpy as np
from workalendar.europe import Netherlands
from azureml.core import Workspace, Dataset

In [8]:
# Project root: the parent of the directory the notebook is run from
WORKDIR = os.path.split(os.getcwd())[0]

In [2]:
def add_fourier_features(df, column_name, period, n, period_name = "f"):
    """Append ``n`` Fourier terms of a periodic column to ``df``.

    Columns are named ``{period_name}_{i}`` for ``i`` in ``0..n-1``. Even ``i``
    holds a sine term and odd ``i`` the cosine term of the same harmonic, so
    the harmonics run 1, 1, 2, 2, 3, 3, ...

    Args:
        df: DataFrame to extend; it is modified in place.
        column_name: Name of the column holding the position within the period.
        period: Length of one full cycle, in the units of ``column_name``.
        n: Number of Fourier columns to add.
        period_name: Prefix for the generated column names.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    t = df[column_name]
    for i in range(n):
        # terms come in sin/cos pairs: indices 0-1 -> harmonic 1, 2-3 -> harmonic 2, ...
        harmonic = i // 2 + 1
        wave = np.cos if i % 2 else np.sin
        df[f'{period_name}_{i}'] = wave(harmonic * 2 * np.pi * t / period)
    return df

In [3]:
def create_holiday_features(df):
    """Add a boolean ``is_holiday`` column based on the Dutch holiday calendar.

    The calendar is built for every year that occurs in ``df['data_index_']``
    rather than for a fixed list of years, so rows outside 2020-2021 are
    flagged as well.

    Args:
        df: DataFrame with a datetime ``data_index_`` column; modified in place.

    Returns:
        The same DataFrame object, with the ``is_holiday`` column added.
    """
    # holiday indicator feature
    cal = Netherlands(include_carnival=True)

    # Collect the holidays of every year present in the data
    # (missing timestamps are skipped; they can never match a holiday anyway)
    years = sorted(int(y) for y in df['data_index_'].dt.year.dropna().unique())
    # cal.holidays(year) yields (date, label) pairs; only the dates are needed
    holidates = [day for year in years for day in cal.holidays(year)]
    pd_holidays = pd.to_datetime([d[0] for d in holidates])

    # compare on calendar date only, ignoring the time of day
    df['is_holiday'] = pd.to_datetime(df['data_index_'].dt.date).isin(pd_holidays)
    return df

In [4]:
def create_workday_weekend_features(df, fourier_order):
    """Build separate workday and weekend/holiday versions of the model inputs.

    A boolean ``is_workday`` column is added to ``df`` in place. Each input
    column (``temperature``, ``solar_ghi`` and ``f_quarter_0`` ..
    ``f_quarter_{fourier_order - 1}``) is then emitted twice: once zeroed on
    non-workdays (``workday_*``) and once zeroed on workdays (``weekend_*``).

    Args:
        df: DataFrame with ``is_holiday``, ``day_of_week`` and the input columns.
        fourier_order: Number of ``f_quarter_*`` columns to include.

    Returns:
        Tuple ``(workday_data, weekend_data)`` of dicts mapping the new column
        names to Series.
    """
    # a workday is neither a holiday nor day-of-week 5 or 6
    df['is_workday'] = (
        ~df.is_holiday.astype(bool)
        & (df.day_of_week != 5)
        & (df.day_of_week != 6)
    )

    input_columns = ['temperature', 'solar_ghi']
    input_columns += [f'f_quarter_{order}' for order in range(fourier_order)]

    # 0/1 masks used to zero out the rows that do not belong to each regime
    workday_mask = df.is_workday.astype(int)
    weekend_mask = (~df.is_workday).astype(int)

    workday_data = {f'workday_{name}': df[name] * workday_mask for name in input_columns}
    weekend_data = {f'weekend_{name}': df[name] * weekend_mask for name in input_columns}
    return workday_data, weekend_data

In [5]:
# Connect to the Azure ML workspace described by the local config.json;
# every dataset lookup, upload and registration below goes through `ws`
ws = Workspace.from_config()
# Default datastore of the workspace: target of the uploads of the generated CSV files
datastore = ws.get_default_datastore()

## Generate real-time data

In [6]:
# retrieve the registered real-time dataset from the workspace and load it into pandas
ds = Dataset.get_by_name(ws, name="energy_data_realtime")
df = ds.to_pandas_dataframe()
# pick up only the last 10 samples (2.5 hours at the 15-minute spacing visible in the output)
X = df.iloc[-10:]
X

Unnamed: 0,data_index_,solar_prediction_mw,wind_prediction_mw,load_actuals_mw
50487,2021-06-09 21:45:00,0.0,36.29834,96.104623
50488,2021-06-09 22:00:00,0.0,52.180125,94.768896
50489,2021-06-09 22:15:00,0.0,52.216332,92.284519
50490,2021-06-09 22:30:00,0.0,52.100467,90.824687
50491,2021-06-09 22:45:00,0.0,51.901324,90.218876
50492,2021-06-09 23:00:00,0.0,51.823736,90.339627
50493,2021-06-09 23:15:00,0.0,50.666638,88.413236
50494,2021-06-09 23:30:00,0.0,49.584024,87.441277
50495,2021-06-09 23:45:00,0.0,48.517445,85.329757
50496,2021-06-10 00:00:00,0.0,47.509316,84.122788


In [9]:
# Local staging folder for the per-sample real-time CSV files
REALTIME_FOLDER = os.path.join(WORKDIR, 'realtime-data')
# Start from an empty folder, as is done for the batch folder: the file names
# depend on the row index, so files left over from a previous run would
# otherwise be uploaded and registered together with the current samples
if os.path.isdir(REALTIME_FOLDER):
    shutil.rmtree(REALTIME_FOLDER)
os.makedirs(REALTIME_FOLDER, exist_ok=True)

In [10]:
# Save each sample as a separate two-line CSV file (header + values),
# named after the row index + 1
header = X.columns.tolist()
header_line = ','.join(str(c) for c in header)

for index, row in X.iterrows():
    fname = str(index+1) + '.csv'
    # the context manager closes the file even when a write fails
    with open(os.path.join(REALTIME_FOLDER, fname), 'w') as f:
        f.write(header_line + '\n')
        f.write(','.join(str(v) for v in row.values))

In [12]:
# Upload the content of the local real-time folder to the "realtime-data" path
# of the datastore, overwriting files that already exist there
datastore.upload(src_dir=REALTIME_FOLDER, target_path="realtime-data", overwrite=True, show_progress=True)

Uploading an estimated of 10 files
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50488.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50488.csv, 1 files out of an estimated total of 10
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50489.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50489.csv, 2 files out of an estimated total of 10
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50490.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50490.csv, 3 files out of an estimated total of 10
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50491.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50491.csv, 4 files out of an estimated total of 10
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50492.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\realtime-data\50492.csv, 5 fil

$AZUREML_DATAREFERENCE_689f07c3fd3f47f6bc0baf18babc59a8

In [13]:
# Register a dataset for the input data
# (file dataset over everything under "realtime-data/" on the datastore)
realtime_data_set = Dataset.File.from_files(path=(datastore, 'realtime-data/'), validate=False)
try:
    realtime_data_set = realtime_data_set.register(
        workspace=ws,
        name='realtime-data',
        description='realtime data for pytown demand energy forecast',
        create_new_version=True,
    )
except Exception as ex:
    # Best effort: keep the notebook running, but make the failure visible
    # instead of reporting success below
    print(f"Dataset registration failed: {ex}")
else:
    # only report success when the registration actually went through
    print("Done!")

Done!


## Generate batch data

In [92]:
# Load the registered 15-minute dataset from the workspace into pandas
ds = Dataset.get_by_name(ws, name="energy_data_15_min")
df = ds.to_pandas_dataframe()
# Keep the most recent week of data: the last 672 rows
X = df.tail(672)
X

Unnamed: 0,data_index_,temperature,solar_ghi,solar_prediction_mw,wind_prediction_mw,load_actuals_mw
49825,2021-06-03 00:15:00,290.465912,0.0,0.0,53.265842,87.947544
49826,2021-06-03 00:30:00,290.422150,0.0,0.0,53.888616,87.836423
49827,2021-06-03 00:45:00,290.378387,0.0,0.0,54.780881,86.913803
49828,2021-06-03 01:00:00,290.325928,0.0,0.0,56.868520,86.637136
49829,2021-06-03 01:15:00,290.264740,0.0,0.0,57.985790,86.708764
...,...,...,...,...,...,...
50492,2021-06-09 23:00:00,288.254852,0.0,0.0,51.823736,90.339627
50493,2021-06-09 23:15:00,288.050537,0.0,0.0,50.666638,88.413236
50494,2021-06-09 23:30:00,287.846252,0.0,0.0,49.584024,87.441277
50495,2021-06-09 23:45:00,287.641937,0.0,0.0,48.517445,85.329757


In [93]:
# drop target and add features
# errors='ignore' keeps this cell re-runnable: on a second run the target
# column has already been removed from X
X = X.drop(columns=['load_actuals_mw'], errors='ignore')
# generate additional ML features
# NOTE(review): hour // 6 yields 6-hour blocks (values 0-3), while the Fourier
# period used below is 4 * 24 — confirm this matches the feature definition the
# model was trained with before changing either side
X['quarter_of_day'] = (X.data_index_.dt.hour // 6)
X['day_of_week'] = X.data_index_.dt.dayofweek
X = create_holiday_features(X)

# add Fourier features to capture daily pattern in model
fourier_order = 6

X = add_fourier_features(X, "quarter_of_day", 4 * 24, fourier_order, "f_quarter")

# split workdays and weekend/holidays
workday_data, weekend_data = create_workday_weekend_features(X, fourier_order)
# model input frame: all workday_* columns followed by all weekend_* columns
X_linregr = pd.DataFrame(
    {**workday_data, **weekend_data}
)

In [94]:
# Names of the model input columns: workday features first, then weekend features
feat_columns = [*workday_data, *weekend_data]

In [100]:
BATCH_FOLDER = os.path.join(WORKDIR, 'batch-data')
# Start from a clean local folder: drop whatever an earlier run left behind
# (isdir is False for a missing path, so no separate existence check is needed)
if os.path.isdir(BATCH_FOLDER):
    shutil.rmtree(BATCH_FOLDER, ignore_errors=True)
os.makedirs(BATCH_FOLDER, exist_ok=True)

In [101]:
# Save the samples as separate "daily" batch files (96 data points each).
# Each file holds a header line followed by one line per sample
# (timestamp, then the feature values), without a trailing newline.
header = ['data_index_'] + X_linregr[feat_columns].columns.tolist()
X_feat = X_linregr[feat_columns].to_numpy()
# look the timestamps up once instead of materialising a row of X per sample
timestamps = X['data_index_'].tolist()
batch_size = 96
# rows beyond the last complete batch are not written
for j in range(X_feat.shape[0] // batch_size):
    fname = 'batch-' + str(j+1) + '.csv'
    lines = [
        ','.join([str(timestamps[i])] + [str(v) for v in X_feat[i].tolist()])
        for i in range(j * batch_size, (j+1) * batch_size)
    ]
    # "w" rather than "a": re-running the cell must replace a batch file, never
    # append a second copy to one that survived the folder cleanup
    with open(os.path.join(BATCH_FOLDER, fname), "w") as f:
        f.write(','.join(str(c) for c in header) + '\n')
        f.write('\n'.join(lines))

In [102]:
# Upload the content of the local batch folder to the "batch-data" path
# of the datastore, overwriting files that already exist there
datastore.upload(src_dir=BATCH_FOLDER, target_path="batch-data", overwrite=True, show_progress=True)

Uploading an estimated of 7 files
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-2.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-2.csv, 1 files out of an estimated total of 7
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-4.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-4.csv, 2 files out of an estimated total of 7
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-6.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-6.csv, 3 files out of an estimated total of 7
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-1.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-1.csv, 4 files out of an estimated total of 7
Uploading d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-3.csv
Uploaded d:\DS\MLOps-Bootcamp-2021\MLOps-Bootcamp-2021\batch-data\batch-3.csv, 5 files out of an es

$AZUREML_DATAREFERENCE_0346869f9b7541589ed50bf44eaffa39

In [103]:
# Register a dataset for the input data
# (file dataset over everything under "batch-data/" on the datastore)
batch_data_set = Dataset.File.from_files(path=(datastore, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(
        workspace=ws,
        name='batch-data',
        description='batch data for pytown demand energy forecast',
        create_new_version=True,
    )
except Exception as ex:
    # Best effort: keep the notebook running, but make the failure visible
    # instead of reporting success below
    print(f"Dataset registration failed: {ex}")
else:
    # only report success when the registration actually went through
    print("Done!")

Done!
