In [1]:
import math
import os

import numpy as np
import pandas as pd
from azureml.core import Workspace, Dataset
from workalendar.europe import Netherlands

In [2]:
def add_fourier_features(df, column_name, period, n, period_name = "f"):
    """Append ``n`` sine/cosine columns built from ``df[column_name]``.

    Columns are named ``{period_name}_{i}`` for ``i`` in ``0 .. n-1``. Even
    ``i`` holds a sine term and odd ``i`` a cosine term; consecutive pairs
    share the same harmonic (1, 1, 2, 2, 3, 3, ...).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new columns. It is modified in place.
    column_name : str
        Column whose values are used as the position within the period.
    period : number
        Length of one full cycle, in the units of ``df[column_name]``.
    n : int
        Number of columns to add.
    period_name : str, default "f"
        Prefix for the generated column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    t = df[column_name]
    for idx in range(n):
        # idx 0,1 -> harmonic 1; idx 2,3 -> harmonic 2; ...
        harmonic = idx // 2 + 1
        trig = np.cos if idx % 2 else np.sin
        df[f'{period_name}_{idx}'] = trig(harmonic * 2 * np.pi * t / period)
    return df

In [3]:
def create_workday_weekend_features(df, fourier_order):
    """Build workday-only and weekend-only copies of the regressors.

    A row counts as a workday when ``is_holiday`` is falsy and ``day_of_week``
    is neither 5 nor 6. The flag is stored on ``df`` as a new boolean column
    ``is_workday`` (``df`` is modified in place).

    For ``temperature``, ``solar_ghi`` and ``f_quarter_0 .. f_quarter_{fourier_order-1}``
    two series are produced: the ``workday_*`` one keeps the value on workdays
    and is 0 otherwise, the ``weekend_*`` one does the opposite.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``is_holiday``, ``day_of_week`` and the columns listed above.
    fourier_order : int
        Number of ``f_quarter_*`` columns to include.

    Returns
    -------
    tuple of (dict, dict)
        ``(workday_data, weekend_data)``, each mapping a prefixed column name
        to a pandas Series.
    """
    day = df['day_of_week']
    # De Morgan form of: not (holiday or day == 5 or day == 6)
    df['is_workday'] = ~df['is_holiday'].astype(bool) & (day != 5) & (day != 6)

    base_columns = ['temperature', 'solar_ghi']
    base_columns += [f'f_quarter_{order}' for order in range(fourier_order)]

    # 0/1 multipliers that blank out the rows belonging to the other regime
    workday_flag = df['is_workday'].astype(int)
    weekend_flag = (~df['is_workday']).astype(int)

    workday_data = {}
    weekend_data = {}
    for column in base_columns:
        workday_data[f'workday_{column}'] = df[column] * workday_flag
        weekend_data[f'weekend_{column}'] = df[column] * weekend_flag
    return workday_data, weekend_data

In [4]:
# Current working directory of the kernel at the time this cell runs
# (not referenced by any of the cells shown in this notebook).
WORKDIR = os.getcwd()

In [5]:
# Local directory (relative to the working directory) that will hold the batch CSV files
BATCH_FOLDER = 'batch-data'
# exist_ok=True keeps this cell re-runnable when the directory is already present
os.makedirs(name=BATCH_FOLDER, exist_ok=True)
print("Folder created!")

Folder created!


In [6]:
# Load the Azure ML workspace; from_config() is called without arguments,
# so the workspace is resolved from the SDK's config file (config.json)
ws = Workspace.from_config()
# Default datastore of that workspace; the batch files are uploaded to it further down
datastore = ws.get_default_datastore()

In [7]:
# Look up the registered dataset "energy_data_15_min" in the workspace
# and load it into a pandas DataFrame
ds = Dataset.get_by_name(ws, name="energy_data_15_min")
df = ds.to_pandas_dataframe()
# Preview the first rows
df.head()

Unnamed: 0,data_index_,temperature,solar_ghi,solar_prediction_mw,wind_prediction_mw,load_actuals_mw
0,2020-01-01 00:00:00,274.989655,0.0,0.0,70.865426,95.756328
1,2020-01-01 00:15:00,274.925659,0.0,0.0,69.296785,94.836196
2,2020-01-01 00:30:00,274.861694,0.0,0.0,66.977409,93.798127
3,2020-01-01 00:45:00,274.797699,0.0,0.0,64.305715,92.162902
4,2020-01-01 01:00:00,274.423157,0.0,0.0,61.128262,91.50667


In [8]:
# Select the last 7 rows of the 15-minute data and drop the target column.
# NOTE(review): this was previously described as "last week"; a full week of
# 15-minute samples would be 7 * 24 * 4 rows. Seven rows matches the seven files
# produced further down, so the selection itself is left unchanged; confirm intent.
X = df.iloc[-7:].drop(columns=['load_actuals_mw'])

# Calendar features derived from the timestamp column
timestamps = X['data_index_']
hours = timestamps.dt.hour
X['day_of_week'] = timestamps.dt.dayofweek
# Six-hour block of the day: hours 1-6 -> 1, 7-12 -> 2, 13-18 -> 3, 19-23 and 0 -> 4
X['quarter_of_day'] = np.where(hours > 0, np.ceil(hours / 6.), 4).astype(int)

# Holiday indicator from the workalendar Netherlands calendar (carnival included).
# `Netherlands` is imported in the notebook's import cell.
cal = Netherlands(include_carnival=True)

# Collect holidays for every year that occurs in the selected rows rather than a
# fixed 2020/2021 pair, so data from other years is not silently flagged as
# non-holiday. For timestamps in 2020 or 2021 the resulting flags are the same.
holidates = [
    holiday
    for year in sorted(timestamps.dt.year.unique())
    for holiday in cal.holidays(int(year))
]
# Each entry is indexed with [0] to obtain the holiday's date
pd_holidays = pd.to_datetime([d[0] for d in holidates])

# Compare on calendar date only (time of day stripped)
X['is_holiday'] = pd.to_datetime(timestamps.dt.date).isin(pd_holidays)

# Fourier terms computed from quarter_of_day.
# NOTE(review): quarter_of_day only takes the values 1..4, while the period passed
# below is 4 * 24 = 96 (the number of quarter-hours in a day). Left as-is so the
# features keep their current definition; confirm this is what the model expects.
fourier_order = 6
X = add_fourier_features(X, "quarter_of_day", 4 * 24, fourier_order, "f_quarter")

# Separate workday and weekend/holiday versions of the regressors and
# assemble them into the model input frame
workday_data, weekend_data = create_workday_weekend_features(X, fourier_order)
X_linregr = pd.DataFrame({**workday_data, **weekend_data})

In [9]:
# Input feature columns: all workday features first, then all weekend features
# (iterating a dict yields its keys in insertion order)
feat_columns = [*workday_data, *weekend_data]

In [10]:
# Rebind X to the feature matrix as a NumPy array, columns ordered as in feat_columns
X = X_linregr.loc[:, feat_columns].to_numpy()

In [11]:
# Write every row of X to its own comma-separated file inside BATCH_FOLDER,
# numbered from 1 (1.csv, 2.csv, ...)
print("Saving files...")
for sample_no, sample in enumerate(X, start=1):
    target = os.path.join(BATCH_FOLDER, f'{sample_no}.csv')
    sample.tofile(target, sep=",")
print("files saved!")

Saving files...
files saved!


In [12]:
# Upload the locally written CSV files to the datastore. The source directory is
# taken from BATCH_FOLDER (the folder the files were saved into) instead of a
# repeated string literal, so the upload cannot drift from the save step.
# target_path stays "batch-data" because the dataset registered below reads
# from 'batch-data/' on the datastore.
datastore.upload(src_dir=BATCH_FOLDER, target_path="batch-data", overwrite=True, show_progress=True)

Uploading an estimated of 7 files
Uploading batch-data\1.csv
Uploaded batch-data\1.csv, 1 files out of an estimated total of 7
Uploading batch-data\2.csv
Uploaded batch-data\2.csv, 2 files out of an estimated total of 7
Uploading batch-data\3.csv
Uploaded batch-data\3.csv, 3 files out of an estimated total of 7
Uploading batch-data\4.csv
Uploaded batch-data\4.csv, 4 files out of an estimated total of 7
Uploading batch-data\5.csv
Uploaded batch-data\5.csv, 5 files out of an estimated total of 7
Uploading batch-data\6.csv
Uploaded batch-data\6.csv, 6 files out of an estimated total of 7
Uploading batch-data\7.csv
Uploaded batch-data\7.csv, 7 files out of an estimated total of 7
Uploaded 7 files


$AZUREML_DATAREFERENCE_f0ee28efb8c54513b8a85695db09d576

In [13]:
# Create a file dataset pointing at the 'batch-data/' folder on the datastore
# and register it in the workspace as 'batch-data'.
batch_data_set = Dataset.File.from_files(path=(datastore, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(workspace=ws,
                                             name='batch-data',
                                             description='batch data for pytown demand energy forecast',
                                             create_new_version=True)
except Exception as ex:
    # Keep the notebook running, but say clearly that registration did not succeed;
    # batch_data_set then still refers to the unregistered dataset created above.
    print(f"Dataset registration failed: {ex}")
else:
    # Only report success when register() actually returned
    print("Done!")

Done!
