MIT License

Copyright (c) Microsoft Corporation. All rights reserved.

This notebook is adapted from Microsoft Learning mslearn-dp100 

Copyright (c) 2021 PyLadies Amsterdam, Alyona Galyeva

# Generate batch data

In [2]:
import os
import pandas as pd
import numpy as np
from azureml.core import Workspace, Dataset

In [3]:
# Remember the kernel's current working directory at the time this cell runs.
WORKDIR = os.getcwd()

In [4]:
# Local staging directory for the generated batch files. With exist_ok=True the
# call is a no-op when the directory is already there, so the cell can be re-run.
BATCH_FOLDER = 'batch-data'
os.makedirs(name=BATCH_FOLDER, exist_ok=True)
print("Folder created!")

Folder created!


In [6]:
# Load the Azure ML workspace from the local config.json.
ws = Workspace.from_config()
# Grab the workspace's default datastore; the generated batch files are
# uploaded to it further down.
datastore = ws.get_default_datastore()

In [7]:
# Look up the registered "test_nyc_demand_data" dataset in the workspace and
# materialise it as a pandas DataFrame.
test_ds = Dataset.get_by_name(workspace=ws, name="test_nyc_demand_data")
test = test_ds.to_pandas_dataframe()
# Preview the first five rows as the cell output.
test.head(5)

Unnamed: 0,timeStamp,demand,precip,temp,hour,month,dayofweek,temp_lag1,temp_lag2,temp_lag3,temp_lag4,temp_lag5,temp_lag6,demand_lag1,demand_lag2,demand_lag3,demand_lag4,demand_lag5,demand_lag6
0,2016-07-01 00:00:00,6444.75,0.0,74.63,0,6,4,75.1,75.72,76.72,75.85,77.36,80.92,6912.7,7332.625,7576.558,7603.008,7788.292,8102.142
1,2016-07-01 01:00:00,6083.075,0.0,73.77,1,6,4,74.63,75.1,75.72,76.72,75.85,77.36,6444.75,6912.7,7332.625,7576.558,7603.008,7788.292
2,2016-07-01 02:00:00,5795.317,0.0,72.32,2,6,4,73.77,74.63,75.1,75.72,76.72,75.85,6083.075,6444.75,6912.7,7332.625,7576.558,7603.008
3,2016-07-01 03:00:00,5610.942,0.0424,71.29,3,6,4,72.32,73.77,74.63,75.1,75.72,76.72,5795.317,6083.075,6444.75,6912.7,7332.625,7576.558
4,2016-07-01 04:00:00,5555.767,0.033,71.04,4,6,4,71.29,72.32,73.77,74.63,75.1,75.72,5610.942,5795.317,6083.075,6444.75,6912.7,7332.625


In [8]:
# Pick one week (Monday 31-07-2017 through Sunday 06-08-2017) by row position:
# rows 9480..9647, i.e. 168 rows. Drop the target ('demand') and the
# 'timeStamp' column, then convert what is left to a numpy array.
X = (
    test.iloc[9480:9648]
    .drop(columns=['demand', 'timeStamp'])
    .to_numpy()
)

In [16]:
# Show the feature matrix as the cell output (a bare last expression is displayed).
X

array([[0.000000e+00, 6.811000e+01, 0.000000e+00, ..., 6.443792e+03,
        6.478575e+03, 6.558408e+03],
       [0.000000e+00, 6.724000e+01, 1.000000e+00, ..., 6.481817e+03,
        6.443792e+03, 6.478575e+03],
       [0.000000e+00, 6.637000e+01, 2.000000e+00, ..., 6.296442e+03,
        6.481817e+03, 6.443792e+03],
       ...,
       [0.000000e+00, 7.064000e+01, 2.100000e+01, ..., 6.186025e+03,
        6.193600e+03, 6.120008e+03],
       [0.000000e+00, 6.927000e+01, 2.200000e+01, ..., 6.154817e+03,
        6.186025e+03, 6.193600e+03],
       [0.000000e+00, 6.835000e+01, 2.300000e+01, ..., 6.118825e+03,
        6.154817e+03, 6.186025e+03]])

In [18]:
# Write every row of X to its own comma-separated file inside BATCH_FOLDER.
# Files are numbered from 1, so row 0 ends up in "1.csv".
print("Saving files...")
for file_no, sample in enumerate(X, start=1):
    sample_path = os.path.join(BATCH_FOLDER, str(file_no) + '.csv')
    sample.tofile(sample_path, sep=",")
print("files saved!")

Saving files...
files saved!


In [19]:
# Upload the locally generated sample files to the datastore.
# src_dir uses BATCH_FOLDER (not a repeated string literal) so the upload always
# reads from the same directory the files were written to. target_path stays
# "batch-data" because the dataset registration cell reads from 'batch-data/'.
# overwrite=True replaces files already present at the target on a re-run.
datastore.upload(src_dir=BATCH_FOLDER, target_path="batch-data", overwrite=True, show_progress=True)

Uploading an estimated of 168 files
Uploading batch-data\1.csv
Uploaded batch-data\1.csv, 1 files out of an estimated total of 168
Uploading batch-data\10.csv
Uploaded batch-data\10.csv, 2 files out of an estimated total of 168
Uploading batch-data\100.csv
Uploaded batch-data\100.csv, 3 files out of an estimated total of 168
Uploading batch-data\101.csv
Uploaded batch-data\101.csv, 4 files out of an estimated total of 168
Uploading batch-data\102.csv
Uploaded batch-data\102.csv, 5 files out of an estimated total of 168
Uploading batch-data\103.csv
Uploaded batch-data\103.csv, 6 files out of an estimated total of 168
Uploading batch-data\104.csv
Uploaded batch-data\104.csv, 7 files out of an estimated total of 168
Uploading batch-data\105.csv
Uploaded batch-data\105.csv, 8 files out of an estimated total of 168
Uploading batch-data\106.csv
Uploaded batch-data\106.csv, 9 files out of an estimated total of 168
Uploading batch-data\107.csv
Uploaded batch-data\107.csv, 10 files out of an es

$AZUREML_DATAREFERENCE_6e9c11778e154738800ace2e25e0d8f5

In [20]:
# Build a file dataset over everything under 'batch-data/' on the datastore and
# register it in the workspace as 'batch-data' (a new version on each run).
batch_data_set = Dataset.File.from_files(path=(datastore, 'batch-data/'), validate=False)
try:
    batch_data_set = batch_data_set.register(workspace=ws,
                                             name='batch-data',
                                             description='batch data for nyc demand energy forecast',
                                             create_new_version=True)
except Exception as ex:
    # Registration stays best-effort (the notebook keeps running), but the
    # failure is labelled and is no longer followed by a misleading "Done!".
    # In this case batch_data_set still refers to the unregistered dataset.
    print("Dataset registration failed:", ex)
else:
    # Only report success when register() actually returned.
    print("Done!")

Done!
