# Predicting future energy usage from multiple dependent time series (v2)

## Part 1: Load and examine the data

In [None]:
data_bucket = 'doughudgeon-mlforbusiness' # change this to the name of your own S3 bucket
subfolder = 'ch06'  # key prefix inside the bucket under which this notebook reads and writes

In [None]:
%matplotlib inline

import sys
from dateutil.parser import parse
import json
from random import shuffle
import random
import datetime
import os

import pandas as pd                               
import boto3
import s3fs
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# This notebook is written against version 2.x of the SageMaker Python SDK.
# If the installed major version is not 2, upgrade the package with pip; the
# kernel then has to be restarted before the new version is picked up.
# NOTE(review): `--upgrade` installs the newest release, which is not
# guaranteed to be a 2.x version — confirm against the SDK version you target.
if int(sagemaker.__version__.split('.')[0]) == 2:
    print("Version is good")
else:
    !{sys.executable} -m pip install --upgrade sagemaker
    print("Installing latest SageMaker Version. Please restart the kernel")
    
# Execution role of this notebook; it is handed to the Estimator when the model is trained.
role = sagemaker.get_execution_role()
# Non-anonymous s3fs handle, used later to write the train/test files to the bucket.
s3 = s3fs.S3FileSystem(anon=False)

In [None]:
# S3 locations for the train/test files written later and for the training output.
s3_data_path = f"s3://{data_bucket}/{subfolder}/data"
s3_output_path = f"s3://{data_bucket}/{subfolder}/output"
# Load the raw CSV straight from S3; its first column becomes the index.
df = pd.read_csv(f's3://{data_bucket}/{subfolder}/meter_data.csv', index_col=0)
df.head()

In [None]:
# Report the dimensions of the raw dataset.
print(f'Number of rows in dataset: {df.shape[0]}')
print(f'Number of columns in dataset: {df.shape[1]}')

## Part 2 : Get the data in the right shape

In [None]:
# Parse the index into timestamps so the frame can be resampled by time.
df.index = pd.to_datetime(df.index)
# Collapse to one row per calendar day by summing all readings that fall within the day.
daily_df = df.resample('D').sum()
daily_df.head()

In [None]:
# Show the size of the daily frame and the first/last day it covers.
print(daily_df.shape)
print(f'Time series starts at {daily_df.index[0]} \
and ends at {daily_df.index[-1]}')

In [None]:
# Fill any missing daily value with the value from 7 rows earlier, i.e. the
# same weekday of the previous week in this daily-resampled frame.
# NOTE(review): resample('D').sum() uses min_count=0 by default, so empty days
# may already be 0 rather than NaN — confirm this fill actually changes anything.
daily_df = daily_df.fillna(daily_df.shift(7))
daily_df

In [None]:
# Draw ten of the series over the same three-month window, one per panel of a
# 5x2 grid, so that shared patterns between sites can be compared by eye.
print('Number of time series:',daily_df.shape[1])
fig, axs = plt.subplots(5, 2, figsize=(20, 20), sharex=True)
axx = axs.ravel()
indices = [0,1,2,3,4,5,40,41,42,43]
for plot_num, i in enumerate(indices):
    panel = axx[plot_num]
    site_column = daily_df.columns[i]
    window = daily_df[site_column].loc["2017-11-01":"2018-01-31"]
    window.plot(ax=panel)
    panel.set_xlabel("date")
    panel.set_ylabel("kW consumption")

Visually there are some noticeable correlations which DeepAR will likely recognise and use!

## Part 3: Create Train and Test Datasets

In [None]:
# One series per column of daily_df: strip the leading run of zeros from the
# front of each column, then replace any remaining missing values with 0.
daily_power_consumption_per_site = [
    np.trim_zeros(daily_df[column], trim='f').fillna(0)
    for column in daily_df.columns
]

# Summarise the first series as a sanity check.
print(f'Time series covers {len(daily_power_consumption_per_site[0])} days.')
print(f'Time series starts at {daily_power_consumption_per_site[0].index[0]}')
print(f'Time series ends at {daily_power_consumption_per_site[0].index[-1]}')

In [None]:
from datetime import timedelta

# Sampling frequency handed to DeepAR as `time_freq`, and the forecast horizon in days.
freq = 'D'
prediction_length = 30

# pd.Timestamp no longer accepts a `freq` argument (deprecated in pandas 1.4,
# removed in pandas 2.0), so the start date is built as a plain timestamp.
# The frequency is carried separately in `freq` above.
start_date = pd.Timestamp("2017-11-01 00:00:00")
# Training covers start_date plus 364 further days; the test window extends
# that by one prediction horizon.
end_training = start_date + timedelta(days=364)
end_testing = end_training + timedelta(days=prediction_length)

print(f'End training: {end_training}, End testing: {end_testing}')

In [None]:
def _to_deepar_records(series_list, first_day, last_day):
    """Encode the [first_day, last_day] window of every series as a record.

    series_list -- iterable of `pandas.Series` objects indexed by timestamp
    first_day -- first timestamp (inclusive) of the window
    last_day -- last timestamp (inclusive) of the window

    Return value: list of dictionaries with the keys "start" and "target"
    """
    records = []
    for ts in series_list:
        window = ts[first_day:last_day]
        # A series whose leading zeros were trimmed can begin after first_day.
        # "start" has to be the timestamp of the first value in "target", so it
        # is taken from the window itself; an empty window keeps first_day.
        start = window.index[0] if len(window) else first_day
        records.append({"start": str(start), "target": window.tolist()})
    return records


# The test set is the training window extended by one prediction horizon.
training_data = _to_deepar_records(daily_power_consumption_per_site, start_date, end_training)
test_data = _to_deepar_records(daily_power_consumption_per_site, start_date, end_testing)

In [None]:
def write_dicts_to_s3(path, data):
    """Write every item of `data` to `path` as one UTF-8 encoded JSON line.

    path -- destination opened in binary write mode through the `s3` handle
    data -- iterable of JSON-serialisable objects
    """
    with s3.open(path, 'wb') as out:
        for record in data:
            line = json.dumps(record) + "\n"
            out.write(line.encode("utf-8"))
            
# Upload both datasets under the train/ and test/ prefixes that the training
# job later reads as its "train" and "test" channels.
write_dicts_to_s3(f'{s3_data_path}/train/train.json', training_data)
write_dicts_to_s3(f'{s3_data_path}/test/test.json', test_data)

## Part 4: Train the Model

In [None]:
# Re-assigned with the same value it was given when the data was loaded.
s3_output_path = f's3://{data_bucket}/{subfolder}/output'
sess = sagemaker.Session()
# Look up the "forecasting-deepar" container image for the session's region.
image_name = sagemaker.image_uris.retrieve("forecasting-deepar", sess.boto_region_name, "latest")

In [None]:
# Estimator that trains the DeepAR image on a single instance and writes its
# output to s3_output_path.
estimator = sagemaker.estimator.Estimator(
    sagemaker_session=sess,
    image_uri=image_name,
    role=role,
    instance_count=1,
    instance_type='ml.c5.2xlarge', # $0.476 per hour as of Jan 2019 - check current pricing.
    base_job_name='ch6-energy-usage',
    output_path=s3_output_path
)

In [None]:
# Hyperparameters are passed as strings. time_freq and prediction_length reuse
# the values defined alongside the train/test split so that the model and the
# datasets agree on frequency and horizon.
estimator.set_hyperparameters(
    time_freq=freq,
    epochs="400",
    early_stopping_patience="40",
    mini_batch_size="64",
    learning_rate="5E-4",
    context_length="90",
    prediction_length=str(prediction_length)
)

In [None]:
%%time
# Point the "train" and "test" channels at the S3 prefixes the datasets were
# written to, then start training and block until the job finishes (wait=True).
data_channels = {
    "train": "{}/train/".format(s3_data_path),
    "test": "{}/test/".format(s3_data_path)
}
estimator.fit(inputs=data_channels, wait=True)

## Part 5: Host the model

In [None]:
from time import sleep

endpoint_name = 'energy-usage'

# Remove an endpoint left over from an earlier run so that the name can be
# reused by the deployment below.
#
# Version 2.x of the SageMaker SDK names the Predictor argument `endpoint_name`
# (not `endpoint`) and removes the endpoint configuration through
# Predictor.delete_endpoint(delete_endpoint_config=True). The previous v1-style
# call raised inside a bare `except: pass`, so nothing was ever deleted.
try:
    sagemaker.predictor.Predictor(
        endpoint_name=endpoint_name,
        sagemaker_session=sess
    ).delete_endpoint(delete_endpoint_config=True)
except sess.sagemaker_client.exceptions.ClientError:
    # Expected on a first run: there is no endpoint with this name to delete.
    pass
else:
    print('Warning: Existing endpoint and configuration deleted to make way for your new endpoint.')
    # Pause before the name is reused; presumably gives the deletion time to complete.
    sleep(30)

In [None]:
class DeepARPredictor(sagemaker.predictor.Predictor):
    """Predictor that turns a `pandas.Series` into a JSON request for the
    endpoint and turns the JSON response back into a `pandas.DataFrame`.
    """

    def __init__(self, *args, **kwargs):
        # Imported locally so that the class does not depend on an import made
        # in a later notebook cell.
        from sagemaker.serializers import IdentitySerializer

        # The request body is already JSON-encoded bytes (see __encode_request),
        # so it is passed through unchanged and only labelled as JSON.
        super().__init__(*args,
                         serializer=IdentitySerializer(content_type="application/json"),
                         **kwargs)
        # Fallback frequency for series whose index carries none; see set_frequency.
        self.freq = None

    def predict(self, ts, cat=None, dynamic_feat=None,
                num_samples=100, return_samples=False, quantiles=("0.1", "0.5", "0.9")):
        """Requests the prediction for the time series `ts`, with the (optional)
        corresponding category `cat`.

        ts -- `pandas.Series` object, the time series to predict
        cat -- integer, the group associated to the time series (default: None)
        dynamic_feat -- optional dynamic features sent along with the series (default: None)
        num_samples -- integer, number of samples to compute at prediction time (default: 100)
        return_samples -- boolean indicating whether to include samples in the response (default: False)
        quantiles -- sequence of quantiles to compute, given as strings or numbers
                     (default: "0.1", "0.5", "0.9")

        Return value: a `pandas.DataFrame` with one column per quantile (plus one
        column per sample when `return_samples` is true), indexed by the predicted dates

        Raises ValueError when neither the index of `ts` nor `set_frequency`
        provides a frequency.
        """
        # Prefer the frequency carried by the series; otherwise use the one set
        # through set_frequency.
        freq = getattr(ts.index, "freq", None)
        if freq is None:
            freq = getattr(self, "freq", None)
        if freq is None:
            raise ValueError(
                "The index of `ts` has no frequency; call set_frequency() first "
                "or pass a series with a fixed-frequency DatetimeIndex."
            )
        # Accept both offset objects and alias strings such as 'D'.
        freq = pd.tseries.frequencies.to_offset(freq)

        # The first predicted point is one step after the last observation.
        prediction_time = ts.index[-1] + freq
        quantiles = [str(q) for q in quantiles]
        req = self.__encode_request(ts, cat, dynamic_feat, num_samples, return_samples, quantiles)
        res = super().predict(req)
        return self.__decode_response(res, freq, prediction_time, return_samples)

    def __encode_request(self, ts, cat, dynamic_feat, num_samples, return_samples, quantiles):
        """Builds the UTF-8 encoded JSON request body for a single time series."""
        # An empty `dynamic_feat` is treated like a missing one.
        instance = series_to_dict(ts, cat, dynamic_feat if dynamic_feat else None)

        configuration = {
            "num_samples": num_samples,
            "output_types": ["quantiles", "samples"] if return_samples else ["quantiles"],
            "quantiles": quantiles
        }

        http_request_data = {
            "instances": [instance],
            "configuration": configuration
        }

        return json.dumps(http_request_data).encode('utf-8')

    def __decode_response(self, response, freq, prediction_time, return_samples):
        """Converts the raw response bytes into a `pandas.DataFrame` indexed from
        `prediction_time` onwards at frequency `freq`.
        """
        # we only sent one time series so we only receive one in return
        # however, if possible one will pass multiple time series as predictions will then be faster
        predictions = json.loads(response.decode('utf-8'))['predictions'][0]
        # The horizon is read off the length of any one of the quantile lists.
        prediction_length = len(next(iter(predictions['quantiles'].values())))
        prediction_index = pd.date_range(start=prediction_time, freq=freq, periods=prediction_length)
        if return_samples:
            dict_of_samples = {'sample_' + str(i): s for i, s in enumerate(predictions['samples'])}
        else:
            dict_of_samples = {}
        return pd.DataFrame(data={**predictions['quantiles'], **dict_of_samples}, index=prediction_index)

    def set_frequency(self, freq):
        """Stores the frequency that `predict` falls back to when the index of
        the series carries none.
        """
        self.freq = freq
        
def encode_target(ts):
    """Returns the values of `ts` as a list in which every non-finite value
    (NaN or +/- infinity) is replaced by the string "NaN".
    """
    encoded = []
    for value in ts:
        if np.isfinite(value):
            encoded.append(value)
        else:
            encoded.append("NaN")
    return encoded

def series_to_dict(ts, cat=None, dynamic_feat=None):
    """Encodes a pandas.Series object as a dictionary.

    ts -- a pandas.Series object with the target time series
    cat -- an integer indicating the time series category (omitted when None)
    dynamic_feat -- dynamic features of the series (omitted when None)

    Return value: a dictionary with the keys "start" and "target", plus "cat"
    and "dynamic_feat" when those were given
    """
    encoded = {"start": str(ts.index[0]), "target": encode_target(ts)}
    optional_fields = {"cat": cat, "dynamic_feat": dynamic_feat}
    # Only fields that were actually supplied are added to the result.
    encoded.update(
        {key: value for key, value in optional_fields.items() if value is not None}
    )
    return encoded

Now we can deploy the model and create an endpoint that can be queried using our custom DeepARPredictor class.

In [None]:
%%time
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
#from sagemaker.serializers import CSVSerializer
from sagemaker.serializers import CSVSerializer, IdentitySerializer

# Deploy the trained model behind a single-instance endpoint and wrap it in the
# custom predictor class.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    predictor_cls=DeepARPredictor,
    # No serializer/deserializer is passed here: DeepARPredictor sets its own
    # JSON serializer in __init__ and decodes the raw response itself.
    endpoint_name=endpoint_name)

## Part 6: Make Predictions and Plot Results 

In [None]:
# Forecast for the first site. The history sent to the endpoint runs from 30
# days after start_date up to the end of the training window; the offset is a
# timedelta because a bare integer cannot be added to the timestamp.
# Quantiles may be given as numbers: predict() converts them to strings.
predictor.predict(ts=daily_power_consumption_per_site[0][start_date+datetime.timedelta(30):end_training],quantiles=[0.1, 0.5, 0.9]).head()

In [None]:
def plot(
    predictor,
    target_ts,
    end_training=end_training,
    plot_weeks=12,
    confidence=80
):
    """Plots the actual values of `target_ts` around the end of training together
    with the prediction band returned by `predictor`.

    predictor -- object whose `predict(ts=..., quantiles=...)` returns the quantile forecasts
    target_ts -- `pandas.Series` object holding the actual values
    end_training -- last timestamp of the history sent to the predictor
    plot_weeks -- number of weeks of history drawn before `end_training` (default: 12)
    confidence -- width of the shaded band in percent, centred on the median (default: 80)
    """
    print(f"Calling served model to generate predictions starting from {end_training} to {end_training+datetime.timedelta(prediction_length)}")

    # Half of the band on either side of the median, e.g. 80 -> 0.4.
    half_band = confidence * 0.005
    low_quantile = 0.5 - half_band
    up_quantile = half_band + 0.5

    history_days = plot_weeks * 7
    horizon = datetime.timedelta(prediction_length)

    plt.figure(figsize=(20, 3))
    ax = plt.subplot(1, 1, 1)

    history = target_ts[:end_training]
    prediction = predictor.predict(ts=history, quantiles=[low_quantile, 0.5, up_quantile])

    # Actual values from `history_days` before the end of training up to the
    # end of the forecast horizon.
    window_start = end_training - datetime.timedelta(history_days)
    window_end = end_training + horizon
    target_section = target_ts[window_start:window_end]
    target_section.plot(color="black", label='Actual')

    # The prediction columns are keyed by the string form of each quantile.
    lower_band = prediction[str(low_quantile)]
    upper_band = prediction[str(up_quantile)]
    ax.fill_between(
        lower_band.index,
        lower_band.values,
        upper_band.values,
        color="b", alpha=0.3, label='{}% confidence interval'.format(confidence)
    )
    ax.legend(loc=2)

    # Leave head-room above and below the actual values.
    ax.set_ylim(target_section.min() * 0.5, target_section.max() * 1.5)

In [None]:
# Position of the site to plot within daily_power_consumption_per_site.
site_id = 33
# 0 weeks of history: the plotted actuals start at the end of training.
plot_weeks = 0
confidence = 80
# The series is cut to start 30 days after start_date before it is handed to plot().
plot(
        predictor,
        target_ts=daily_power_consumption_per_site[site_id][start_date+datetime.timedelta(30):],
        plot_weeks=plot_weeks,
        confidence=confidence
    )

## Calculate some objective statistics regarding accuracy of our model

RMSE measures the "root mean square" error. It penalises more extreme "misses" and rewards consistency.
It also has the advantage that its magnitude is expressed in the same units as the value being predicted.
MAPE measures "Mean Absolute Percentage Error". The main reason to use MAPE is that it scores errors in
percentage terms rather than as absolutes. Hence a prediction of 11 for a value of 10 is treated identically
to a prediction of 90 for a value of 100 (both are a 10% error).

In [None]:
# Collect, for every series, the summed median ('0.5') forecast returned by the
# endpoint when it is given the history from 30 days after start_date up to the
# end of training.
history_start = start_date + datetime.timedelta(30)
predictions = [
    predictor.predict(ts=ts[history_start:end_training])['0.5'].sum()
    for ts in daily_power_consumption_per_site
]

# Actual usage over the 30 days that follow the end of training.
actual_from = end_training + datetime.timedelta(1)
actual_to = end_training + datetime.timedelta(30)
usages = []
for ts in daily_power_consumption_per_site:
    usages.append(ts[actual_from:actual_to].sum())

for p, u in zip(predictions, usages):
    print(f'Predicted {p} kwh but usage was {u} kwh,')

In [None]:
def mape(y_true, y_pred):
    """Returns the Mean Absolute Percentage Error of `y_pred` against `y_true`.

    y_true -- sequence of actual values
    y_pred -- sequence of predicted values (broadcast against `y_true`)

    A percentage error is undefined where the actual value is 0, so those
    positions are left out of the mean instead of turning the whole result
    into inf/nan.

    Return value: the error in percent, or NaN when there is no non-zero
    actual value to compare against
    """
    y_true, y_pred = np.broadcast_arrays(
        np.asarray(y_true, dtype=float),
        np.asarray(y_pred, dtype=float)
    )
    valid = y_true != 0
    if not valid.any():
        return float("nan")
    actual = y_true[valid]
    return np.mean(np.abs((actual - y_pred[valid]) / actual)) * 100

In [None]:
# MAPE of the summed 30-day forecast against the actual usage, across all sites.
print(f'MAPE: {round(mape(usages, predictions),1)}%')

## Remove the Endpoint (recommended)

Running this cell deletes the endpoint. Comment the cell out if you want the endpoint to still exist after "run all".

In [None]:
# Remove the endpoint (recommended once you are done with it).
# Comment out this cell if you want the endpoint to still exist after "run all".
sagemaker.Session().delete_endpoint(endpoint_name)