In [1]:
import os
import math
import random
import numpy as np
import pandas as pd

from household import Household


# Seed both the stdlib and the NumPy global random generators with the same
# value so that stochastic steps are repeatable across runs.
SEED = 30
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

# 0 Introduction
**0.1 Goal**
* 1 step-ahead electricity consumption prediction
* single house
* consider weather and calendar effects

**0.2 Data**
* random household in London
* January 2012 to December 2013
* 30 min resolution
* hourly temperature records
* date and time information

**0.3 Recap of Visualization and Data Analysis**
* Plots of Holiday vs. weekday vs. weekend: widely varying patterns
* Time-series plots: data has sharp peaks
* Box plots vs. time and temperature: the occurrence of these peaks has no visible pattern, and they are diagnosed as outliers
* Auto-correlation and partial auto-correlation: auto-regressors with lags of 48k±3, for k from 0 to 7, are informative
* seasonal differences cannot be easily detected


**0.4 Summary**
* Need different models for weekday/weekend/holiday
* Predicting the peaks can be challenging or impossible
* AR models with lagged versions of the time-series as features are suitable for the task

**0.5 In This Notebook**
* One-step-ahead consumption prediction of a fixed household
* Feature selection problem: determining suitable auto-regressors 
* Evaluating fit results
* Studying the effect of day part (morning/noon/...) on prediction error

**0.6 General Setup**
* Single fixed household
* Only Tuesdays that were not a holiday (see section 1)
* Dependent variable (target): one-step-ahead consumption
* Regressors (features): constant / hour of day (x,y) / day of year (x,y) / temperature / auto-regressors(delayed versions)
* The same model is used for all parts of Tuesdays (morning/noon/...) and all seasons

**Setup is the same in all sections unless specified**
**  **

# 1 Preparing The Data for Regression

**1.1 Cleaning**
* interpolate missing values
* merge consumption, weather, and calendar data to a single dataframe
* no outlier removal method 
* convert time of day and day of year to x,y
* min-max scaling to [-1, 1]

**1.2 Filtering the data**
* fix one day, e.g. Thursday
* remove all holidays

**1.3 Constructing regression dataframe**
* dependent variable: consumption at time interval t
* regressors: temperature, time of day (x,y), day of year (x,y), auto-regressors (delayed versions of the signal)
* dealing with holidays: if holidays appear among the auto-regressors, replace them with the previous day. If the previous day is a weekend, a holiday, or already in the features, keep going back 1 day at a time.

**1.4 Describing the options**
* filt_days: the day of week for which we are training a model
* remove_holiday, replacement_method: whether public holidays should be removed (because they differ from normal days), and how to replace them (defaults: True, week_before)
* feat_cols: which temporal features are used in the regression dataframe
* dayparts: can select different parts of day to fit a specific model for them, e.g. only for nights (default is [] for selecting the whole day)
* lags: autoregressive features to be used
* step_ahead: prediction horizon, e.g. step_ahead=1 predicts the next time step (30 min) using data up to now, step_ahead=2 predicts 2 steps into the future (1 hour) using data up to now, ...

In [2]:
# Prediction horizon, expressed in 30-minute time steps.
step_ahead = 1

# Candidate auto-regressive lags: for each of the 8 multiples of 48 steps,
# take the offsets +1, +2, +3 and +48 from that multiple.
lags = []
for i in np.arange(8):
    lags.extend(48 * i + offset for offset in (1, 2, 3, 48))
# Keep only lags that are at least as long as the horizon, and shift the
# remaining ones by (step_ahead - 1).
lags = [lag - (step_ahead - 1) for lag in lags if not lag < step_ahead]

# Options used when building the regression dataframe.
options = {
    # day(s) of the week for which the model is trained
    "filt_days": ['Tuesday'],
    # whether public holidays are removed
    "remove_holiday": True,
    # temporal / weather feature columns included as regressors
    "feat_cols": [
        'hourofd_x',
        'hourofd_y',
        'dayofy_x',
        'dayofy_y',
        'temperature_hourly',
    ],
    # how removed holidays are replaced
    "replacement_method": 'week_before',
    # parts of the day to model; an empty list selects the whole day
    "dayparts": [],
}

In [3]:
# Identify the household to study: its meter id and the data block it is stored in.
block = 77
house_id = "MAC000068"

# Build the household object, then prepare its dataset using the candidate
# lags, prediction horizon and regression options chosen in the previous cell.
household = Household(block_num=block, house_id=house_id)
household.construct_dataset(
    options=options,
    step_ahead=step_ahead,
    lags=lags,
)

number of holidays
25.0

   ***   DATA IS READY FOR USE   ***



In [4]:
# Read the households metadata table from the "input" folder under the current
# working directory, then print the row(s) whose LCLid matches this household.
path = os.getcwd() + "/input/informations_households.csv.xls"
data = pd.read_csv(path)
info = data[data["LCLid"] == household.house_id]
print(info)

          LCLid stdorToU    Acorn Acorn_grouped      file
3871  MAC000068      Std  ACORN-L     Adversity  block_77


# 2 Fit Linear Regression Model

In [5]:
# Split the prepared dataset into train and test parts (test_size=0.25).
household.train_test_split(test_size=0.25)

# Fit the personal model and report the evaluation measures on both splits.
# NOTE(review): the section title says linear regression while the method is
# 'Adam' -- presumably the optimizer used for the fit; confirm against
# Household.fit_personal_model.
method = 'Adam'
household.fit_personal_model(method=method)
res = household.evaluate_model(
    method=method,
    measures=['MSE', 'MAE', 'R2', 'Adjr2', 'AIC'],
    verbose=True,
)

Mean squared error:  train 0.20, test 0.20
Mean absolute error: train 0.30, test 0.31
Coefficient of determination (R2): train 0.46, test 0.42
Adjusted coeff. of determination:  train 0.46, test 0.42
AIC: train 60.65, test 62.76


# 3 Comparing Prediction Accuracy for Different Horizons