<p>
    <img src="https://s3.amazonaws.com/iotanalytics-templates/Logo.png" style="float:left;">
    <h1 style="color:#1A5276;padding-left:115px;padding-bottom:0px;font-size:28px;">AWS IoT Analytics | Smart Building Energy Consumption</h1>
</p>
<p style="color:#1A5276;padding-left:90px;padding-top:0px;position:relative;font-style:italic;font-size:18px">
Application of Bench-marking Feature Engineering and AutoGluon to automate the model training for building energy consumption prediction.   
</p>

## Set-up: Import Required Notebook Libraries

In [1]:
#This notebook uses holidays package

try:
    import holidays
    import lightgbm as lgb
except:
    !pip install holidays
    import holidays
    !pip install lightgbm
    import lightgbm as lgb

In [4]:
import pandas as pd
import numpy as np
import boto3
import os
import sys
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc
from sklearn.metrics import mean_squared_error


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

In [5]:
import warnings

# Ignore warnings whose message starts with one of these numpy texts.
for noisy_message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message=noisy_message)

# Ignore every FutureWarning, whatever its message or origin.
warnings.simplefilter("ignore", category=FutureWarning)

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 1  |  Load Data from IoTAnalytics</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

In [6]:
# Configuration for the data-loading step.
# `dataset` is the IoT Analytics dataset name requested by the loading cell.
dataset = "jh_demo_batch_ml_dataset"
# NOTE(review): `use_sample_dataset` is not read anywhere in the visible notebook.
use_sample_dataset = True

# An IoT Analytics client is needed before any dataset content can be fetched.
client = boto3.client(service_name='iotanalytics')

In [None]:
# Optional local copy of the training data. The loading cell that follows
# always rebinds `train_df` (from IoT Analytics or from the backup CSV), so a
# missing "train.csv" must not abort a Restart & Run All: read it only if present.
if os.path.exists("train.csv"):
    train_df = pd.read_csv("train.csv")

Now we can get the data location (URL) for the given dataset and start working with the data. (To call `get_dataset_content`, you must first grant the corresponding AWS IoT Analytics IAM permission.)

In [7]:
# Import the target data set from the AWS IoT Analytics service.
# If anything in the remote path fails (service call, download, empty result,
# missing columns) fall back to the local backup CSV. The failure is reported
# rather than silently swallowed, and only `Exception` subclasses are caught so
# that KeyboardInterrupt / SystemExit still stop the cell.
try:
    dataset_url = client.get_dataset_content(datasetName=dataset)['entries'][0]['dataURI']
    train_df = pd.read_csv(dataset_url, parse_dates=True)
    if train_df.empty:
        raise Exception('No data found')

    # start working with the data: remove unnecessary columns
    drop_col = ['city', 'country', 'state', '__dt']
    train_df = train_df.drop(columns=drop_col)

# use backup dataset if dataset not found
except Exception as load_error:
    print("Could not load dataset {!r} from IoT Analytics ({!r}); "
          "falling back to the local backup CSV.".format(dataset, load_error))
    train_df = pd.read_csv('jh_demo_batch_ml_train_dataset.csv', parse_dates=True)
    # the backup file is dropped without the '__dt' column
    drop_col = ['city', 'country', 'state']
    train_df = train_df.drop(columns=drop_col)

In [8]:
# Preview the first rows of the loaded frame (head() shows five rows by default).
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,8,0,2016-10-21 03:00:00,363.805,0,Education,60809,2003.0,,21.7,0.0,18.3,0.0,1012.5,350.0,2.6
1,8,0,2016-10-31 00:00:00,404.417,0,Education,60809,2003.0,,23.9,6.0,17.8,0.0,1018.5,60.0,4.6
2,8,0,2016-11-10 09:00:00,500.317,0,Education,60809,2003.0,,16.7,2.0,15.6,0.0,1018.5,360.0,1.5
3,8,0,2016-11-14 01:00:00,495.88,0,Education,60809,2003.0,,21.7,,16.1,0.0,1017.5,120.0,3.1
4,8,0,2016-09-07 00:00:00,382.916,0,Education,60809,2003.0,,27.2,4.0,21.1,0.0,1020.5,60.0,4.6


In [9]:
# Number of rows loaded, before any feature engineering.
len(train_df)

1010025

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 2  |  Feature Engineering</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

## (1) Weathertransformer

#### Add missing time-series rows by finding the start_date–end_date range
#### Then fill in the missing weather data: air_temperature, cloud_coverage, dew_temperature, sea_level_pressure, wind_direction, wind_speed, precip_depth_1_hr

In [7]:
from weathertranformer import WeatherTranformer

In [8]:
# Build the weather transformer, then fit and apply it in one pass.
# NOTE(review): the meaning of the positional `True` is defined in
# weathertranformer.py and is not visible from this notebook — confirm.
weather_transformer = WeatherTranformer(True)
train_df = weather_transformer.fit_transform(train_df)

## (2) Smoothing Filter

#### Smooth air and dew temperature

In [9]:
from SGFilter import SGFilterTranformer

In [10]:
# Build the smoothing transformer, then fit and apply it in one pass.
# NOTE(review): the meaning of the positional `True` is defined in SGFilter.py
# and is not visible from this notebook — confirm.
sg_filter_transformer = SGFilterTranformer(True)
train_df = sg_filter_transformer.fit_transform(train_df)

In [11]:
# Preview the frame after smoothing; the recorded output shows the added
# air_smooth / dew_smooth / *_diff / *_diff2 columns.
train_df.head()

Unnamed: 0,site_id,building_id,meter,timestamp,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,...,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,air_smooth,dew_smooth,air_diff,dew_diff,air_diff2,dew_diff2
0,0,4.0,0.0,2016-03-27 02:00:00,0.0,Education,116607.0,1975.0,,22.8,...,-1.0,1021.5,80.0,2.6,21.065734,17.720979,0.403147,-1.767622,0.038462,0.43007
1,0,4.0,0.0,2016-04-12 00:00:00,0.0,Education,116607.0,1975.0,,21.7,...,0.0,1021.0,100.0,6.2,21.488112,16.168392,0.441608,-1.337552,0.038462,0.43007
2,0,4.0,0.0,2016-04-18 09:00:00,0.0,Education,116607.0,1975.0,,16.7,...,-1.0,1022.5,360.0,4.6,21.948951,15.045874,0.48007,-0.907483,0.038462,0.43007
3,0,4.0,0.0,2016-04-28 23:00:00,0.0,Education,116607.0,1975.0,,30.0,...,-1.0,1013.5,120.0,2.6,22.448252,14.353427,0.518531,-0.477413,0.038462,0.43007
4,0,4.0,0.0,2016-05-06 02:00:00,0.0,Education,116607.0,1975.0,,20.6,...,0.0,1009.0,280.0,4.1,22.986014,14.091049,0.556993,-0.047343,0.038462,0.43007


## (3) Rolling Window

#### Calculate mean, max, min and std within a rolling window of 24 (the `*_lag24` columns)

In [12]:
from Rollwindow import RollwinTranformer

In [13]:
# Build the rolling-window transformer, then fit and apply it in one pass.
# NOTE(review): 24 presumably is the window size (the resulting columns are
# suffixed `_lag24`); the meaning of the leading `True` is defined in
# Rollwindow.py and is not visible here — confirm.
rolling_transformer = RollwinTranformer(True, 24)
train_df = rolling_transformer.fit_transform(train_df)

In [14]:
# Preview the frame after the rolling-window step; the recorded output shows
# the added *_mean/_max/_min/_std_lag24 columns.
train_df.head()

Unnamed: 0,site_id,building_id,meter,timestamp,meter_reading,primary_use,square_feet,year_built,floor_count,air_temperature,...,sea_level_pressure_min_lag24,sea_level_pressure_std_lag24,wind_direction_mean_lag24,wind_direction_max_lag24,wind_direction_min_lag24,wind_direction_std_lag24,wind_speed_mean_lag24,wind_speed_max_lag24,wind_speed_min_lag24,wind_speed_std_lag24
0,0,4.0,0.0,2016-03-27 02:00:00,0.0,Education,116607.0,1975.0,,22.8,...,1021.5,,80.0,80.0,80.0,,2.599609,2.599609,2.599609,
1,0,4.0,0.0,2016-04-12 00:00:00,0.0,Education,116607.0,1975.0,,21.7,...,1021.0,0.353516,90.0,100.0,80.0,14.140625,4.398438,6.199219,2.599609,2.544922
2,0,4.0,0.0,2016-04-18 09:00:00,0.0,Education,116607.0,1975.0,,16.7,...,1021.0,0.763672,180.0,360.0,80.0,156.25,4.464844,6.199219,2.599609,1.803711
3,0,4.0,0.0,2016-04-28 23:00:00,0.0,Education,116607.0,1975.0,,30.0,...,1013.5,4.128906,165.0,360.0,80.0,131.0,4.0,6.199219,2.599609,1.743164
4,0,4.0,0.0,2016-05-06 02:00:00,0.0,Education,116607.0,1975.0,,20.6,...,1009.0,5.949219,188.0,360.0,80.0,124.5625,4.019531,6.199219,2.599609,1.510742


## (4) Numerical Features

#### Feature transform for Numerical Features

In [15]:
from NumericalEng import NumericalTransformer

In [16]:
# Build the numerical-feature transformer, then fit and apply it in one pass.
# NOTE(review): the seven positional flags are options of NumericalTransformer
# (NumericalEng.py); which flag controls which feature is not visible here —
# confirm against that module (keyword arguments would make this readable).
numerical_transformer = NumericalTransformer(True, True, False, True, True, True, True)
train_df = numerical_transformer.fit_transform(train_df)

# Preview the result; the recorded output shows new columns such as
# floor_area, hour, weekend, week, month and yr_built.
train_df.head(n=5)

Unnamed: 0,site_id,building_id,meter,timestamp,meter_reading,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,...,wind_speed_mean_lag24,wind_speed_max_lag24,wind_speed_min_lag24,wind_speed_std_lag24,floor_area,hour,weekend,week,month,yr_built
0,2,180.0,1.0,2016-01-01,20.3274,Education,10.109973,15.6,6.0,-5.6,...,2.488281,6.199219,0.0,1.701172,,0,4,53,1,
1,2,257.0,0.0,2016-01-01,43.15,Education,11.233886,15.6,6.0,-5.6,...,5.070312,8.796875,1.5,2.314453,,0,4,53,1,
2,14,1302.0,0.0,2016-01-01,115.797,Office,11.412132,3.976946,1.291339,-3.134731,...,5.078125,11.296875,0.5,2.746094,,0,4,53,1,
3,2,190.0,0.0,2016-01-01,118.48,Education,11.828035,15.6,6.0,-5.6,...,3.091797,6.199219,0.0,1.69043,,0,4,53,1,63.0
4,13,1142.0,1.0,2016-01-01,0.0,Office,11.549441,-8.3,8.0,-12.2,...,4.167969,12.398438,0.5,2.511719,,0,4,53,1,


## (4) Holidays Features

### Add one feature to state if that day is public holiday or not


In [17]:
from HolidayFea import HolidayTranformer

In [18]:
# Build the holiday transformer, then fit and apply it in one pass.
# NOTE(review): the meaning of the positional `True` is defined in
# HolidayFea.py and is not visible from this notebook — confirm.
holiday_transformer = HolidayTranformer(True)
train_df = holiday_transformer.fit_transform(train_df)

# Preview the result; the recorded output shows the added IsHoliday column.
train_df.head(n=5)

Unnamed: 0,site_id,building_id,meter,meter_reading,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,...,wind_speed_max_lag24,wind_speed_min_lag24,wind_speed_std_lag24,floor_area,hour,weekend,week,month,yr_built,IsHoliday
0,2,180.0,1.0,20.3274,Education,10.109973,15.6,6.0,-5.6,0.0,...,6.199219,0.0,1.701172,,0,4,53,1,,1
1,2,257.0,0.0,43.15,Education,11.233886,15.6,6.0,-5.6,0.0,...,8.796875,1.5,2.314453,,0,4,53,1,,1
2,14,1302.0,0.0,115.797,Office,11.412132,3.976946,1.291339,-3.134731,0.0,...,11.296875,0.5,2.746094,,0,4,53,1,,1
3,2,190.0,0.0,118.48,Education,11.828035,15.6,6.0,-5.6,0.0,...,6.199219,0.0,1.69043,,0,4,53,1,63.0,1
4,13,1142.0,1.0,0.0,Office,11.549441,-8.3,8.0,-12.2,-0.202532,...,12.398438,0.5,2.511719,,0,4,53,1,,1


## (5) Label Encoding for Primary use

### Encode the categorical `primary_use` feature as integer labels

In [19]:
from LabelEncode import CategoricalTransformer

In [20]:
# Discard rows that have no building_id. Rebinding instead of using
# `inplace=True` avoids mutating a frame that earlier cells already displayed.
train_df = train_df.dropna(subset=['building_id'])


In [21]:
# Build the categorical transformer with its defaults, then fit and apply it.
# The preview recorded below this cell shows `primary_use` as integer codes.
categorical_transformer = CategoricalTransformer()
train_df = categorical_transformer.fit_transform(train_df)

In [22]:
# Preview the first five rows after categorical encoding.
train_df.head(5)

Unnamed: 0,site_id,building_id,meter,meter_reading,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,...,wind_speed_max_lag24,wind_speed_min_lag24,wind_speed_std_lag24,floor_area,hour,weekend,week,month,yr_built,IsHoliday
0,2,180.0,1.0,20.3274,0,10.109973,15.6,6.0,-5.6,0.0,...,6.199219,0.0,1.701172,,0,4,53,1,,1
1,2,257.0,0.0,43.15,0,11.233886,15.6,6.0,-5.6,0.0,...,8.796875,1.5,2.314453,,0,4,53,1,,1
2,14,1302.0,0.0,115.797,6,11.412132,3.976946,1.291339,-3.134731,0.0,...,11.296875,0.5,2.746094,,0,4,53,1,,1
3,2,190.0,0.0,118.48,0,11.828035,15.6,6.0,-5.6,0.0,...,6.199219,0.0,1.69043,,0,4,53,1,63.0,1
4,13,1142.0,1.0,0.0,6,11.549441,-8.3,8.0,-12.2,-0.202532,...,12.398438,0.5,2.511719,,0,4,53,1,,1


## (6) Remove Outliers
### Delete outliers for each meter type, this is determined by data exploration

In [23]:
from OutlierProcess import OutlierTransformer

In [24]:
# Build the outlier transformer, then fit and apply it in one pass.
# NOTE(review): the three positional flags are options of OutlierTransformer
# (OutlierProcess.py); their individual meaning is not visible here — confirm.
outlier_transformer = OutlierTransformer(True, True, True)
train_df = outlier_transformer.fit_transform(train_df)

In [25]:
# Number of rows remaining after outlier removal.
len(train_df)

987852

## (7) Logarithm of target
### Apply `log1p` (log(1 + x)) to the target to normalize it

In [26]:
# Split the frame into the prediction target and the model inputs.
# With `target_smooth` on, the target is log1p(meter_reading); otherwise the
# raw meter_reading column is used unchanged.
target_smooth = True

target = train_df["meter_reading"]
if target_smooth:
    target = np.log1p(target)

# Everything except the target column is a feature.
features = train_df.drop(columns=['meter_reading'])

In [27]:
# Inspect the row index of `features` (the recorded output is a RangeIndex starting at 0).
features.index

RangeIndex(start=0, stop=987852, step=1)

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 3  |  Algorithm Training</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

In [28]:
## LGBM is more accurate with categorical features specified first.
categorical_features = ["building_id", "site_id", "meter", "primary_use",'hour','week','month', 'yr_built',"weekend",'IsHoliday']

# LightGBM training parameters: gradient-boosted regression scored by RMSE.
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

# 3-fold cross-validation; one model is trained per fold and kept in `models`.
kf = KFold(n_splits=3)
models = []
for train_index, test_index in kf.split(features):
    # KFold.split yields *positional* row indices, so select with .iloc.
    # (.loc only gives the same rows while the index happens to be a
    # 0..n-1 RangeIndex, and would pick wrong/missing rows otherwise.)
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]

    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]

    d_training = lgb.Dataset(train_features, label=train_target, categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target, categorical_feature=categorical_features, free_raw_data=False)

    # Up to 1000 boosting rounds, logging every 25 rounds and stopping once the
    # validation score has not improved for 40 rounds.
    # NOTE(review): LightGBM 4.0 removed the `verbose_eval` and
    # `early_stopping_rounds` keyword arguments in favour of the
    # `lgb.log_evaluation` / `lgb.early_stopping` callbacks; the setup cell
    # installs an unpinned lightgbm, so confirm the installed version accepts
    # this call.
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training, d_test], verbose_eval=25, early_stopping_rounds=40)

    models.append(model)



Training until validation scores don't improve for 40 rounds
[25]	training's rmse: 1.02375	valid_1's rmse: 1.24761
[50]	training's rmse: 0.743092	valid_1's rmse: 1.1101
[75]	training's rmse: 0.633227	valid_1's rmse: 1.08794
[100]	training's rmse: 0.565382	valid_1's rmse: 1.08567
[125]	training's rmse: 0.513393	valid_1's rmse: 1.08638
[150]	training's rmse: 0.476178	valid_1's rmse: 1.08971
Early stopping, best iteration is:
[110]	training's rmse: 0.542913	valid_1's rmse: 1.08455
Training until validation scores don't improve for 40 rounds
[25]	training's rmse: 0.995052	valid_1's rmse: 1.31148
[50]	training's rmse: 0.708468	valid_1's rmse: 1.16146
[75]	training's rmse: 0.603457	valid_1's rmse: 1.11822
[100]	training's rmse: 0.541365	valid_1's rmse: 1.10383
[125]	training's rmse: 0.494964	valid_1's rmse: 1.09339
[150]	training's rmse: 0.459035	valid_1's rmse: 1.08776
[175]	training's rmse: 0.430097	valid_1's rmse: 1.08381
[200]	training's rmse: 0.407785	valid_1's rmse: 1.08235
[225]	train

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 4  |  Model Packaging and Result Sharing</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

In [29]:
import joblib

# Only the first fold's model is written to disk.
model_path = 'lgb.pkl'
joblib.dump(models[0], model_path)

# Read the file straight back to confirm the saved model can be loaded.
gbm_pickle = joblib.load(model_path)

In [30]:
# Average the best validation RMSE over all cross-validation models.
# `best_score` maps dataset name -> {metric name -> value}; the training log
# names the held-out fold "valid_1" and the metric "rmse", so read those keys
# explicitly instead of relying on positional order.
fold_scores = [fold_model.best_score['valid_1']['rmse'] for fold_model in models]
avg_score = sum(fold_scores, 0.)

# Divide by the number of models actually trained rather than a hard-coded 3,
# so the average stays correct if the number of folds changes.
final_avg = avg_score / len(models) if models else float('nan')

In [31]:
## Collect the metrics that are displayed in the notebook and later sent to S3.
## Each value is a one-element list so the dict converts to a one-row table.
data = {}
data['model_name'] = ['lightGBM']
data['avg_rmse accuracy'] = [final_avg]

In [None]:
# Turn the metrics dict into a DataFrame with a fixed column order, then display it.
metric_columns = ['model_name', 'avg_rmse accuracy']
metric_df = pd.DataFrame(data, columns=metric_columns)
metric_df

In [38]:
from io import StringIO
# NOTE: this rebinds the name `datetime` from the module (imported in the
# set-up cell) to the `datetime.datetime` class for the rest of the notebook.
from datetime import datetime

# Destination bucket and object keys; uploads are grouped under a per-day prefix.
bucket = 'check-ride-data-explore'
csv_key = 'training_accuracy.csv'
pickle_key = 'lgb.pkl'
prefix = 'training_model_' + datetime.now().strftime('%Y-%m-%d') + "/"

s3 = boto3.resource('s3')

# Upload the saved model. The file is opened in a `with` block so the handle
# is closed once the upload finishes instead of being leaked.
with open(pickle_key, 'rb') as model_file:
    upload_response = s3.Object(bucket, prefix + pickle_key).put(Body=model_file)

# Display the S3 response as the cell output.
upload_response

{'ResponseMetadata': {'RequestId': '8WFK6ZAQ6NFS8R1R',
  'HostId': 'UCblj0c3KdQwOVjJD70tGN19nx5mm3dAOsS4O0QDgv4oTHLP4TABqVtPtR3n47wYTS9wdQocDvY=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'UCblj0c3KdQwOVjJD70tGN19nx5mm3dAOsS4O0QDgv4oTHLP4TABqVtPtR3n47wYTS9wdQocDvY=',
   'x-amz-request-id': '8WFK6ZAQ6NFS8R1R',
   'date': 'Tue, 25 Aug 2020 19:36:27 GMT',
   'etag': '"ed014cd63cbac1b182fff851d42e11ea"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"ed014cd63cbac1b182fff851d42e11ea"'}

In [39]:
# Serialize the metrics table to CSV in memory.
csv_buffer = StringIO()
metric_df.to_csv(csv_buffer)
csv_body = csv_buffer.getvalue()

# Upload the CSV text under the same dated prefix; the S3 response is the cell output.
csv_object = s3.Object(bucket, prefix + csv_key)
csv_object.put(Body=csv_body)

{'ResponseMetadata': {'RequestId': '0B8AD1B6A0A6299D',
  'HostId': 'y0L/oXYRJ07nVNk1+ZZqayf35JptsvSQPnwZeQSOiAXIUNRWMw3x0cL3wYQNgLDFrSkhYcLRl2Y=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'y0L/oXYRJ07nVNk1+ZZqayf35JptsvSQPnwZeQSOiAXIUNRWMw3x0cL3wYQNgLDFrSkhYcLRl2Y=',
   'x-amz-request-id': '0B8AD1B6A0A6299D',
   'date': 'Tue, 25 Aug 2020 19:36:28 GMT',
   'etag': '"8649373772fd4a8b0d24edbe141eaf9f"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"8649373772fd4a8b0d24edbe141eaf9f"'}

<div style="height:60px;"><div style="height:7px;background-color:#20B3CD;width:100%;margin-top:20px;position:relative;"><img src="https://s3.amazonaws.com/iotanalytics-templates/Logo.png" style="height:50px;width:50px;margin-top:-20px;position:absolute;margin-left:42%;"></div></div>