<p>
    <img src="https://s3.amazonaws.com/iotanalytics-templates/Logo.png" style="float:left;">
    <h1 style="color:#1A5276;padding-left:115px;padding-bottom:0px;font-size:28px;">AWS IoT Analytics | Smart Building Energy Consumption</h1>
</p>
<p style="color:#1A5276;padding-left:90px;padding-top:0px;position:relative;font-style:italic;font-size:18px">
Application of benchmark feature engineering and LightGBM to automate model training for building energy consumption prediction.
</p>

## Set-up: Import Required Notebook Libraries

In [1]:
#This notebook uses holidays package

try:
    import holidays
    import lightgbm as lgb
except:
    !pip install holidays
    import holidays
    !pip install lightgbm
    import lightgbm as lgb

In [2]:
import pandas as pd
import numpy as np
import boto3
import os
import sys
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc
from sklearn.metrics import mean_squared_error


from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline 

In [3]:
import warnings

# Ignore warnings whose message starts with one of these numpy notices
# (the `message` argument is matched as a regex against the start of the text).
for _ignored_message in ("numpy.dtype size changed", "numpy.ufunc size changed"):
    warnings.filterwarnings("ignore", message=_ignored_message)

# Ignore every FutureWarning, regardless of message or module.
warnings.simplefilter(category=FutureWarning, action='ignore')

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 1  |  Load Data from IoTAnalytics</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

In [4]:
# Name of the AWS IoT Analytics dataset to read the training data from.
dataset = "blogdemo_iot_highreso_dataset"
# NOTE(review): this flag is not referenced in the visible cells — confirm whether it is still needed.
use_sample_dataset = True

# An IoT Analytics client must exist before any dataset content can be requested.
client = boto3.client('iotanalytics')

Now we can get the data location (URL) for the given dataset and start working with the data. (To call `get_dataset_content`, you need to grant the corresponding AWS IoT Analytics IAM permission.)

In [5]:
# Import the target data set from the AWS IoT Analytics service.
# The response lists the available content under 'entries'; the first entry's
# 'dataURI' is the location that is read below.
dataset_content = client.get_dataset_content(datasetName=dataset)
entries = dataset_content.get('entries') or []
if not entries:
    # Fail with a clear message instead of an opaque IndexError/KeyError.
    raise ValueError('No data found')
dataset_url = entries[0]['dataURI']

# NOTE(review): parse_dates=True without index_col only attempts to parse the
# index, so the `timestamp` column is presumably still read as text — confirm
# the downstream transformers convert it themselves.
train_df = pd.read_csv(dataset_url, parse_dates=True)
if train_df.empty:
    raise ValueError('No data found')

# Start working with the data: remove unnecessary columns.
# Rebinding (rather than inplace=True) keeps the cell free of hidden in-place mutation.
drop_col = ['city', 'country', 'state', '__dt']
train_df = train_df.drop(columns=drop_col)

In [6]:
# Preview the first rows of the loaded data (after the unused columns were dropped).
train_df.head()

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,1241,1,2016-08-08 05:00:00,1755.09,14,Healthcare,194188,,,21.1,,17.8,0.0,1014.0,340.0,1.5
1,1241,0,2016-08-07 17:00:00,0.0,14,Healthcare,194188,,,28.3,0.0,15.6,0.0,1012.0,,
2,1241,2,2016-08-09 18:00:00,729.639,14,Healthcare,194188,,,28.9,,18.3,0.0,1022.0,0.0,0.0
3,1241,0,2016-08-12 19:00:00,0.0,14,Healthcare,194188,,,35.0,0.0,22.8,0.0,1010.5,230.0,4.1
4,1241,2,2016-07-23 04:00:00,580.037,14,Healthcare,194188,,,27.8,0.0,21.7,0.0,1010.0,240.0,2.6


<h1 style="color:#20B3CD;font-size:20px;float:left">Step 2  |  Feature Engineering</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

## (1) Weathertransformer

#### Add missing time-series rows by finding the start and end dates
#### Then fill in missing weather data: air temperature, cloud coverage, dew temperature, sea-level pressure, wind direction, wind speed, and precipitation depth

In [7]:
from weathertranformer import WeatherTranformer

In [8]:
# Fit and apply the project-local weather transformer; train_df is rebound to its output.
# NOTE(review): the meaning of the positional `True` flag is defined in weathertranformer.py — confirm there.
train_df = WeatherTranformer(True).fit_transform(train_df)

## (2) Smoothing Filter

#### Smooth air and dew temperature

In [9]:
from SGFilter import SGFilterTranformer

In [10]:
# Fit and apply the project-local smoothing filter; per the section text it smooths air and dew temperature.
# NOTE(review): the meaning of the positional `True` flag is defined in SGFilter.py — confirm there.
train_df = SGFilterTranformer(True).fit_transform(train_df)

## (3) Rolling Window

#### Calculate the min, max, and std within a rolling window of 24

In [11]:
from Rollwindow import RollwinTranformer

In [12]:
# Fit and apply the project-local rolling-window transformer.
# NOTE(review): 24 is presumably the window length mentioned in the section text and
# `True` an enable flag; both are defined in Rollwindow.py — confirm there.
train_df = RollwinTranformer(True,24).fit_transform(train_df)   

## (4) Numerical Features

#### Feature transform for Numerical Features

In [13]:
from NumericalEng import NumericalTransformer

In [14]:
# Fit and apply the project-local numerical feature transformer.
# NOTE(review): the seven positional booleans are opaque here (the third is the only one
# disabled); their meanings are defined in NumericalEng.py — confirm there.
train_df = NumericalTransformer(True, True, False, True, True, True, True).fit_transform(train_df)

## (4) Holidays Features

### Add a feature indicating whether the day is a public holiday


In [15]:
from HolidayFea import HolidayTranformer

In [16]:
# Fit and apply the project-local holiday transformer; per the section text it adds a public-holiday feature.
# NOTE(review): the meaning of the positional `True` flag is defined in HolidayFea.py — confirm there.
train_df = HolidayTranformer(True).fit_transform(train_df)


## (5) Label Encoding for Primary Use

### Encode the categorical `primary_use` feature as integer labels

In [17]:
from LabelEncode import CategoricalTransformer

In [18]:
# Discard rows that have no building_id.
# Rebinding (rather than inplace=True) avoids mutating the frame in place.
train_df = train_df.dropna(subset=['building_id'])


In [19]:
# Fit and apply the project-local categorical transformer with its default settings.
# NOTE(review): the set of columns it encodes is defined in LabelEncode.py — confirm there.
train_df = CategoricalTransformer().fit_transform(train_df)

In [20]:
# Inspect the first five rows after the feature-engineering steps so far.
train_df.head(5)

Unnamed: 0,site_id,building_id,meter,meter_reading,primary_use,square_feet,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,...,wind_speed_max_lag24,wind_speed_min_lag24,wind_speed_std_lag24,floor_area,hour,weekend,week,month,yr_built,IsHoliday
0,1,144.0,3.0,9.7989,6,10.962562,3.8,0.0,2.4,0.0,...,8.796875,0.0,2.451172,11534.6,0,4,53,1,,1
1,4,618.0,0.0,83.6,0,11.905191,5.74382,1.146067,-5.159551,0.0,...,6.199219,0.0,1.8125,29606.4,0,4,53,1,96.0,1
2,14,1285.0,2.0,951.811,3,11.775813,3.976946,1.291339,-3.134731,0.0,...,7.199219,0.0,1.969727,,0,4,53,1,,1
3,6,776.0,2.0,0.0,6,11.558549,6.961111,0.163934,1.0875,0.0,...,5.816406,0.0,1.617188,,0,4,53,1,,1
4,2,158.0,0.0,19.49,6,9.223849,15.6,6.0,-5.6,0.0,...,4.101562,0.0,1.358398,,0,4,53,1,,1


## (6) Remove Outliers
### Remove outliers for each meter type, as determined by data exploration

In [21]:
from OutlierProcess import OutlierTransformer

In [22]:
# Fit and apply the project-local outlier transformer.
# NOTE(review): the three positional `True` flags are defined in OutlierProcess.py — confirm
# which outlier rule each one enables.
train_df = OutlierTransformer(True,True,True).fit_transform(train_df)

In [23]:
# Number of rows left after the outlier step.
len(train_df)

987852

## (7) Logarithm of target
### Apply the `log1p` function to normalize the target

In [24]:
# Switch: train on log1p(meter_reading) when True, on the raw reading otherwise.
target_smooth = True

# log1p maps a reading r to log(1 + r); np.expm1 inverts it.
target = np.log1p(train_df["meter_reading"]) if target_smooth else train_df["meter_reading"]

# Every remaining column is a model input.
features = train_df.drop(columns=['meter_reading'])

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 3  |  Algorithm Training</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

In [25]:
## LGBM is more accurate with categorical features specified first.
categorical_features = ["building_id", "site_id", "meter", "primary_use",'hour','week','month', 'yr_built',"weekend",'IsHoliday']

# Booster hyper-parameters passed unchanged to every fold.
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1280,
    "learning_rate": 0.05,
    "feature_fraction": 0.85,
    "reg_lambda": 2,
    "metric": "rmse",
}

# 3-fold cross-validation without shuffling; one booster is kept per fold.
kf = KFold(n_splits=3)
models = []
for train_index, test_index in kf.split(features):
    # KFold.split yields positional row numbers, so rows are selected with .iloc.
    # Label-based .loc is only equivalent when the index is exactly 0..n-1.
    train_features = features.iloc[train_index]
    train_target = target.iloc[train_index]

    test_features = features.iloc[test_index]
    test_target = target.iloc[test_index]

    d_training = lgb.Dataset(train_features, label=train_target, categorical_feature=categorical_features, free_raw_data=False)
    d_test = lgb.Dataset(test_features, label=test_target, categorical_feature=categorical_features, free_raw_data=False)

    # LightGBM 4.0 removed the `verbose_eval` / `early_stopping_rounds` keywords of
    # lgb.train in favour of callbacks (available since 3.3). Use callbacks when the
    # installed version has them, otherwise fall back to the keywords. The callbacks
    # are created fresh for each fold because they carry per-run state.
    if hasattr(lgb, "log_evaluation"):
        train_kwargs = {
            "callbacks": [
                lgb.early_stopping(stopping_rounds=40),
                lgb.log_evaluation(period=25),
            ]
        }
    else:
        train_kwargs = {"verbose_eval": 25, "early_stopping_rounds": 40}

    # valid_sets order matters: with default names the held-out fold is reported as 'valid_1'.
    model = lgb.train(params, train_set=d_training, num_boost_round=1000, valid_sets=[d_training, d_test], **train_kwargs)

    models.append(model)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10025
[LightGBM] [Info] Number of data points in the train set: 658568, number of used features: 49




[LightGBM] [Info] Start training from score 4.235999
Training until validation scores don't improve for 40 rounds
[25]	training's rmse: 1.04068	valid_1's rmse: 1.33678
[50]	training's rmse: 0.753421	valid_1's rmse: 1.21479
[75]	training's rmse: 0.637799	valid_1's rmse: 1.19316
[100]	training's rmse: 0.567605	valid_1's rmse: 1.18663
[125]	training's rmse: 0.518035	valid_1's rmse: 1.18128
[150]	training's rmse: 0.479349	valid_1's rmse: 1.1791
[175]	training's rmse: 0.449355	valid_1's rmse: 1.17829
[200]	training's rmse: 0.425713	valid_1's rmse: 1.17821
[225]	training's rmse: 0.405895	valid_1's rmse: 1.17741
[250]	training's rmse: 0.389034	valid_1's rmse: 1.17708
[275]	training's rmse: 0.374821	valid_1's rmse: 1.17712
[300]	training's rmse: 0.360755	valid_1's rmse: 1.17716
Early stopping, best iteration is:
[268]	training's rmse: 0.378875	valid_1's rmse: 1.17698
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10027
[LightGBM] [Info] Number of data po

<h1 style="color:#20B3CD;font-size:20px;float:left">Step 4  |  Model Packaging and Result Sharing</h1> <div style="float:right;height:7px;background-color:#20B3CD;margin-top:30px;width:70%"></div>

In [26]:
import joblib
# Save the model: only the first fold's booster (models[0]) is written to disk.
joblib.dump(models[0], 'lgb.pkl')
# Load the model back from the file written above.
# NOTE(review): joblib.load is pickle-based, so only load files from a trusted source.
gbm_pickle = joblib.load('lgb.pkl')

In [27]:
# Average the best validation RMSE across all trained fold models.
# best_score is keyed by validation-set name, then metric name. 'valid_1' is the
# default name of the second entry in valid_sets (the held-out fold), and 'rmse'
# is the metric configured in params.
avg_score = 0.
for model in models:
    avg_score += model.best_score['valid_1']['rmse']
# Divide by the actual number of models rather than a hard-coded fold count.
final_avg = avg_score / len(models)

In [28]:
## Collect the model name and its average validation RMSE; this dict becomes the metrics table sent to S3
# NOTE(review): the 'avg_rmse accuracy' column holds an RMSE (an error, lower is better), not an accuracy.
data = {'model_name':  ['lightGBM'],
        'avg_rmse accuracy': [final_avg]
        }

In [29]:
# Build a one-row metrics table with an explicit column order, then display it.
metric_df = pd.DataFrame (data, columns = ['model_name','avg_rmse accuracy'])
metric_df

Unnamed: 0,model_name,avg_rmse accuracy
0,lightGBM,1.146013


In [30]:
from io import StringIO
from datetime import datetime
# NOTE: the import above rebinds the name `datetime` (the module imported at the top
# of the notebook) to the datetime class for every cell that runs after this one.
bucket='blogdemo-ml-train-score-output'
csv_key='training_accuracy.csv'
pickle_key = 'lgb.pkl'
# Key prefix with today's date, so each day's run lands under its own "folder".
prefix = 'training_model_' + datetime.now().strftime('%Y-%m-%d') + "/"


s3 = boto3.resource('s3')
# Upload the serialized model. The context manager closes the file handle once the
# upload has finished instead of leaving it open.
with open(pickle_key, 'rb') as model_file:
    upload_response = s3.Object(bucket, prefix + pickle_key).put(Body=model_file)
# Keep the S3 response as the cell's displayed result.
upload_response

{'ResponseMetadata': {'RequestId': 'F8E556EE6FD1F24C',
  'HostId': 'en3dJibRLL7GY42ojXCFfoueEns9RGNhB2pHTxaJ4qyljQ1dvUHTi0ji/YY1b2gaSsOD+wDaRu4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'en3dJibRLL7GY42ojXCFfoueEns9RGNhB2pHTxaJ4qyljQ1dvUHTi0ji/YY1b2gaSsOD+wDaRu4=',
   'x-amz-request-id': 'F8E556EE6FD1F24C',
   'date': 'Tue, 22 Dec 2020 22:54:14 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"63f7287479a725590e6e129e615d3e6f"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"63f7287479a725590e6e129e615d3e6f"',
 'ServerSideEncryption': 'AES256'}

In [31]:
# Serialize the metrics table to CSV in memory and upload the text to S3.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
csv_buffer = StringIO()
metric_df.to_csv(csv_buffer)
s3.Object(bucket, prefix + csv_key).put(Body = csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '91ED59F05B189084',
  'HostId': 'fN4djYQdwihC7Zl2n889cgwahznMl5D/cKuzWMVPDWnossMyz64oXmJO9owpUUTfHjd2Me4HBp0=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'fN4djYQdwihC7Zl2n889cgwahznMl5D/cKuzWMVPDWnossMyz64oXmJO9owpUUTfHjd2Me4HBp0=',
   'x-amz-request-id': '91ED59F05B189084',
   'date': 'Tue, 22 Dec 2020 22:54:14 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"078b30585f45f8df8963c5b369ef06e6"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"078b30585f45f8df8963c5b369ef06e6"',
 'ServerSideEncryption': 'AES256'}

<div style="height:60px;"><div style="height:7px;background-color:#20B3CD;width:100%;margin-top:20px;position:relative;"><img src="https://s3.amazonaws.com/iotanalytics-templates/Logo.png" style="height:50px;width:50px;margin-top:-20px;position:absolute;margin-left:42%;"></div></div>