In [1]:
# Energy prediction 3 (ASHRAE)
# Imports/Formatting
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor,Ridge,ElasticNet
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import lightgbm as lgb
import gc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn. linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout, Activation
from tensorflow.keras.optimizers import *

from prettytable import PrettyTable

import warnings
warnings.filterwarnings('ignore')


In [2]:
'''
README
With getting processed data, this programme reads in as feather, and then converts
it to CSV. Google colab RAM cannot handle reading them both in as CSV.
'''

'\nREADME\nWith getting processed data, this programme reads in as feather, and then converts\nit to CSV. Google colab RAM cannot handle reading them both in as CSV.\n'

In [3]:
# Get PROCESSED train data from drive 
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
train_data = pd.read_feather('/content/drive/My Drive/training_data_processed_feather.feather')

Mounted at /content/drive


In [4]:
# Get PROCESSED test data from drive
test_data = pd.read_feather('/content/drive/My Drive/testing_data_processed_feather.feather')

In [5]:
# Convert training data back to CSV file
train_data.to_csv('train_data', index = False)

In [6]:
# Convert testing data back to CSV file
test_data.to_csv('test_data', index = False)

In [7]:
# Check type
print('Type of train_data: ', type(train_data))
print('Type of test_data: ', type(test_data))

Type of train_data:  <class 'pandas.core.frame.DataFrame'>
Type of test_data:  <class 'pandas.core.frame.DataFrame'>


In [8]:
# Drop index columns
train_data = train_data.drop('index', axis = 1, errors = 'ignore')
test_data = test_data.drop('index', axis = 1, errors = 'ignore')

In [9]:
# Inspect train_data
train_data.head()

Unnamed: 0,building_id,square_feet,primary_use,meter,air_temperature,day_of_year,hour,isDayTime,day_of_week,meter_reading
0,46,9.109375,11,0,25.0,1.0,0.0,0,4.0,3.994141
1,74,12.867188,8,0,25.0,1.0,0.0,0,4.0,3.785156
2,93,10.414062,6,0,25.0,1.0,0.0,0,4.0,3.978516
3,105,10.835938,0,0,3.800781,1.0,0.0,0,4.0,3.191406
4,106,8.585938,0,0,3.800781,1.0,0.0,0,4.0,0.318115


In [10]:
# Inspect test_data
test_data.head()

Unnamed: 0,building_id,square_feet,primary_use,meter,air_temperature,day_of_year,hour,isDayTime,day_of_week
0,0,8.914062,0,0,17.796875,1.0,0.0,0,6.0
1,1,7.910156,0,0,17.796875,1.0,0.0,0,6.0
2,2,8.59375,0,0,17.796875,1.0,0.0,0,6.0
3,3,10.070312,0,0,17.796875,1.0,0.0,0,6.0
4,4,11.664062,0,0,17.796875,1.0,0.0,0,6.0


In [11]:
'''
--------------------------------------------------------------------------------
IMPLEMENTATION
'''

'\n--------------------------------------------------------------------------------\nIMPLEMENTATION\n'

In [12]:
# Get X_train and y_train
X_train = train_data.drop('meter_reading', axis = 1)
y_train = train_data['meter_reading'].values

In [13]:
# Baseline model - solely useful for comparison against other (hopefully better) models
def baselineModel(y_act, y_pred):
  '''
  Baseline score is computed from median value
  '''

  # Define root mean square error (RMSE)
  rmsle_score = np.sqrt(np.mean((y_act - y_pred) * (y_act - y_pred)))
  return 'RMSE score of baseline model is {}'.format(rmsle_score)

print(baselineModel(y_train, np.median(y_train)))

RMSE score of baseline model is 1.7724609375


In [14]:
 # Split data for test and train
train_x, test_x, train_y, test_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [15]:
# Define RMSLE
def RMSLE(y_act, y_pred):
  return np.sqrt(mean_squared_error(y_act, y_pred))

In [16]:
# Define RMSLE specifically for neural network
from keras import backend as k
def NN_RMSLE(y_act, y_pred):
  return k.sqrt(k.mean(k.square(y_pred - y_act)))

In [17]:
# Split into train and validation sets
train_xx, val_xx, train_yy, val_yy = train_test_split(train_x, train_y, test_size = 0.2, random_state = 42)

In [49]:
# Define model
from keras.callbacks import EarlyStopping

model = Sequential()

early_stop = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 3)

# Add layers, etc
model.add(layers.Dense(512, activation = 'relu', input_shape = (train_xx.shape[1], )))
model.add(Dense(1, activation = 'linear'))
model.compile(optimizer = 'adam', loss = NN_RMSLE)

# Fit model
model.fit(train_xx, train_yy, epochs = 100, batch_size = 2048, validation_data = (val_xx, val_yy), callbacks = early_stop)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


<keras.callbacks.History at 0x7fdb683c6ad0>

In [50]:
print('Neural Network Training RMSLE: {}'.format(model.evaluate(train_x, train_y, verbose = 0)))
print('Neural Network Testing RMSLE: {}'.format(model.evaluate(test_x, test_y, verbose = 0)))

Neural Network Training RMSLE: 1.0909289121627808
Neural Network Testing RMSLE: 1.0917843580245972


In [51]:
# Use model to make a prediction 
prediction = model.predict(test_data)

In [52]:
prediction

array([[0.9350722 ],
       [0.21203923],
       [0.9306054 ],
       ...,
       [1.7016139 ],
       [3.4919171 ],
       [4.3557076 ]], dtype=float32)

In [23]:
# Inspect prediction
prediction_flat = prediction.flatten()
len(prediction_flat)

41697600

In [53]:
row_id = (np.arange(0, len(prediction_flat))).astype(np.int32)

In [54]:
len(row_id)

41697600

In [25]:
#df = pd.DataFrame({'row_id':row_id, 'meter_reading':prediction_flat})

In [29]:
#df.to_csv('/content/drive/My Drive/RNN_prediction.csv', index = False)

In [30]:
#df.head()

Unnamed: 0,row_id,meter_reading
0,0.0,1.611253
1,1.0,1.030314
2,2.0,1.03964
3,3.0,1.439468
4,4.0,1.781921


In [55]:
df_2 = pd.DataFrame({'row_id': row_id, 'meter_reading':np.expm1(prediction_flat)})

In [56]:
df_2.to_csv('/content/drive/My Drive/RNN_prediction_expm1.csv', index = False)