In [None]:
# Author: @SpencerAndTheMatt
# Energy prediction 3 (ASHRAE)
# Imports/Formatting
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from tensorflow import keras


import warnings
warnings.filterwarnings('ignore')


In [None]:
'''
README
With getting processed data, this programme reads in as feather, and then converts
it to CSV. Google colab RAM cannot handle reading them both in as CSV.
'''

'\nREADME\nWith getting processed data, this programme reads in as feather, and then converts\nit to CSV. Google colab RAM cannot handle reading them both in as CSV.\n'

In [None]:
# Mount Google Drive (forcing a remount) and read the pre-processed training
# set, which is stored there as a Feather file.
from google.colab import drive

drive.mount(mountpoint = '/content/drive', force_remount = True)
train_data = pd.read_feather(path = '/content/drive/My Drive/training_data_processed_feather.feather')

Mounted at /content/drive


In [None]:
# Read the pre-processed test set from the mounted Drive (Feather format).
test_data = pd.read_feather(path = '/content/drive/My Drive/testing_data_processed_feather.feather')

In [None]:
# Write the training frame to a CSV file called 'train_data' (relative path,
# no index column). train_data itself stays an in-memory DataFrame.
train_data.to_csv(path_or_buf = 'train_data', index = False)

In [None]:
# Write the test frame to a CSV file called 'test_data' (relative path,
# no index column). test_data itself stays an in-memory DataFrame.
test_data.to_csv(path_or_buf = 'test_data', index = False)

In [None]:
# Sanity check: report the Python type of each loaded dataset.
for _label, _frame in (('Type of train_data: ', train_data), ('Type of test_data: ', test_data)):
    print(_label, type(_frame))

Type of train_data:  <class 'pandas.core.frame.DataFrame'>
Type of test_data:  <class 'pandas.core.frame.DataFrame'>


In [None]:
# Remove a column literally named 'index' from each frame if it is present;
# errors = 'ignore' turns this into a no-op when the column is absent.
train_data = train_data.drop(columns = 'index', errors = 'ignore')
test_data = test_data.drop(columns = 'index', errors = 'ignore')

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n = 5)

Unnamed: 0,building_id,square_feet,primary_use,meter,air_temperature,day_of_year,hour,isDayTime,day_of_week,meter_reading
0,46,9.109375,11,0,25.0,1.0,0.0,0,4.0,3.994141
1,74,12.867188,8,0,25.0,1.0,0.0,0,4.0,3.785156
2,93,10.414062,6,0,25.0,1.0,0.0,0,4.0,3.978516
3,105,10.835938,0,0,3.800781,1.0,0.0,0,4.0,3.191406
4,106,8.585938,0,0,3.800781,1.0,0.0,0,4.0,0.318115


In [None]:
# Preview the first five rows of the test frame.
test_data.head(n = 5)

Unnamed: 0,building_id,square_feet,primary_use,meter,air_temperature,day_of_year,hour,isDayTime,day_of_week
0,0,8.914062,0,0,17.796875,1.0,0.0,0,6.0
1,1,7.910156,0,0,17.796875,1.0,0.0,0,6.0
2,2,8.59375,0,0,17.796875,1.0,0.0,0,6.0
3,3,10.070312,0,0,17.796875,1.0,0.0,0,6.0
4,4,11.664062,0,0,17.796875,1.0,0.0,0,6.0


In [None]:
'''
--------------------------------------------------------------------------------
IMPLEMENTATION
'''

'\n--------------------------------------------------------------------------------\nIMPLEMENTATION\n'

In [None]:
# Separate the regression target ('meter_reading', as a NumPy array) from
# the remaining columns, which serve as the model's input features.
y_train = train_data['meter_reading'].to_numpy()
X_train = train_data.drop(columns = 'meter_reading')

In [None]:
 # Hold out 20% of the labelled rows as a test split (seeded so the split is repeatable)
train_x, test_x, train_y, test_y = train_test_split(
    X_train,
    y_train,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Loss function handed to model.compile for the neural network
from keras import backend as k
def NN_RMSLE(y_act, y_pred):
  """Return the root of the mean squared difference between y_pred and y_act.

  Built from Keras backend ops (square, mean, sqrt) so it can be used as a
  training loss.

  NOTE(review): this is a plain RMSE on whatever scale the targets are in.
  It only equals RMSLE if meter_reading was already log1p-transformed
  upstream - the later np.expm1 on the predictions suggests so; confirm.
  """
  squared_error = k.square(y_pred - y_act)
  return k.sqrt(k.mean(squared_error))

In [None]:
# Carve a validation split (20%, same fixed seed) out of the training portion;
# it is monitored during fitting.
train_xx, val_xx, train_yy, val_yy = train_test_split(
    train_x,
    train_y,
    test_size = 0.2,
    random_state = 42,
)

In [None]:
# Build, compile and train the network
from keras.models import Sequential
from keras import layers
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

# Architecture: one hidden layer of 512 ReLU units feeding a single linear
# output unit; the input width is the number of feature columns.
model = Sequential([
    Dense(512, activation = 'relu', input_shape = (train_xx.shape[1], )),
    Dense(1, activation = 'linear'),
])

# Adam optimiser, minimising the custom NN_RMSLE loss
model.compile(optimizer = 'adam', loss = NN_RMSLE)

# Train for 50 epochs in mini-batches of 2048 rows, reporting the loss on the
# validation split after every epoch. Left as the cell's last expression so the
# returned History object is still displayed.
model.fit(
    x = train_xx,
    y = train_yy,
    epochs = 50,
    batch_size = 2048,
    validation_data = (val_xx, val_yy),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f1a53487a90>

In [None]:
# Use the fitted model to predict on the test set.
# An explicit batch_size is passed because Keras' predict() defaults to 32 rows
# per step, which is needlessly slow for a frame this notebook expects to hold
# ~41 million rows. 2048 matches the batch size used for training; the batch
# size only affects how the rows are chunked, not which rows are predicted.
prediction = model.predict(test_data, batch_size = 2048)

In [None]:
# Display the raw array returned by model.predict as the cell output
prediction

In [None]:
# Collapse the predictions into a 1-D copy and show how many values it holds.
prediction_flat = prediction.flatten()
prediction_flat.size

In [None]:
# Sequential row ids 0 .. n-1 (int32), one per prediction, for the Kaggle submission
row_id = np.arange(len(prediction_flat), dtype = np.int32)

In [None]:
# Show the number of row ids; the submission should have ~ 41 million entries
row_id.shape[0]

In [None]:
#df = pd.DataFrame({'row_id':row_id, 'meter_reading':prediction_flat})

In [None]:
#df.to_csv('/content/drive/My Drive/RNN_prediction.csv', index = False)

In [None]:
#df.head()

In [None]:
# Construct the submission dataframe.
# np.expm1 undoes the log1p scale the predictions are assumed to be on
# (NOTE(review): confirm meter_reading was log1p-transformed upstream).
# The network's output unit is linear, so a prediction can be negative, and
# expm1 of a negative value is a negative meter reading. Those are clipped to 0
# because a negative energy reading is not a meaningful prediction.
df_2 = pd.DataFrame({
    'row_id': row_id,
    'meter_reading': np.clip(np.expm1(prediction_flat), 0, None),
})

In [None]:
# Write the submission to Google Drive as CSV, without the index column.
# NOTE(review): the file name says "RNN", but the model built in this notebook
# is a dense feed-forward network.
df_2.to_csv(path_or_buf = '/content/drive/My Drive/RNN_prediction_expm1.csv', index = False)

In [None]:
# Accuracy metrics
from sklearn import metrics as met

# R2 on the first 2,000,000 rows of the fitted training split only.
# NOTE(review): the name says "epochs20" although the model in this notebook is
# fitted for 50 epochs; the name is kept because the reporting cell reads it.
r2_epochs20 = met.r2_score(train_yy[0:2000000], model.predict(train_xx[0:2000000]))

# error_train / error_test are printed by the reporting cell but were never
# defined anywhere in this notebook, so a fresh "Restart & Run All" failed with
# NameError. They are defined here as the root-mean-squared error between
# targets and predictions - the same quantity NN_RMSLE optimises - on the
# fitted training split and on the held-out test split (test_x / test_y, which
# the model never saw during fitting or validation).
error_train = np.sqrt(mean_squared_error(train_yy, model.predict(train_xx, batch_size = 2048).ravel()))
error_test = np.sqrt(mean_squared_error(test_y, model.predict(test_x, batch_size = 2048).ravel()))

In [None]:
# Report the goodness-of-fit and error figures computed for the model
print(f'R2 score for model is {r2_epochs20}')
print(f'Training RMSLE is {error_train}')
print(f'Test RMSLE is {error_test}')

R2 score for model is 0.6196341341432954
Training RMSLE is 1.0738378763198853
Test RMSLE is 1.0738378763198853
