In [1]:
# Dataset source
# https://archive.ics.uci.edu/ml/datasets/Appliances+energy+prediction

In [2]:
# Problem statement: Predict the appliances energy use based on various features

In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "Unrecognised scikit-learn version: " + sklearn.__version__
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [4]:
# Load the energy dataset from the CSV file in the working directory

import pandas as pd
# Raise the column display limit so wide frames are shown without truncation
pd.set_option("display.max_columns", 1000)
aep_df = pd.read_csv('energydata_complete.csv')  # comma is the default separator
print(aep_df.shape)
aep_df.head()

(19735, 29)


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [5]:
# Total number of missing (NaN) entries across every column of the dataframe

aep_df.isna().sum().sum()

0

In [6]:
# Seed NumPy's global random number generator so that this notebook's output
# is identical at every run. The scikit-learn calls in the cells below
# (train_test_split, DecisionTreeRegressor, RandomizedSearchCV) additionally
# pass random_state=2 explicitly.

np.random.seed(2)

In [7]:
# Separate the dataframe into a feature matrix X and a label vector y.
# 'date' is dropped from the features; 'Appliances' is the prediction target.
# NOTE(review): rv1 and rv2 hold identical values in the sample rows shown by
# head(); they are kept as features here — confirm whether both are wanted.

X = aep_df.drop(columns=['date', 'Appliances']).to_numpy()
y = aep_df['Appliances'].to_numpy()
print("X shape: ", X.shape, "y shape: ", y.shape)
print("Sample X values: ", X[:5], "\n", "Sample y values: ", y[:5])

X shape:  (19735, 27) y shape:  (19735,)
Sample X values:  [[ 30.          19.89        47.59666667  19.2         44.79
   19.79        44.73        19.          45.56666667  17.16666667
   55.2          7.02666667  84.25666667  17.2         41.62666667
   18.2         48.9         17.03333333  45.53         6.6
  733.5         92.           7.          63.           5.3
   13.27543316  13.27543316]
 [ 30.          19.89        46.69333333  19.2         44.7225
   19.79        44.79        19.          45.9925      17.16666667
   55.2          6.83333333  84.06333333  17.2         41.56
   18.2         48.86333333  17.06666667  45.56         6.48333333
  733.6         92.           6.66666667  59.16666667   5.2
   18.60619498  18.60619498]
 [ 30.          19.89        46.3         19.2         44.62666667
   19.79        44.93333333  18.92666667  45.89        17.16666667
   55.09         6.56        83.15666667  17.2         41.43333333
   18.2         48.73        17.          45.5   

In [8]:
# Hold out 5% of the rows as a test set; the remainder is used for training

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=2
)

# Report the shapes of the four resulting arrays, one per line
print(
    " X_train shape: ", X_train.shape, "\n",
    "y_train shape: ", y_train.shape, "\n",
    "X_test shape: ", X_test.shape, "\n",
    "y_test shape: ", y_test.shape, "\n",
)

 X_train shape:  (18748, 27) 
 y_train shape:  (18748,) 
 X_test shape:  (987, 27) 
 y_test shape:  (987,) 



In [9]:
# Model 1
# Baseline: scikit-learn DecisionTreeRegressor limited to a depth of 10,
# fitted on the training split (random_state fixed for repeatable fits)

from sklearn.tree import DecisionTreeRegressor
dt_reg = DecisionTreeRegressor(
    random_state=2,
    max_depth=10,
)
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=10, random_state=2)

In [10]:
# Coefficient of determination (R^2) of Model 1 on the train and test sets

dt_train_r2 = dt_reg.score(X_train, y_train)
dt_test_r2 = dt_reg.score(X_test, y_test)
print("Train set R^2 score: ", dt_train_r2)
print("Test set R^2 score: ", dt_test_r2)

Train set R^2 score:  0.5675554705629198
Test set R^2 score:  0.1617048030466639


In [11]:
# Mean squared error of Model 1 on the train and test sets

from sklearn.metrics import mean_squared_error
dt_train_pred = dt_reg.predict(X_train)
dt_test_pred = dt_reg.predict(X_test)
print("Train set mse: ", mean_squared_error(y_train, dt_train_pred))
print("Test set mse: ", mean_squared_error(y_test, dt_test_pred))

Train set mse:  4579.141710223829
Test set mse:  7555.657015003766


In [12]:
# Mean absolute error of Model 1 on the train and test sets

from sklearn.metrics import mean_absolute_error
dt_train_pred = dt_reg.predict(X_train)
dt_test_pred = dt_reg.predict(X_test)
print("Train set mae: ", mean_absolute_error(y_train, dt_train_pred))
print("Test set mae: ", mean_absolute_error(y_test, dt_test_pred))

Train set mae:  35.22153983411054
Test set mae:  45.18473888723902


In [13]:
# The Decision Tree Regressor with max depth 10 achieves a low R^2 score on the test set (about 0.16, versus about 0.57 on the train set), so more complex models will be developed

In [14]:
# Model 2
# Tune a scikit-learn DecisionTreeRegressor with RandomizedSearchCV

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
# reciprocal and uniform are not used in this cell; the import is kept as-is
from scipy.stats import reciprocal, uniform

# Candidate values the search samples from for each hyper-parameter
param_distributions = {
    'max_depth': [10, 20, 40, 50, 60, 70],
    'min_samples_split': [30, 40, 50, 70, 90],
    'max_leaf_nodes': list(range(2, 10000)),
}

# 1000 sampled candidates, each evaluated with 3-fold cross-validation,
# run on 10 parallel workers
dt_reg_rnd_search_cv = RandomizedSearchCV(
    estimator=DecisionTreeRegressor(random_state=2),
    param_distributions=param_distributions,
    n_iter=1000,
    cv=3,
    n_jobs=10,
    random_state=2,
    verbose=5,
)
dt_reg_rnd_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


RandomizedSearchCV(cv=3, estimator=DecisionTreeRegressor(random_state=2),
                   n_iter=1000, n_jobs=10,
                   param_distributions={'max_depth': [10, 20, 40, 50, 60, 70],
                                        'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8,
                                                           9, 10, 11, 12, 13,
                                                           14, 15, 16, 17, 18,
                                                           19, 20, 21, 22, 23,
                                                           24, 25, 26, 27, 28,
                                                           29, 30, 31, ...],
                                        'min_samples_split': [30, 40, 50, 70,
                                                              90]},
                   random_state=2, verbose=5)

In [15]:
# Display the estimator selected by the randomized search (its repr lists the
# chosen hyper-parameters)
dt_reg_rnd_search_cv.best_estimator_

DecisionTreeRegressor(max_depth=20, max_leaf_nodes=788, min_samples_split=50,
                      random_state=2)

In [16]:
# Coefficient of determination (R^2) of the tuned tree on the train and test sets

best_dt_reg = dt_reg_rnd_search_cv.best_estimator_
best_dt_train_r2 = best_dt_reg.score(X_train, y_train)
best_dt_test_r2 = best_dt_reg.score(X_test, y_test)
print("Train set R^2 score: ", best_dt_train_r2)
print("Test set R^2 score: ", best_dt_test_r2)

Train set R^2 score:  0.6191046762686883
Test set R^2 score:  0.20138790526263317


In [17]:
# Mean squared error of the tuned tree on the train and test sets

from sklearn.metrics import mean_squared_error
best_dt_reg = dt_reg_rnd_search_cv.best_estimator_
best_dt_train_pred = best_dt_reg.predict(X_train)
best_dt_test_pred = best_dt_reg.predict(X_test)
print("Train set mse: ", mean_squared_error(y_train, best_dt_train_pred))
print("Test set mse: ", mean_squared_error(y_test, best_dt_test_pred))

Train set mse:  4033.288769770486
Test set mse:  7197.988367103962


In [18]:
# Mean absolute error of the tuned tree on the train and test sets

from sklearn.metrics import mean_absolute_error
best_dt_reg = dt_reg_rnd_search_cv.best_estimator_
best_dt_train_pred = best_dt_reg.predict(X_train)
best_dt_test_pred = best_dt_reg.predict(X_test)
print("Train set mae: ", mean_absolute_error(y_train, best_dt_train_pred))
print("Test set mae: ", mean_absolute_error(y_test, best_dt_test_pred))

Train set mae:  30.558764984929994
Test set mae:  42.35952079081227
