# Project:  Appliances Energy Prediction Dataset

## Dataset Description
#### The dataset was recorded at 10-minute intervals over about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions roughly every 3.3 minutes; the wireless data were then averaged over 10-minute periods. The energy data were logged every 10 minutes with m-bus energy meters. Weather data from the nearest airport weather station (Chievres Airport, Belgium) were downloaded from a public data set from Reliable Prognosis (rp5.ru) and merged with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and for filtering out non-predictive attributes (parameters). The dataset contains 19735 rows and 29 columns, which are used here to answer specific questions and give insights into the data gathered.

In [1]:
#import libraries needed for exploration
#load the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#load the energy readings from the CSV file into a DataFrame
energy_df =pd.read_csv("energydata.csv")

In [2]:
#preview the first rows of the dataset
energy_df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,1/11/2016 17:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,1/11/2016 17:10,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.48,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,1/11/2016 17:20,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.37,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,1/11/2016 17:30,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.41039,45.41039
4,1/11/2016 17:40,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.13,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [3]:
#check dataset dimensions as (rows, columns)
energy_df.shape

(19735, 29)

In [4]:
#summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
energy_df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.41258,755.522602,79.750418,4.039752,38.330834,3.760995,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.318464,7.399441,14.901088,2.451221,11.794719,4.195248,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.67,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.92,756.1,83.666667,3.666667,40.0,3.43,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.4,760.933333,91.666667,5.5,40.0,6.57,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [5]:
#list each column with its non-null count and dtype
energy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [6]:
#linear regression model
from sklearn.linear_model import LinearRegression 
#import M.A.E
from sklearn.metrics import mean_absolute_error 
#normalising dataset into a common scale using Min and Max.
from sklearn.preprocessing import MinMaxScaler 
#splitting dataset into training and testing df
from sklearn.model_selection import train_test_split       

#Mean Squared Error (MSE)
from sklearn.metrics import mean_squared_error  
#R-squared (coefficient of determination)
from sklearn.metrics import r2_score                      


from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [7]:
#Question: what is the R^2 value (to two d.p.) of a linear model fitted on the relationship between
#the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6)?
x = energy_df["T2"]
y = energy_df["T6"]

In [8]:
#scikit-learn expects a 2-D feature matrix, so turn x into a single column
arr = np.asarray(x)
x = arr.reshape(-1, 1)
#hold out 30% of the rows for testing (seeded for reproducibility)
split = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split

#fit the linear model on the training rows and predict the held-out rows
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

#R_squared on the test set, shown to two decimal places
r2 = r2_score(y_test, y_pred)
round(r2, 2)

0.64

In [10]:
#Question: what is the Mean Absolute Error of a multiple linear regression predicting "Appliances"
#after dropping ["date", "lights"], normalising with MinMaxScaler, and using a 70-30 train-test
#split with a random state of 42 (for reproducibility)? The model is evaluated on the test set.
#discard the two columns that are excluded from modelling
df = energy_df.drop(columns=['lights', 'date'])

#rescale every remaining column (target included) to a common scale
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
norm_df = pd.DataFrame(scaled_values, columns=df.columns)

#separate the scaled target from the predictors
target = norm_df['Appliances']
features_df = norm_df.drop(columns=['Appliances'])

#70-30 train-test split with a random state of 42
split = train_test_split(features_df, target, train_size=0.7, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split

#fit on the training set, predict the test set
model = LinearRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

#mean absolute error on the (scaled) test target, shown to two decimal places
MAE = mean_absolute_error(y_test, y_pred)
round(MAE, 2)

0.05

In [11]:
#Residual Sum of Squares: sum of squared differences between actual and predicted test values
residuals = y_test - y_pred
Residual_sum_squares = np.square(residuals).sum()
round(Residual_sum_squares, 2)

45.35

In [12]:
#check the Root Mean Squared Error (square root of the mean squared error) on the test set
MSE = mean_squared_error(y_test, y_pred)
Root_mean_squared_error = np.sqrt(MSE)
round(Root_mean_squared_error, 3)

0.088

In [13]:
#check the Coefficient of Determination (R^2) on the test set
Coefficient_det = r2_score(y_test, y_pred)
round(Coefficient_det, 2)

0.15

In [15]:
#check the feature weights from your linear model above. Which features have the lowest and highest weights by defining a function to return the weight of every feature in the dataset
def weights(model, feat, col_name, decimals=None):
    """Return the coefficients of a fitted model as a table sorted by weight.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a one-dimensional ``coef_`` attribute.
    feat : pandas.DataFrame
        Frame whose ``columns`` label the coefficients, in the same order
        as ``model.coef_``.
    col_name : str
        Name given to the coefficient column of the result.
    decimals : int or None, optional
        When given, round the coefficients to this many decimal places.
        The default (None) returns them unrounded.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'Features'`` and ``col_name``, sorted by ascending
        weight.
    """
    # Use a distinct local name so the function name is not shadowed.
    coefs = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(coefs).reset_index()
    weights_df.columns = ['Features', col_name]
    # The earlier version called .round(3) and discarded the result, so callers
    # have always received unrounded weights. Rounding is therefore opt-in:
    # rounding by default could turn tiny non-zero weights into zeros.
    if decimals is not None:
        weights_df[col_name] = weights_df[col_name].round(decimals)
    return weights_df


#tabulate the weights of the fitted linear model, one row per training feature
linear_weights = weights(model, x_train, 'Linear_model_Weight')
linear_weights

Unnamed: 0,Features,Linear_model_Weight
0,RH_2,-0.456662
1,T_out,-0.321969
2,T2,-0.236198
3,T9,-0.189977
4,RH_8,-0.157585
5,RH_out,-0.077748
6,RH_7,-0.04462
7,RH_9,-0.039808
8,T5,-0.015645
9,T1,-0.003242


The feature with the lowest weight is RH_2, and the feature with the highest weight is RH_1.

In [20]:
#Train a ridge regression model with an alpha value of 0.4
ridge_reg_model = Ridge(alpha=0.4)
#fit the model on the training set
ridge_reg_model.fit(x_train, y_train)
#predict the test set
ridge_pred = ridge_reg_model.predict(x_test)

#check if there is any change to the root mean squared error (RMSE) when evaluated on the test set
ridge_RMSE = np.sqrt(mean_squared_error(y_test, ridge_pred))
#display the ridge RMSE itself; the cell previously displayed Root_mean_squared_error,
#which is the RMSE of the plain linear model, so the ridge result was never shown
round(ridge_RMSE, 3)

0.088

In [19]:
#Lasso regression with an alpha value of 0.001, fitted on the training set
lasso_0001 = Lasso(alpha=0.001).fit(x_train, y_train)
#predictions for the test set
lasso_0001_pred = lasso_0001.predict(x_test)
#model score on each split
train_score = lasso_0001.score(x_train, y_train)
test_score = lasso_0001.score(x_test, y_test)

#feature weights of the lasso model
lasso_weights = weights(lasso_0001, x_train, 'Lasso_weight_feature')

#count the features whose weight is not zero
is_non_zero = lasso_weights["Lasso_weight_feature"] != 0
non_zero_features = is_non_zero.sum()
non_zero_features

4

In [22]:
#check the new RMSE of the lasso regression predictions on the test set
MSE = mean_squared_error(y_test, lasso_0001_pred)
New_RMSE = np.sqrt(MSE)
round(New_RMSE, 3)

0.094