<a href="https://colab.research.google.com/github/Law101/predicting_energy_efficiency/blob/master/Machine_Learning_Regression_Predicting_Energy_Efficiency_of_Buildings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning: Regression - Predicting Energy Efficiency of Buildings
Name: Lawrence Okegbemi

### Load Necessary Libraries

In [1]:
import numpy as np              # For numerical python computation
import pandas as pd             # For handling tabular dataset
import matplotlib.pyplot as plt # Plotting and Visualization
import seaborn as sns           # Plotting and Visualization

from sklearn.preprocessing import  MinMaxScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

%matplotlib inline

  import pandas.util.testing as tm


### Import the Dataset

In [2]:
# Download the CSV straight from the UCI repository into a DataFrame
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
dataset = pd.read_csv(filepath_or_buffer=url)

### Exploratory Data Analysis

In [3]:
# Dimensions of the raw dataset as a (rows, columns) tuple
dataset.shape

(19735, 29)

In [4]:
# Per-column overview of the dataset: column name, non-null count and dtype
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [5]:
# Summary statistics for every column; include="all" also reports the
# non-numeric `date` column (count / unique / top / freq)
dataset.describe(include="all")

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
unique,19735,,,,,,,,,,,,,,,,,,,,,,,,,,,,
top,2016-01-17 19:40:00,,,,,,,,,,,,,,,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,
mean,,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,19.592106,50.949283,7.910939,54.609083,20.267106,35.3882,22.029107,42.936165,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,1.844623,9.022034,6.090347,31.149806,2.109993,5.114208,1.956162,5.224361,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,15.33,29.815,-6.065,1.0,15.39,23.2,16.306667,29.6,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,18.2775,45.4,3.626667,30.025,18.7,31.5,20.79,39.066667,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,19.39,49.09,7.3,55.29,20.033333,34.863333,22.1,42.375,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,20.619643,53.663333,11.256,83.226667,21.6,39.0,23.39,46.536,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769


In [6]:
# Drop the `date` and `lights` columns before modelling.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it no longer raises a KeyError
# because the columns have already been removed.
dataset = dataset.drop(columns=['date', 'lights'], errors='ignore')

# Feature scaling with MinMaxScaler: every remaining column is rescaled
# using its own minimum and maximum.
# NOTE(review): the scaler is fitted on the full dataset (the `Appliances`
# target included) before any train/test split, so test-set statistics leak
# into training and downstream error metrics are in scaled units - consider
# fitting the scaler on the training split only.
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(dataset)
train_feat = pd.DataFrame(train_scaled, columns=dataset.columns)


In [7]:
# Single predictor (kept 2-D as a one-column frame) and its target series
X1 = train_feat.loc[:, ['T2']]
y1 = train_feat.loc[:, 'T6']

In [8]:
# Fit a single-feature linear regression (fit() returns the estimator itself)
lr = LinearRegression().fit(X1, y1)

# Report R^2 on the same data the model was fitted on
print(f'Model R2_Score: {round(lr.score(X1,y1),2)}')

Model R2_Score: 0.64


In [9]:
# Target is the scaled `Appliances` column; every other column is a predictor
y = train_feat.loc[:, 'Appliances']
X = train_feat.drop(columns='Appliances')

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

print(f'Shape of Training Set is {X_train.shape}')
print(f'Shape of Testing Set is {X_test.shape}')

Shape of Training Set is (13814, 26)
Shape of Testing Set is (5921, 26)


In [11]:
# Refit the existing `lr` estimator on the training split, then predict
# the held-out test split in one chained call
y_pred = lr.fit(X_train, y_train).predict(X_test)

In [12]:
# Evaluate the linear model's predictions against the test-set targets
residuals = y_test - y_pred
mse = mean_squared_error(y_test, y_pred)

MAE = mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(mse)
R2 = r2_score(y_test, y_pred)
RSS = np.sum(np.square(residuals))

print(f'Testing ---- Mean Absolute Error MAE {round(MAE,2)}')
print(f'Testing ---- Root Mean Squarred Error RMSE {round(RMSE,3)}')
print(f'Testing ---- R-Squarred {round(R2,2)}')
print(f'Testing ---- Residual Sum of Squares {round(RSS,2)}')

Testing ---- Mean Absolute Error MAE 0.05
Testing ---- Root Mean Squarred Error RMSE 0.088
Testing ---- R-Squarred 0.15
Testing ---- Residual Sum of Squares 45.35


In [13]:
# Pair each feature with its fitted weight and sort ascending, so the
# smallest and largest coefficients sit at the two ends of the table
coef_order = pd.DataFrame(
    {'features': X.columns, 'coefficients': lr.coef_}
).sort_values(by='coefficients')

In [14]:
# Display the feature/coefficient table (sorted by ascending coefficient)
coef_order

Unnamed: 0,features,coefficients
3,RH_2,-0.456698
18,T_out,-0.32186
2,T2,-0.236178
16,T9,-0.189941
15,RH_8,-0.157595
20,RH_out,-0.077671
13,RH_7,-0.044614
17,RH_9,-0.0398
8,T5,-0.015657
0,T1,-0.003281


In [15]:
# Ridge regression (L2 penalty, alpha=0.4) fitted on the training split
rg = Ridge(alpha=0.4).fit(X_train, y_train)
y_pred_rg = rg.predict(X_test)

# Root mean squared error of the Ridge predictions on the test split
ridge_mse = mean_squared_error(y_test, y_pred_rg)
Ridge_RMSE = np.sqrt(ridge_mse)
print(f'Ridge_RMSE {round(Ridge_RMSE,3)}')

Ridge_RMSE 0.088


In [16]:
# LASSO regression (L1 penalty, alpha=0.001) fitted on the training split
ls = Lasso(alpha=0.001).fit(X_train, y_train)
y_pred_ls = ls.predict(X_test)

In [19]:
# Number of features whose LASSO coefficient is not exactly zero
np.count_nonzero(ls.coef_)

4

In [20]:
# Root mean squared error of the LASSO predictions on the test split
lasso_mse = mean_squared_error(y_test, y_pred_ls)
LASSO_RMSE = np.sqrt(lasso_mse)
print(f'LASSO RMSE {round(LASSO_RMSE,3)}')

LASSO RMSE 0.094
