In [1]:
#importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

%matplotlib inline

In [2]:
#loading the 'energy data' csv file into a Pandas DataFrame
#and viewing 20 random samples

energydata_df = pd.read_csv('energydata_complete.csv')
energydata_df.sample(20)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
1255,2016-01-20 10:10:00,90,10,18.2,38.96,16.89,39.29,18.5,38.06,16.89,...,15.89,36.326667,-1.95,760.1,88.833333,2.166667,58.166667,-3.566667,9.12815,9.12815
8840,2016-03-13 02:20:00,30,0,20.926667,35.7,17.666667,39.23,20.2,36.466667,19.7,...,18.926667,39.7,0.566667,768.266667,88.333333,1.0,48.0,-1.2,20.794958,20.794958
18920,2016-05-22 02:20:00,50,0,24.644545,50.175091,24.158559,48.209159,26.934909,44.394545,23.952252,...,23.193273,48.854545,17.733333,750.466667,77.0,2.666667,40.0,13.6,7.2457,7.2457
17544,2016-05-12 13:00:00,60,0,25.29,47.4,26.1,43.53375,26.142857,43.712857,25.5,...,24.1,46.738571,19.8,744.2,61.0,2.0,24.0,12.0,39.235493,39.235493
14538,2016-04-21 16:00:00,110,0,23.066667,34.93,23.29,32.09,23.0,35.363333,22.89,...,20.29,36.59,18.8,760.3,36.0,4.0,40.0,3.5,44.581161,44.581161
11175,2016-03-29 07:30:00,60,0,22.0,38.5,18.945,42.79,22.79,37.56,20.29,...,20.5,42.76,5.05,749.1,85.5,8.0,40.0,2.8,29.155496,29.155496
11919,2016-04-03 11:30:00,60,0,21.4175,43.9,23.856667,38.7,21.89,40.7,21.23,...,20.0,42.59,16.3,752.4,71.5,4.0,34.5,11.15,13.766566,13.766566
12953,2016-04-10 15:50:00,60,0,23.1,38.2,22.89,36.29,22.73,38.163333,22.1,...,20.79,39.56,14.133333,751.783333,53.333333,4.166667,40.0,4.766667,5.393236,5.393236
5997,2016-02-22 08:30:00,140,0,21.166667,44.626667,19.7,45.0,21.79,43.09,20.6,...,18.68,47.4,9.0,751.8,91.5,7.5,64.0,7.7,9.727118,9.727118
5026,2016-02-15 14:40:00,60,0,20.7,38.466667,19.39,37.9,21.76,37.76,19.1,...,18.166667,41.933333,5.566667,760.866667,61.333333,7.0,40.0,-1.4,25.065831,25.065831


In [3]:
#checking the DataFrame for null fields and also the Dtype of each feature

energydata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

In [4]:
#checking for common stats of each feature

energydata_df.describe()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


In [5]:
#creating a Scaler to normalize our Data 
#(to avoid certain features having more weights than the others in our model)

scaler = MinMaxScaler()

In [6]:
relevant_energydata_df = energydata_df.drop(columns=['date', 'lights'])

In [7]:
#normalizing the data in the DataFrame (which now only includes features we need)

normalised_energydata_df = pd.DataFrame(scaler.fit_transform(relevant_energydata_df), columns=relevant_energydata_df.columns)
normalised_energydata_df

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.226500,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.084112,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,...,0.864724,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981
19731,0.074766,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,...,0.864724,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726
19732,0.242991,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,...,0.864724,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979
19733,0.383178,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,...,0.864724,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371


In [8]:
#creating a DataFrame of the features that would be fed to our model
#(they are used to predict our target)

features_df = normalised_energydata_df.drop(columns=['Appliances'])
features_df

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,0.381691,...,0.223032,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449
1,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,0.381691,...,0.226500,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083
2,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,0.380037,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,0.380037,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,0.380037,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,0.339590,...,0.864724,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981
19731,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,0.338487,...,0.864724,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726
19732,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,0.337585,...,0.864724,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979
19733,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,0.336583,...,0.864724,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371


In [9]:
#creating the target (the values we would like our regressor to predict)

target = normalised_energydata_df['Appliances']
target

0        0.046729
1        0.046729
2        0.037383
3        0.037383
4        0.046729
           ...   
19730    0.084112
19731    0.074766
19732    0.242991
19733    0.383178
19734    0.392523
Name: Appliances, Length: 19735, dtype: float64

In [10]:
#splitting our data into a portion for training the model (70%)
#and a portion for testing our model's performance (30%)

X_train, X_test, y_train, y_test = train_test_split(features_df, target, test_size=0.3, random_state=42)

In [11]:
len(features_df)

19735

In [12]:
len(X_train), len(X_test)

(13814, 5921)

In [13]:
len(target)

19735

In [14]:
len(y_train), len(y_test)

(13814, 5921)

In [15]:
#creating a Linear Regressor
#and fitting it to our training data

lin_reg = LinearRegression().fit(X_train, y_train)

Testing our model's performance:

In [16]:
#checking the R^2 score on the training data

lin_reg.score(X_train, y_train)

0.14471942308518726

In [17]:
#checking the R^2 score on the testing data

lin_reg.score(X_test, y_test)

0.14890246319303535

In [18]:
r2_score(y_test, lin_reg.predict(X_test))

0.14890246319303535

Checking the $R^2$ score on both the training and testing data is important to detect over-fitting or under-fitting.
If we get a high $R^2$ score on the training data but a low score on the testing data, our model is most likely over-fitting.
If we get low $R^2$ scores on both the training and testing data, our model is most likely under-fitting.

In [22]:
#creating a Lasso Regressor with alpha = 0.001
#and fitting it to our training data

lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

In [23]:
lasso_reg.score(X_train, y_train)

0.02499234354320612

In [24]:
lasso_reg.score(X_test, y_test)

0.026800880567125818

In [33]:
#creating a Ridge Regressor with alpha = 0.4
#and fitting it to our training data

ridge_reg = Ridge(alpha=0.4).fit(X_train, y_train)

In [34]:
ridge_reg.score(X_train, y_train)

0.14450591897531706

In [35]:
ridge_reg.score(X_test, y_test)

0.14852493545092582

### Question 12:

In [41]:
#creating a Linear Regressor and fitting it to "X='T2' and y='T6'"

lin_reg_2 = LinearRegression().fit(np.array(normalised_energydata_df['T2']).reshape(-1,1), np.array(normalised_energydata_df['T6']).reshape(-1,1))

In [42]:
lin_reg_2.score(np.array(normalised_energydata_df['T2']).reshape(-1,1), np.array(normalised_energydata_df['T6']).reshape(-1,1))

0.6418990830855493

In [43]:
#checking the R-squared score of the new Linear model

r2_score(np.array(normalised_energydata_df['T6']).reshape(-1,1), lin_reg_2.predict(np.array(normalised_energydata_df['T2']).reshape(-1,1)))

0.6418990830855493

### Question 13:

In [28]:
#checking the Mean absolute error metric of our initial Linear model

mean_absolute_error(y_test, lin_reg.predict(X_test))

0.05013362397742954

### Question 14:

In [30]:
#checking the Residual sum of squares metric of our initial Linear model

np.sum(np.square(y_test - lin_reg.predict(X_test)))

45.34762967266377

### Question 15:

In [29]:
#checking the Root mean squared error metric of our initial Linear model

np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_test)))

0.0875144494766171

### Question 16:

In [44]:
#checking the Coefficient of determination metric of our initial Linear model

r2_score(y_test, lin_reg.predict(X_test))

0.14890246319303535

### Question 17:

In [46]:
#creating a function that returns a Pandas DataFrame containing the weight of each feature used by our model (sorted by weight)

def get_weights_df(model, feat, col_name) :
    #this function returns the weight of every feature
    weights = pd.Series(model.coef_, feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = [ 'Features' , col_name]
    weights_df[col_name].round( 3 )
    return weights_df

In [47]:
#calling the get_weights_df() function on our Linear model

get_weights_df(lin_reg, X_train, 'Linear_model_weight')

Unnamed: 0,Features,Linear_model_weight
0,RH_2,-0.456698
1,T_out,-0.32186
2,T2,-0.236178
3,T9,-0.189941
4,RH_8,-0.157595
5,RH_out,-0.077671
6,RH_7,-0.044614
7,RH_9,-0.0398
8,T5,-0.015657
9,T1,-0.003281


### Question 18:

In [36]:
#checking the Root mean squared error metric of our Ridge Regression model

np.sqrt(mean_squared_error(y_test, ridge_reg.predict(X_test)))

0.08753385704628003

### Question 19:

In [48]:
#checking the weights of the features in our Lasso Regression model

get_weights_df(lasso_reg, X_train, 'Lasso_model')

Unnamed: 0,Features,Lasso_model
0,RH_out,-0.049557
1,RH_8,-0.00011
2,T1,0.0
3,Tdewpoint,0.0
4,Visibility,0.0
5,Press_mm_hg,-0.0
6,T_out,0.0
7,RH_9,-0.0
8,T9,-0.0
9,T8,0.0


### Question 20

In [38]:
#checking the Root mean squared error metric of our Lasso Regression model

np.sqrt(mean_squared_error(y_test, lasso_reg.predict(X_test)))

0.09358170467245137