In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge

## Dataset Description
  ### Appliances Energy Prediction Dataset

The dataset for the remainder of this quiz is the Appliances Energy Prediction data. The data was recorded at 10-minute intervals for about 4.5 months. The house temperature and humidity conditions were monitored with a ZigBee wireless sensor network. Each wireless node transmitted the temperature and humidity conditions about every 3.3 minutes. Then, the wireless data was averaged over 10-minute periods. The energy data was logged every 10 minutes with m-bus energy meters. Weather data from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis (rp5.ru) and merged with the experimental data sets using the date and time column. Two random variables have been included in the data set for testing the regression models and for filtering out non-predictive attributes (parameters). The attribute information can be seen below.

#### Attribute Information:

* Date, time year-month-day hour:minute:second

* Appliances, energy use in Wh

* lights, energy use of light fixtures in the house in Wh

* T1, Temperature in kitchen area, in Celsius

* RH_1, Humidity in kitchen area, in %

* T2, Temperature in living room area, in Celsius

* RH_2, Humidity in living room area, in %

* T3, Temperature in laundry room area, in Celsius

* RH_3, Humidity in laundry room area, in %

* T4, Temperature in office room, in Celsius

* RH_4, Humidity in office room, in %

* T5, Temperature in bathroom, in Celsius

* RH_5, Humidity in bathroom, in %

* T6, Temperature outside the building (north side), in Celsius

* RH_6, Humidity outside the building (north side), in %

* T7, Temperature in ironing room, in Celsius

* RH_7, Humidity in ironing room, in %

* T8, Temperature in teenager room 2, in Celsius

* RH_8, Humidity in teenager room 2, in %

* T9, Temperature in parents room, in Celsius

* RH_9, Humidity in parents room, in %

* T_out, Temperature outside (from Chievres weather station), in Celsius

* Pressure (from Chievres weather station), in mm Hg

* RH_out, Humidity outside (from Chievres weather station), in %

* Wind speed (from Chievres weather station), in m/s

* Visibility (from Chievres weather station), in km

* Tdewpoint (from Chievres weather station), in °C

* rv1, Random variable 1, nondimensional

* rv2, Random variable 2, nondimensional

In [3]:
# Read the raw appliances-energy readings into a DataFrame and display it.
DATA_PATH = 'energydata_complete.csv'
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(19735, 29)

In [5]:
# Pairwise Pearson correlations between the numeric columns.
# The `date` column is read as strings, and pandas >= 2.0 raises on
# `df.corr()` instead of silently dropping non-numeric columns, so the
# numeric columns are selected explicitly (works on every pandas version).
# A correlation matrix is symmetric, so no transpose is needed.
df.select_dtypes(include='number').corr()

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
Appliances,1.0,0.197278,0.055447,0.086031,0.120073,-0.060465,0.08506,0.036292,0.040281,0.016965,...,0.01001,-0.051462,0.099155,-0.034885,-0.152282,0.087122,0.00023,0.015353,-0.011145,-0.011145
lights,0.197278,1.0,-0.023528,0.106968,-0.005622,0.050985,-0.097393,0.131161,-0.008859,0.114936,...,-0.157592,-0.008766,-0.074424,-0.010576,0.068543,0.060281,0.020038,-0.036322,0.000521,0.000521
T1,0.055447,-0.023528,1.0,0.164006,0.836834,-0.002509,0.892402,-0.02855,0.877001,0.097861,...,0.844777,0.071756,0.682846,-0.150574,-0.345481,-0.087654,-0.07621,0.571309,-0.006203,-0.006203
RH_1,0.086031,0.106968,0.164006,1.0,0.269839,0.797535,0.25323,0.844677,0.10618,0.880359,...,0.115263,0.764001,0.340767,-0.293957,0.274126,0.204932,-0.021057,0.639106,-0.000699,-0.000699
T2,0.120073,-0.005622,0.836834,0.269839,1.0,-0.16561,0.735245,0.121497,0.762066,0.231563,...,0.675535,0.157346,0.792255,-0.133028,-0.505291,0.052495,-0.069721,0.582602,-0.011087,-0.011087
RH_2,-0.060465,0.050985,-0.002509,0.797535,-0.16561,1.0,0.137319,0.678326,-0.047304,0.721435,...,0.054544,0.676467,0.033674,-0.255646,0.584911,0.06919,-0.005368,0.499152,0.006275,0.006275
T3,0.08506,-0.097393,0.892402,0.25323,0.735245,0.137319,1.0,-0.011234,0.852778,0.122737,...,0.901324,0.134602,0.699417,-0.189974,-0.281718,-0.100776,-0.10231,0.645886,-0.005194,-0.005194
RH_3,0.036292,0.131161,-0.02855,0.844677,0.121497,0.678326,-0.011234,1.0,-0.140457,0.898978,...,-0.19527,0.833538,0.118207,-0.233274,0.356192,0.263188,0.017041,0.414387,-0.000477,-0.000477
T4,0.040281,-0.008859,0.877001,0.10618,0.762066,-0.047304,0.852778,-0.140457,1.0,-0.04865,...,0.889439,-0.025549,0.663478,-0.075292,-0.388602,-0.185747,-0.104768,0.519471,-0.001815,-0.001815
RH_4,0.016965,0.114936,0.097861,0.880359,0.231563,0.721435,0.122737,0.898978,-0.04865,1.0,...,-0.044518,0.856591,0.293289,-0.250748,0.336813,0.300192,0.002636,0.616509,-0.001787,-0.001787


In [6]:
# Single-feature regression: living-room temperature (T2) as the predictor,
# temperature outside the building (T6) as the target.
X = df[['T2']].to_numpy()  # double brackets keep a 2-D (n_samples, 1) array
y = df.loc[:, 'T6']

In [7]:
# Create an (unfitted) ordinary least-squares linear regression model.
model = LinearRegression()

In [8]:
# Fit the model on the full single-feature data (X -> y); no train/test
# split is used for this first regression.
model.fit(X,y)

LinearRegression()

In [9]:

# In-sample predictions: the model predicts on the same X it was fitted on.
y_pred = model.predict(X)

In [10]:
# Coefficient of determination (R^2) of the single-feature fit.
r2 = r2_score(y_true=y, y_pred=y_pred)
r2

0.6418990830855492

In [11]:
# Multi-feature setup: predict appliance energy use from every remaining
# column. The timestamp, the `lights` column and the target itself are
# excluded from the feature matrix.
y = df['Appliances']
X = df.drop(columns=['date', 'lights', 'Appliances'])

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Learn per-feature min/max from the training split only, so no information
# from the test split leaks into the scaling. `fit` returns the scaler itself.
scaler = MinMaxScaler().fit(X_train)
scaler

MinMaxScaler()

In [14]:
# Apply the fitted scaler to both splits.
# NOTE: this overwrites X_train / X_test, so re-running the cell would scale
# the already-scaled data a second time; re-run the split cell first.
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

In [15]:
# The three linear models compared below: plain least squares, Lasso
# (regularisation strength alpha=0.001) and Ridge (alpha=0.4).
lr = LinearRegression()
lasso = Lasso(alpha=0.001)
ridge = Ridge(alpha=0.4)

In [16]:
def create_model(model,dp):
    """Fit ``model`` on the training split and report its test-set metrics.

    Relies on the notebook-level names ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` (the train/test split) and ``X`` (the feature frame whose
    column names label the coefficients), all defined in earlier cells.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and, once fitted, ``coef_``.
        It is fitted in place.
    dp : int
        Number of decimal places the printed metrics are rounded to.

    Returns
    -------
    pandas.DataFrame
        A single ``coef`` column indexed by feature name, sorted ascending.
    """
    model = model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = np.round(mean_absolute_error(y_test, y_pred), dp)
    print('MAE: {}'.format(mae))

    # This value is the mean squared error (it was previously printed under
    # the label "RRS"); compute it once and reuse it for the RMSE.
    mse_raw = mean_squared_error(y_test, y_pred)
    mse = np.round(mse_raw, dp)
    print('MSE: {}'.format(mse))

    # `mean_squared_error(..., squared=False)` is deprecated in
    # scikit-learn 1.4 and removed in 1.6; the square root of the MSE is the
    # same quantity and works on every version.
    rmse = np.round(np.sqrt(mse_raw), dp)
    print('RMSE: {}'.format(rmse))

    r2 = np.round(r2_score(y_test, y_pred), dp)
    print('R2: {}'.format(r2))

    coef_df = pd.DataFrame(
        index=X.columns, data=model.coef_, columns=['coef']
    ).sort_values(by='coef')
    return coef_df
    

In [17]:
# Ordinary least-squares baseline; metrics rounded to 2 decimal places.
create_model(lr,2)

MAE: 53.64
RRS: 8768.54
RMSE: 93.64
R2: 0.15


Unnamed: 0,coef
RH_2,-469.521362
T_out,-344.389845
T2,-252.710373
T9,-203.236627
RH_8,-168.627161
RH_out,-83.107599
RH_7,-47.563552
RH_9,-42.586339
T5,-16.752822
T1,-3.510725


In [18]:
# Ridge regression; metrics rounded to 2 decimal places.
create_model(ridge,2)

MAE: 53.59
RRS: 8772.2
RMSE: 93.66
R2: 0.15


Unnamed: 0,coef
RH_2,-428.648491
T_out,-280.234677
T2,-219.668255
T9,-201.608073
RH_8,-167.860585
RH_out,-58.10719
RH_7,-48.695371
RH_9,-43.976024
T5,-21.051856
T1,-17.321012


In [19]:
# Lasso regression; metrics rounded to 3 decimal places.
create_model(lasso,3)

MAE: 53.63
RRS: 8768.576
RMSE: 93.641
R2: 0.149


Unnamed: 0,coef
RH_2,-465.82105
T_out,-332.573565
T2,-249.638653
T9,-202.480809
RH_8,-168.697937
RH_out,-77.988779
RH_7,-47.20961
RH_9,-42.446053
T5,-17.113012
T1,-4.055859
