In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide seaborn theme: dark grid background, "coolwarm" palette,
# fonts scaled up by 20%, and matplotlib colour shorthands remapped.
sns.set(
    context="notebook",
    style='darkgrid',
    palette="coolwarm",
    font_scale=1.2,
    color_codes=True,
)

### Attribute Information:

date, timestamp in the format year-month-day hour:minute:second <br>
Appliances, energy use in Wh <br>
lights, energy use of light fixtures in the house in Wh <br>
T1, Temperature in kitchen area, in Celsius <br>
RH_1, Humidity in kitchen area, in % <br>
T2, Temperature in living room area, in Celsius <br>
RH_2, Humidity in living room area, in % <br>
T3, Temperature in laundry room area, in Celsius <br>
RH_3, Humidity in laundry room area, in % <br>
T4, Temperature in office room, in Celsius <br>
RH_4, Humidity in office room, in % <br>
T5, Temperature in bathroom, in Celsius <br>
RH_5, Humidity in bathroom, in % <br>
T6, Temperature outside the building (north side), in Celsius <br>
RH_6, Humidity outside the building (north side), in % <br>
T7, Temperature in ironing room , in Celsius <br>
RH_7, Humidity in ironing room, in % <br>
T8, Temperature in teenager room 2, in Celsius <br>
RH_8, Humidity in teenager room 2, in % <br>
T9, Temperature in parents room, in Celsius <br>
RH_9, Humidity in parents room, in % <br>
T_out, Temperature outside (from Chievres weather station), in Celsius <br>
Pressure (from Chievres weather station), in mm Hg <br>
RH_out, Humidity outside (from Chievres weather station), in % <br>
Wind speed (from Chievres weather station), in m/s <br>
Visibility (from Chievres weather station), in km <br>
Tdewpoint (from Chievres weather station), in °C <br>
rv1, Random variable 1, nondimensional <br>
rv2, Random variable 2, nondimensional <br>

In [124]:
# Read the energy CSV (expected in the current working directory) into a DataFrame
energyData = pd.read_csv(filepath_or_buffer='energydata_complete.csv')

# Preview the first two records
energyData.head(n=2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195


In [20]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

### Question 12
From the dataset,  
fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) 
and the temperature outside the building (y = T6). <br>
What is the $R^2$ value, to two decimal places?




In [115]:
# Predictor: living-room temperature (T2) reshaped into a single-column 2-D array
X = energyData['T2'].to_numpy().reshape(-1, 1)
# Response: temperature outside the building (T6), kept as a pandas Series
y = energyData['T6']

# Hold out 30% of the rows for testing; the fixed seed makes the split reproducible
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [120]:
# Fit ordinary least squares on the training split (fit() returns the estimator)
linear = LinearRegression().fit(X_tr, y_tr)

# Predict the held-out rows
predictions = linear.predict(X_te)

# Test-set R^2, rounded to two decimal places
r_square = round(r2_score(y_te, predictions), 2)

# Report the score
print(f"R squared: {r_square}")

R squared: 0.64


### Question 13
Normalize the dataset using the MinMaxScaler after removing the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer the following questions:

What is the Mean Absolute Error (in two decimal places)?

In [126]:
# Exclude the 'date' and 'lights' columns, as required by the exercise
new_energy_data = energyData.drop(columns=['date', 'lights'])
new_energy_data.head(n=1)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433


In [128]:
# Rescale every column (target included) with MinMaxScaler.
# NOTE: the scaler is fitted on the full dataset, before the train/test split,
# because the exercise asks for the dataset to be normalized first.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(new_energy_data)
normalized_data = pd.DataFrame(scaled_values, columns=new_energy_data.columns)

# Separate the target ('Appliances') from the remaining feature columns
target = normalized_data['Appliances']
features_data = normalized_data.drop(columns='Appliances')

In [130]:
# 70/30 train-test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    features_data, target, test_size=0.3, random_state=42
)

# Fit a multiple linear regression on the training split
lr_reg = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows
ypred = lr_reg.predict(X_test)

# Mean Absolute Error on the test split, reported to two decimal places
mae = mean_absolute_error(y_true=y_test, y_pred=ypred)
print(f'MAE: {round(mae,2)}')

MAE: 0.05


### Question 14
What is the Residual Sum of Squares (in two decimal places)?


In [131]:
# Residual Sum of Squares: sum of squared (actual - predicted) over the test split
residuals = y_test - ypred
rss = np.sum(np.square(residuals))
print(f'Residual Sum of Squares: {round(rss, 2)}')

Residual Sum of Squares: 45.35


### Question 15
What is the Root Mean Squared Error (in three decimal places)?

In [132]:
# Root Mean Squared Error of the linear model on the test split (three decimal places)
mse = mean_squared_error(y_true=y_test, y_pred=ypred)
rmse = np.sqrt(mse)
print(f'RMSE: {round(rmse, 3)}')

RMSE: 0.088


### Question 16
What is the Coefficient of Determination (in two decimal places)?

In [133]:
# Coefficient of determination (R^2) of the linear model on the test split
r_squared = r2_score(y_test, ypred)

# Report it to two decimal places
print(f"R squared: {round(r_squared, 2)}")

R squared: 0.15


### Question 17
Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [161]:
def feature_weights(model, features):
    """Return a fitted linear model's coefficients as a sorted one-column table.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``coef_`` attribute. ``coef_`` may be 1-D
        or 2-D with a single row (shape ``(1, n_features)``, which
        scikit-learn produces when the target is passed as a 2-D array).
    features : sequence of str
        Feature names, in the same order as the coefficients.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with a single ``'Weight'`` column sorted from
        the highest weight to the lowest.

    Raises
    ------
    ValueError
        If the number of coefficients does not match the number of feature
        names (for example, for a multi-target model).
    """
    # Flatten so that a (1, n_features) coefficient matrix is handled like a
    # plain 1-D coefficient vector; 1-D input is left unchanged.
    model_coef = np.ravel(model.coef_)

    if len(model_coef) != len(features):
        raise ValueError(
            "Expected {} coefficients (one per feature) but the model has {}".format(
                len(features), len(model_coef)
            )
        )

    # Pair each weight with its feature name and order from highest to lowest
    weights = pd.Series(model_coef, index=features).sort_values(ascending=False)

    # Expose the weights as a DataFrame with a named column
    return weights.to_frame(name='Weight')

In [162]:
# Feature names, in the column order the linear model was trained on
features = features_data.columns

# Weights of the linear regression model, highest first
feature_weights(model=lr_reg, features=features)

Unnamed: 0,Weight
RH_1,0.553547
T3,0.290627
T6,0.236425
Tdewpoint,0.117758
T8,0.101995
RH_3,0.096048
RH_6,0.038049
Windspeed,0.029183
T4,0.028981
RH_4,0.026386


### Question 18
Train a ridge regression model with an alpha value of 0.4. <br>
Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [92]:
from sklearn.linear_model import Ridge

# Fit a ridge regression (alpha = 0.4) on the training split
ridge_reg = Ridge(alpha=0.4).fit(X_train, y_train)

# Evaluate the model on the held-out test split
ridge_pred = ridge_reg.predict(X_test)

# RMSE of the ridge model, displayed to three decimal places
ridge_mse = mean_squared_error(y_true=y_test, y_pred=ridge_pred)
rmse = np.sqrt(ridge_mse)
round(rmse, 3)

0.088

#### Observation from the above RMSE
Ridge regression with `alpha = 0.4` yields an RMSE of **0.088**, the same value (to three decimal places) as the plain linear regression model, so there is no change at this precision.

### Question 19
Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it.  
How many of the features have non-zero feature weights?

In [163]:
from sklearn.linear_model import Lasso

# Fit a lasso regression (alpha = 0.001) on the training split
lasso_reg = Lasso(alpha=0.001).fit(X_train, y_train)

# Show the lasso weights per feature, reusing the helper defined earlier
feature_weights(model=lasso_reg, features=features)

Unnamed: 0,Weight
RH_1,0.01788
Windspeed,0.002912
T1,0.0
T7,-0.0
rv1,-0.0
Tdewpoint,0.0
Visibility,0.0
Press_mm_hg,-0.0
T_out,0.0
RH_9,-0.0


### Question 20
What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)

In [164]:
# Predict the test split with the fitted lasso model
lasso_pred = lasso_reg.predict(X_test)

# RMSE of the lasso model, reported to three decimal places
lasso_mse = mean_squared_error(y_true=y_test, y_pred=lasso_pred)
rmse = np.sqrt(lasso_mse)
print(f'Lasso RMSE: {round(rmse, 3)}')

Lasso RMSE: 0.094
