# HAMOYE INTERNSHIP
# STAGE B CODE SOLUTIONS

## Import Necessary Dependencies

In [71]:
import pandas as pd # data manipulations

import numpy as np # scientific computing 

from sklearn.linear_model import LinearRegression, Lasso, Ridge # regression models

from sklearn.model_selection import train_test_split # for splitting datasets into training and test sets

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error # model evalution

from sklearn.preprocessing import MinMaxScaler # scaling

## Read the energy data into a dataframe

In [72]:
# Load the energy dataset; the path is relative to the working directory
csv_path = "energydata_complete.csv"
appliances_df = pd.read_csv(csv_path)

# Preview the first five rows
appliances_df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


## Get a summary information about the dataframe

In [82]:
# Print the column names, non-null counts and dtypes of the dataframe
appliances_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

## Create a helper function to evaluate the performance of a regression model

**This function is called in the cells below to compute the evaluation metrics R^2, MAE, RSS and RMSE.**

In [49]:
def performance(y_test, yhat):
    """Evaluate regression predictions against the true target values.

    Parameters
    ----------
    y_test : array-like
        True target values.
    yhat : array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    dict
        Mapping with the keys "R2-Score", "MAE", "RSS" and "RMSE".
    """
    # Work on plain NumPy arrays so the residuals are always computed
    # position by position: plain lists are accepted, and two pandas objects
    # with different indexes are not silently re-aligned by label.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(yhat)
    residuals = y_true - y_pred

    return {
        "R2-Score": r2_score(y_true, y_pred),  # coefficient of determination
        "MAE": mean_absolute_error(y_true, y_pred),  # mean absolute error
        "RSS": np.sum(np.square(residuals)),  # residual sum of squares
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),  # root mean squared error
    }

## Preprocessing

### Drop the date and lights features from the dataset

In [4]:
# Remove the "date" and "lights" columns and rebind the name to the result
appliances_df = appliances_df.drop(columns=["date", "lights"])
appliances_df.head()  # preview the remaining columns

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,60,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,50,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,50,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,60,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


### Normalize the dataset using MinMaxScaler

In [5]:
# Rescale every column (the target "Appliances" included) with a MinMaxScaler
# using its default settings.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test splits further down, so test rows influence the scaling.
# Changing this would likely alter the answers recorded in this notebook.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(appliances_df)

# Re-attach the original column names to the scaled values
norm_appliances = pd.DataFrame(scaled_values, columns=appliances_df.columns)

norm_appliances.head()

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,0.381691,0.38107,0.841827,0.170594,0.653428,0.173329,0.661412,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,0.381691,0.375443,0.839872,0.170594,0.651064,0.173329,0.660155,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.32735,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,0.380037,0.367487,0.830704,0.170594,0.646572,0.173329,0.655586,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.32735,0.52408,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,0.380037,0.3638,0.833401,0.16431,0.641489,0.164175,0.650788,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.32735,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,0.380037,0.361859,0.848264,0.170594,0.639362,0.164175,0.650788,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611


# Question 12

**From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value to two decimal places?**

# Answer: 0.64

## Solution:

In [75]:
# Simple regression: predict T6 from the single feature T2
X = norm_appliances[["T2"]]  # double brackets keep X as a one-column DataFrame
y = norm_appliances["T6"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create and train the model in one step (fit returns the estimator itself)
simple_linear_model = LinearRegression().fit(train_X, train_y)

# Predictions for the held-out rows
yhat = simple_linear_model.predict(test_X)

**Determine the R^2 of the model**

In [76]:
# R^2 of the predictions on the test set, reported to two decimal places
r2_simple = r2_score(test_y, yhat)
round(r2_simple, 2)

0.64

# Question 13 - 17

### Create a Linear Regression model and fit the training data to it

In [78]:
# Multiple regression: every column except the target is a predictor
X = norm_appliances.drop(columns=["Appliances"])
y = norm_appliances["Appliances"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Create and train the model in one step (fit returns the estimator itself)
linear_model = LinearRegression().fit(train_X, train_y)

# Predictions for the held-out rows
yhat = linear_model.predict(test_X)

# Question 13

**What is the Mean Absolute Error (in two decimal places)?**

# Answer: 0.05

## Solution:

In [50]:
# Mean absolute error of the predictions in yhat, to 2 d.p.
mae = performance(test_y, yhat)["MAE"]
round(mae, 2)

0.05

# Question 14

**What is the Residual Sum of Squares (in two decimal places)?**

# Answer: 45.35

## Solution:

In [51]:
# Residual sum of squares of the predictions in yhat, to 2 d.p.
rss = performance(test_y, yhat)["RSS"]
round(rss, 2)

45.35

# Question 15

**What is the Root Mean Squared Error (in three decimal places)?**

# Answer: 0.088

## Solution:

In [52]:
# Root mean squared error of the predictions in yhat, to 3 d.p.
rmse = performance(test_y, yhat)["RMSE"]
round(rmse, 3)

0.088

# Question 16

**What is the Coefficient of Determination (in two decimal places)?**

# Answer: 0.15

## Solution:

In [55]:
# Coefficient of determination of the predictions in yhat, to 2 d.p.
r2 = performance(test_y, yhat)["R2-Score"]
round(r2, 2)

0.15

# Question 17

**Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?**

# Answer: RH_2, RH_1

## Solution:

**Obtain the feature weights and restructure it into a Series "weights"**

In [80]:
# Pair each learned coefficient with the name of its feature
weights = pd.Series(data=linear_model.coef_, index=X.columns)
weights  # show the full table of weights

T1            -0.003281
RH_1           0.553547
T2            -0.236178
RH_2          -0.456698
T3             0.290627
RH_3           0.096048
T4             0.028981
RH_4           0.026386
T5            -0.015657
RH_5           0.016006
T6             0.236425
RH_6           0.038049
T7             0.010319
RH_7          -0.044614
T8             0.101995
RH_8          -0.157595
T9            -0.189941
RH_9          -0.039800
T_out         -0.321860
Press_mm_hg    0.006839
RH_out        -0.077671
Windspeed      0.029183
Visibility     0.012307
Tdewpoint      0.117758
rv1            0.000770
rv2            0.000770
dtype: float64

**Sort the weight Series in ascending order and retrieve the first and last elements**

In [81]:
# Sort ascending, then take the first (lowest) and last (highest) entries by
# position. ``.iloc`` is used because the Series is labelled by feature name;
# plain ``[[0, -1]]`` relies on the deprecated integer-as-position fallback
# of ``Series.__getitem__``.
weights.sort_values().iloc[[0, -1]]

RH_2   -0.456698
RH_1    0.553547
dtype: float64

# Question 18

**Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?**

# Answer: No. The RMSE values of the linear and ridge models are the same to 3 d.p.

## Solution:

In [62]:
# Ridge regression with regularisation strength alpha = 0.4
ridge = Ridge(alpha=0.4)

ridge.fit(train_X, train_y)  # fit the model with the training data

# Keep the ridge predictions under their own name: the Question 13-16 cells
# read the global ``yhat`` and would report ridge metrics if this cell
# overwrote it and they were re-run afterwards.
ridge_yhat = ridge.predict(test_X)

round(performance(test_y, ridge_yhat)["RMSE"], 3)  # test-set RMSE, to 3 d.p.

0.088

# Question 19 - 20

# Question 19

**Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?**

# Answer: 4

## Solution:

In [63]:
# Lasso regression with alpha = 0.001, trained on the training split
# (fit returns the estimator itself, so creation and training are chained)
lasso = Lasso(alpha=0.001).fit(train_X, train_y)

# Label each lasso coefficient with its feature name
weights = pd.Series(data=lasso.coef_, index=X.columns)

**Get the number of features with non-zero weights**

In [66]:
# Count the entries of the weights Series that differ from zero
int((weights != 0).sum())

4

# Question 20

**What is the new RMSE with the Lasso Regression (in 3 decimal places)?**

# Answer: 0.094

## Solution:

In [70]:
# Keep the lasso predictions under their own name so the global ``yhat``
# read by the Question 13-16 cells is not overwritten by this cell.
lasso_yhat = lasso.predict(test_X)

round(performance(test_y, lasso_yhat)["RMSE"], 3)  # test-set RMSE via the helper, to 3 d.p.

0.094

# Notebook by: OLALEKE, MOSHOOD ADEGBOYEGA