In [103]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_absolute_error as MAE, mean_squared_error as MSE, r2_score

In [104]:
# Load the energy dataset (energydata_complete.csv) straight from the UCI archive URL.
# Requires network access; every later cell reads from this `data` frame.
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv")

In [105]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the $R^2$ value, to two decimal places?

In [106]:
# One shared MinMaxScaler instance. The cells below each call `fit_transform`
# on it, so it is refitted every time and only remembers the most recent fit.
scaler = MinMaxScaler()

In [107]:
# Single-feature regression: living-room temperature (T2) -> outside temperature (T6).
X = data[["T2"]]   # double brackets keep X two-dimensional, as the estimator expects
y = data["T6"]

# Rescale the predictor with the shared scaler (refits it on T2 only).
X_scaled = scaler.fit_transform(X)

# Fit ordinary least squares on the full data set.
lin_model = LinearRegression()
lin_model.fit(X_scaled, y)

# `score` returns R^2, evaluated here on the same data the model was fitted on.
r_squared = lin_model.score(X_scaled, y)
print(round(r_squared, 2))

0.64


#### Normalize the dataset using the MinMaxScaler after removing the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer the following questions:

In [108]:
# Target is appliance energy use; predictors are every column except
# the timestamp, the lights reading and the target itself.
y = data["Appliances"]
X = data.drop(columns=["date", "lights", "Appliances"])

# Normalise all predictors with the shared scaler.
# NOTE(review): the scaler is fitted on the full data set *before* the split,
# as the exercise asks; this lets test-set min/max values influence the scaling.
X_scaled = scaler.fit_transform(X)

# 70:30 train/test split, seeded for reproducibility.
X_train, X_test, y_train, y_test = tts(
    X_scaled,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit the multiple linear regression on the training portion only.
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

# Predictions for the held-out rows; the metric cells below read `y_pred`.
y_pred = lin_model.predict(X_test)

What is the Mean Absolute Error (in two decimal places)?

In [109]:
# Mean absolute error of the current `y_pred` against the held-out targets.
mae = MAE(y_test, y_pred)
print(round(mae, 2))

53.64


What is the Residual Sum of Squares (in two decimal places)?

In [110]:
# evaluate with RSS

def RSS(y_test, y_pred):
    """Return the residual sum of squares between observed and predicted values.

    Parameters
    ----------
    y_test : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    numpy.float64
        ``sum((y_test - y_pred) ** 2)`` computed element by element.
    """
    # Convert both inputs to plain float arrays first so the subtraction is
    # positional: plain lists are accepted, and two pandas Series with
    # different indices are not silently aligned by label (which would yield
    # NaNs that the sum then skips).
    residuals = np.asarray(y_test, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sum(np.square(residuals))

# Residual sum of squares of the current `y_pred` on the test set.
rss = RSS(y_test, y_pred)
print(round(rss, 2))

51918501.21


What is the Root Mean Squared Error (in three decimal places)?

In [111]:
# Evaluate with RMSE.
# The square root is taken explicitly instead of passing `squared=False`:
# that argument was deprecated in scikit-learn 1.4 and later removed, whereas
# sqrt(MSE) gives the same value on every version.
print(round( np.sqrt(MSE(y_test, y_pred)), 3 ))

93.64


What is the Coefficient of Determination (in two decimal places)?

In [112]:
# Coefficient of determination on the test set
# (r2_score on the predictions; the estimator's .score() is the alternative).
r2 = r2_score(y_test, y_pred)
print(round(r2, 2))

0.15


Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [113]:
# Pair each fitted coefficient with its column name. Tuples compare on their
# first element, so min()/max() pick the smallest and largest weight.
weights = [(coef, column) for coef, column in zip(lin_model.coef_, X.columns)]
lowest_weight = min(weights)
highest_weight = max(weights)
print(lowest_weight, highest_weight)

(-488.66680472219576, 'RH_2') (592.294861827345, 'RH_1')


Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [114]:
# Train ridge regression (alpha=0.4) on the same training split.
ridge = Ridge(alpha=0.4, random_state=42).fit(X_train, y_train)

# Predict X_test. This rebinds `y_pred`, so metric cells re-run after this
# point score the ridge model, not the plain linear model.
y_pred = ridge.predict(X_test)

# Evaluate with RMSE. sqrt(MSE) is used instead of `squared=False`, which was
# deprecated in scikit-learn 1.4 and later removed; the value is identical.
print(round( np.sqrt(MSE(y_test, y_pred)), 3 ), "\nAs you can see there is a slight difference")

93.661 
As you can see there is a slight difference


Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [115]:
# Train lasso regression (alpha=0.001) on the training split.
lasso = Lasso(alpha=0.001, random_state=42)
lasso.fit(X_train, y_train)

# Count the features whose lasso coefficient was not shrunk to exactly zero.
print(np.count_nonzero(lasso.coef_))

25


What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)

In [116]:
# Predict X_test with the lasso model (rebinds `y_pred`).
y_pred = lasso.predict(X_test)

# Evaluate with RMSE. sqrt(MSE) is used instead of `squared=False`, which was
# deprecated in scikit-learn 1.4 and later removed; the value is identical.
print(round( np.sqrt(MSE(y_test, y_pred)), 3 ))

93.641
