### Import Dependencies

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

### Load Dataset

In [2]:
energy = pd.read_csv("./energydata_complete.csv")

In [3]:
energy.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


### Question 12

From the dataset, fit a linear model on the relationship between the temperature in the living room in Celsius (x = T2) and the temperature outside the building (y = T6). What is the R^2 value in two d.p.?

In [4]:
q12_data = energy[["T2", "T6"]]

In [5]:
q12_data

Unnamed: 0,T2,T6
0,19.200000,7.026667
1,19.200000,6.833333
2,19.200000,6.560000
3,19.200000,6.433333
4,19.200000,6.366667
...,...,...
19730,25.890000,24.796667
19731,25.754000,24.196667
19732,25.628571,23.626667
19733,25.414000,22.433333


In [7]:
q12_data.isnull().sum()

T2    0
T6    0
dtype: int64

In [11]:
x = q12_data["T2"].values.reshape(-1, 1)
y = q12_data["T6"].values

# Create a linear regression model
q12_data_model = LinearRegression()

# Fit the model to the data
q12_data_model.fit(x, y)

# Predict the dependent variable using the fitted model
y_pred = q12_data_model.predict(x)

# Calculate the R-squared value and round it to two decimal places
r_squared = round(r2_score(y, y_pred), 2)

# Print the R-squared value
print("R-squared value:", r_squared)

R-squared value: 0.64


### Question 13

Normalize the dataset using the MinMaxScaler after removing the following columns: [“date”, “lights”]. The target variable is “Appliances”. Use a 70-30 train-test set split with a random state of 42 (for reproducibility). Run a multiple linear regression using the training set and evaluate your model on the test set. Answer the following questions:

What is the Mean Absolute Error (in two decimal places)?

In [33]:
q13_data = energy.drop(columns=["date", "lights"])

for col in q13_data.columns:
    #find minimum and maximum of that column
    minimum = q13_data[col].min()
    maximum = q13_data[col].max()
    # perform min-max normalization
    q13_data[col] = (q13_data[col]-minimum) / (maximum-minimum)
# Split the dataset into train and test sets
X = q13_data.drop(columns=["Appliances"])
y = q13_data["Appliances"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Create and fit a multiple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the Mean Absolute Error (MAE) on the test set and round it to two decimal places
mae = round(mean_absolute_error(y_test, y_pred), 2)

# Print the Mean Absolute Error (MAE)
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 0.05


### Q14

In [34]:
# Calculate the Residual Sum of Squares (RSS) on the test set and round it to two decimal places
rss = round(mean_squared_error(y_test, y_pred) * len(y_test), 2)

# Print the Residual Sum of Squares (RSS)
print("Residual Sum of Squares (RSS):", rss)

Residual Sum of Squares (RSS): 45.35


### Q15

In [35]:
# Calculate the Root Mean squared error
rmse = round(mean_squared_error(y_test, y_pred, squared=False), 3)

# Print the Root Mean Squared Error (RMSE)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.088


### Q16

In [36]:
# Calculate the coefficient of determination (R-squared) and round it to two decimal places
r_squared = round(r2_score(y_test, y_pred), 2)

# Print the coefficient of determination (R-squared)
print("Coefficient of Determination (R-squared):", r_squared)

Coefficient of Determination (R-squared): 0.15


### Q17

Obtain the feature weights from your linear model above. Which features have the lowest and highest weights respectively?

In [37]:
# feature weights from the linear model
feature_weights = pd.DataFrame({'Feature': X_train.columns, 'Weight': model.coef_})

# Sort the feature weights in ascending order
feature_weights_sorted = feature_weights.sort_values('Weight')

# Get the feature with the lowest weight
lowest_weight_feature = feature_weights_sorted.iloc[0]['Feature']

# Get the feature with the highest weight
highest_weight_feature = feature_weights_sorted.iloc[-1]['Feature']

# Print the features with the lowest and highest weights
print("Feature with the lowest weight:", lowest_weight_feature)
print("Feature with the highest weight:", highest_weight_feature)

Feature with the lowest weight: RH_2
Feature with the highest weight: RH_1


### Q18

Train a ridge regression model with an alpha value of 0.4. Is there any change to the root mean squared error (RMSE) when evaluated on the test set?

In [38]:
# Create and fit a Ridge regression model with alpha = 0.4
ridge_model = Ridge(alpha=0.4)
ridge_model.fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_model.predict(X_test)

# Calculate the Root Mean squared error
rmse = round(mean_squared_error(y_test, y_pred, squared=False), 3)

# Print the Root Mean Squared Error (RMSE)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.088


### Q19

Train a lasso regression model with an alpha value of 0.001 and obtain the new feature weights with it. How many of the features have non-zero feature weights?

In [39]:
# Create and fit a Ridge regression model with alpha = 0.4
lasso_model = Lasso(alpha=0.4)
lasso_model.fit(X_train, y_train)

# Predict on the test set
y_pred = lasso_model.predict(X_test)

# feature weights from the linear model
feature_weights = pd.DataFrame({'Feature': X_train.columns, 'Weight': lasso_model.coef_})

# Sort the feature weights in ascending order
feature_weights_sorted = feature_weights.sort_values('Weight')

feature_weights_sorted

Unnamed: 0,Feature,Weight
0,T1,0.0
23,Tdewpoint,0.0
22,Visibility,0.0
21,Windspeed,0.0
20,RH_out,-0.0
19,Press_mm_hg,-0.0
18,T_out,0.0
17,RH_9,-0.0
16,T9,0.0
15,RH_8,-0.0


### Q20

What is the new RMSE with the lasso regression? (Answer should be in three (3) decimal places)

In [40]:
# Calculate the Root Mean squared error
rmse = round(mean_squared_error(y_test, y_pred, squared=False), 3)

# Print the Root Mean Squared Error (RMSE)
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 0.095
