In [1]:
# Notebook-wide display/warning configuration.
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — confirm that is intended.
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (this is why cells such as `df.head(3)` / `df.shape` show two outputs).
InteractiveShell.ast_node_interactivity = "all"

##### Importing Relevant Packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error


In [3]:
def regression_metrics(y_true, y_pred):
    """Summarise regression error as a single formatted string.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    str
        ``'RMSE = <rmse>, RSS = <rss>, MAE = <mae>'`` where RMSE is the square
        root of ``mean_squared_error``, RSS is the sum of squared residuals and
        MAE is ``mean_absolute_error``.
    """
    # Compare values positionally. Without this, subtracting two pandas Series
    # aligns them on their index labels, so differently-indexed inputs yield
    # NaN residuals and a silently wrong RSS; plain lists would raise instead.
    y_true_values = np.asarray(y_true)
    y_pred_values = np.asarray(y_pred)

    rmse = np.sqrt(mean_squared_error(y_true_values, y_pred_values))
    rss = np.sum(np.square(y_true_values - y_pred_values))
    mae = mean_absolute_error(y_true_values, y_pred_values)

    metrics = 'RMSE = {}, RSS = {}, MAE = {}'.format(rmse,rss,mae)
    return metrics

##### Reading the Dataset

In [4]:
# CSV hosted on the UCI machine-learning archive; it is downloaded over HTTPS
# each time this cell runs, so the notebook needs network access.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv'
df = pd.read_csv(url)

In [5]:
# Preview the first three rows and the (rows, columns) shape of the raw frame.
df.head(3)
df.shape

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668


(19735, 29)

In [6]:
# Total number of missing cells: per-column NaN counts, then summed across columns.
df.isna().sum().sum()

0

In [7]:
# Drop the `lights` and `date` columns in a single, non-mutating call.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: executing it a second time is a no-op rather than a
# KeyError on the already-removed columns.
df = df.drop(columns=['lights', 'date'], errors='ignore')

In [8]:
# Scaler instance used in the next cell to rescale the columns of `df`.
scaler = MinMaxScaler()

In [9]:
# Fit the scaler on the whole frame and wrap the result in a DataFrame that
# keeps the original column names.
# NOTE(review): the scaler is fitted before the train/test split performed
# later, so held-out rows (and the target column `Appliances`) influence the
# scaling — confirm this is intended.
new_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [10]:
# Preview the first two scaled rows and confirm the shape after dropping two columns.
new_df.head(2)
new_df.shape

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.32735,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.67729,0.37299,0.097674,0.894737,0.5,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.32735,0.541326,0.225345,0.68214,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.2265,0.678532,0.369239,0.1,0.894737,0.47619,0.894872,0.533937,0.372083,0.372083


(19735, 27)

In [11]:
# Feature matrix: every scaled column except the target `Appliances`.
# NOTE(review): despite its name, `train` holds the features for ALL rows; the
# actual train/test partition is made by train_test_split afterwards.
train = new_df.drop(columns='Appliances')

In [12]:
# Target vector: the scaled `Appliances` column.
target = new_df['Appliances']

In [13]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    test_size=0.3,
    random_state=42,
)

##### Question 2

In [14]:
# Lasso regressor with alpha=0.001; fitted below and evaluated under Question 11.
lasso = Lasso(alpha = 0.001)

In [15]:
# Fit on the training split; the cell output echoes the estimator's parameters.
lasso.fit(X_train, y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [16]:
# Number of fitted coefficients — one per feature column (26).
len(lasso.coef_)

26

##### Question 6

In [17]:
# Ordinary linear regression with default settings, used as the baseline model.
lr = LinearRegression()

In [18]:
# Fit the baseline on the training split.
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# Baseline predictions on the held-out test split; reused by the metric cells below.
y_pred_linear = lr.predict(X_test)

In [20]:
# RMSE / RSS / MAE of the linear model on the test split (values are on the scaled target).
regression_metrics(y_test, y_pred_linear)

'RMSE = 0.08751444947661711, RSS = 45.347629672663786, MAE = 0.050133623977429535'

##### Question 8

In [21]:
# Score of the fitted linear model on the test split.
lr.score(X_test, y_test)

0.14890246319303513

##### Question 10

In [22]:
# Same linear-model metrics as in cell In [20], repeated here for Question 10.
regression_metrics(y_test, y_pred_linear)

'RMSE = 0.08751444947661711, RSS = 45.347629672663786, MAE = 0.050133623977429535'

##### Question 11

In [23]:
# Predictions of the Lasso model (fitted in cell In [15]) on the test split.
y_pred_lasso = lasso.predict(X_test)

In [24]:
# Test-split metrics for Lasso, for comparison with the linear baseline above.
regression_metrics(y_test, y_pred_lasso)

'RMSE = 0.09358170467245137, RSS = 51.85336739590869, MAE = 0.055256639821262235'

##### Question 13

In [25]:
# Ridge regressor with alpha=0.4, compared against the linear baseline below.
ridge = Ridge(alpha = 0.4)

In [26]:
# Fit Ridge on the same training split as the other models.
ridge.fit(X_train, y_train)

Ridge(alpha=0.4, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [27]:
# Ridge predictions on the held-out test split.
y_pred_ridge = ridge.predict(X_test)

In [28]:
# Test-split metrics for Ridge, for comparison with the linear baseline.
regression_metrics(y_test, y_pred_ridge)

'RMSE = 0.08753385704628003, RSS = 45.36774486216903, MAE = 0.050087445840923825'

The RMSE of the Ridge(alpha=0.4) model (≈0.08753) differs from that of the LinearRegression model (≈0.08751) only in the fifth decimal place, so the difference is negligible.

##### Question 17

In [29]:
# Same linear-model metrics as in cell In [20], repeated here for Question 17.
regression_metrics(y_test, y_pred_linear)

'RMSE = 0.08751444947661711, RSS = 45.347629672663786, MAE = 0.050133623977429535'

##### Question 18

In [30]:
# Same test-split score as in cell In [21], rounded to two decimal places.
round(lr.score(X_test, y_test), 2)

0.15