In [122]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [97]:
# Load the dataset
pd.set_option("display.max_columns", None)
data = pd.read_csv("energydata_complete.csv")

In [98]:
data.sample(2)

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
4839,2016-02-14 07:30:00,30,0,20.29,40.29,18.6,41.4,21.39,40.745,18.6,41.0,17.89,51.79,2.59,92.0,19.2,39.245,21.144444,46.765,18.2,45.5,2.4,735.2,97.0,1.5,40.0,2.0,5.454801,5.454801
17671,2016-05-13 10:10:00,60,20,24.79,48.2,29.166667,38.596667,27.452857,42.471429,25.62,45.036,24.033333,49.59,20.033333,19.566667,24.79,41.826667,25.347143,46.852857,24.2,46.072,17.3,747.433333,75.666667,3.833333,22.166667,12.933333,38.758229,38.758229


### Explore dataset

In [99]:
data.shape

(19735, 29)

In [100]:
data.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

In [101]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T5           19735 non-null  float64
 12  RH_5         19735 non-null  float64
 13  T6           19735 non-null  float64
 14  RH_6         19735 non-null  float64
 15  T7           19735 non-null  float64
 16  RH_7         19735 non-null  float64
 17  T8           19735 non-null  float64
 18  RH_8         19735 non-null  float64
 19  T9  

### Splitting the data into target variable and predictor

In [102]:
x = data[["T2"]]
x

Unnamed: 0,T2
0,19.200000
1,19.200000
2,19.200000
3,19.200000
4,19.200000
...,...
19730,25.890000
19731,25.754000
19732,25.628571
19733,25.414000


In [103]:
y = data["T6"]
y

0         7.026667
1         6.833333
2         6.560000
3         6.433333
4         6.366667
           ...    
19730    24.796667
19731    24.196667
19732    23.626667
19733    22.433333
19734    21.026667
Name: T6, Length: 19735, dtype: float64

### Split the data into train and test set

In [104]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.3, random_state=42)

### Fitting the model into training data

In [105]:
model = LinearRegression()
model.fit(x_train,y_train)

### Predict the target value

In [106]:
predicted_values = model.predict(x_test)

In [107]:
predicted_values

array([ 2.15578912, 10.01116055,  1.87391554, ...,  4.24758774,
        8.69822311,  4.9893603 ])

### Root mean square

In [109]:
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)

3.63

In [110]:
data = data.drop(columns = ["date", "lights"])

In [111]:
data.sample(2)

Unnamed: 0,Appliances,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
3902,100,21.93,44.2,21.29,41.5,21.633333,43.126667,18.7,41.0,18.89,54.0,6.563333,63.133333,19.1,34.911111,22.5,43.464444,19.1,41.626667,7.366667,747.033333,66.333333,8.0,40.0,1.466667,20.283818,20.283818
18378,50,23.29,41.966667,21.33,44.156,24.323333,38.56,23.066667,39.7,21.7,48.06,14.433333,30.86,23.2,39.326667,23.37,42.7,22.6,42.79,11.8,752.8,81.0,3.0,26.0,8.6,41.541864,41.541864


In [112]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(data)

In [113]:
x = data.drop(columns = ["Appliances"])
x

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,17.166667,55.200000,7.026667,84.256667,17.200000,41.626667,18.2000,48.900000,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,17.166667,55.200000,6.833333,84.063333,17.200000,41.560000,18.2000,48.863333,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,17.166667,55.090000,6.560000,83.156667,17.200000,41.433333,18.2000,48.730000,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,17.166667,55.090000,6.433333,83.423333,17.133333,41.290000,18.1000,48.590000,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,17.200000,55.090000,6.366667,84.893333,17.200000,41.230000,18.1000,48.590000,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,23.200000,52.400000,24.796667,1.000000,24.500000,44.500000,24.7000,50.074000,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,23.230000,52.326667,24.196667,1.000000,24.557143,44.414286,24.7000,49.790000,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,23.230000,52.266667,23.626667,1.000000,24.540000,44.400000,24.7000,49.660000,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,23.200000,52.200000,22.433333,1.000000,24.500000,44.295714,24.6625,49.518750,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [114]:
y=data["Appliances"]
y

0         60
1         60
2         50
3         50
4         60
        ... 
19730    100
19731     90
19732    270
19733    420
19734    430
Name: Appliances, Length: 19735, dtype: int64

In [115]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=42)

In [116]:
model = LinearRegression()
model.fit(x_train, y_train)

In [117]:
predicted_values = linear_model.predict(x_test)

In [127]:
# prediction for training set
y_train_pred = model.predict(x_train)

In [131]:
# Mean absolute error for training set
mae = mean_absolute_error(y_train, y_train_pred)
round(mae,3)

53.742

In [136]:
# Root mean squared error for training set
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_train

95.21565985214606

In [137]:
# prediction for test set
y_test_pred = model.predict(x_test)

In [140]:
# Mean absolute error for test set
mae = mean_absolute_error(y_test, y_test_pred)
round(mae,3)

53.643

In [141]:
# Root meam squared error for test
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
rmse_test

93.6404609399803

In [142]:
# Ridge regression
ridge_reg = Ridge(alpha=0.5)
ridge_reg.fit(x_train, y_train)

In [143]:
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
rmse_test

93.6404609399803

In [144]:
# Lasso regression
lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [152]:
# Train a lasso regression model with default value and obtain the new feature weights with it.
# How many of the features have non-zero feature weights
def d_weight_data(model, feat, col_name):
    # this function returns the weight of every feature
    weight = pd.Series(model.coef_, feat.columns).sort_values()
    weight_data = pd.DataFrame(weight).reset_index()
    weight_data.columns=["Features", col_name]
    weight_data[col_name].round(3)
    return weight_data

In [153]:
model_weight=d_weight_data(model, x_train, 'model_weight')
lasso_weight_data = d_weight_data(lasso_reg, x_train, 'Lasso_weight')

feature_weight = pd.merge(model_weight, lasso_weight_data, on='Features')

In [151]:
feature_weight

Unnamed: 0,Features,model_weight,Lasso_weight
0,T9,-21.148452,-21.138361
1,T2,-18.37003,-18.363178
2,RH_2,-13.740748,-13.737231
3,T_out,-11.073628,-11.059566
4,RH_8,-5.778861,-5.778977
5,RH_9,-1.76268,-1.762338
6,RH_7,-1.692787,-1.691936
7,T5,-1.600843,-1.601612
8,RH_out,-1.093521,-1.091186
9,T1,-0.370721,-0.366515
