In [55]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [56]:
df = pd.read_csv('energydata_complete.csv')
print(df)

                      date  Appliances  lights         T1       RH_1  \
0      2016-01-11 17:00:00          60      30  19.890000  47.596667   
1      2016-01-11 17:10:00          60      30  19.890000  46.693333   
2      2016-01-11 17:20:00          50      30  19.890000  46.300000   
3      2016-01-11 17:30:00          50      40  19.890000  46.066667   
4      2016-01-11 17:40:00          60      40  19.890000  46.333333   
...                    ...         ...     ...        ...        ...   
19730  2016-05-27 17:20:00         100       0  25.566667  46.560000   
19731  2016-05-27 17:30:00          90       0  25.500000  46.500000   
19732  2016-05-27 17:40:00         270      10  25.500000  46.596667   
19733  2016-05-27 17:50:00         420      10  25.500000  46.990000   
19734  2016-05-27 18:00:00         430      10  25.500000  46.600000   

              T2       RH_2         T3       RH_3         T4  ...         T9  \
0      19.200000  44.790000  19.790000  44.730000  19.0

In [61]:
df_new = df.drop(['date', 'lights'], axis=1)
print(df_new)

       Appliances         T1       RH_1         T2       RH_2         T3  \
0              60  19.890000  47.596667  19.200000  44.790000  19.790000   
1              60  19.890000  46.693333  19.200000  44.722500  19.790000   
2              50  19.890000  46.300000  19.200000  44.626667  19.790000   
3              50  19.890000  46.066667  19.200000  44.590000  19.790000   
4              60  19.890000  46.333333  19.200000  44.530000  19.790000   
...           ...        ...        ...        ...        ...        ...   
19730         100  25.566667  46.560000  25.890000  42.025714  27.200000   
19731          90  25.500000  46.500000  25.754000  42.080000  27.133333   
19732         270  25.500000  46.596667  25.628571  42.768571  27.050000   
19733         420  25.500000  46.990000  25.414000  43.036000  26.890000   
19734         430  25.500000  46.600000  25.264286  42.971429  26.823333   

            RH_3         T4       RH_4         T5  ...         T9     RH_9  \
0      44

In [109]:
linear_reg_df = df[['T2', 'T6']].sample(15, random_state=2)
print(linear_reg_df)

              T2         T6
1117   17.426667  -4.238889
16275  22.760000  14.690000
13272  21.230000   8.926667
3160   21.100000   7.690000
19210  21.856667  10.800000
8260   17.356667   5.000000
12299  19.000000   5.300000
13505  22.700000  17.133333
12913  19.593333   7.545000
17788  21.890000   9.190000
17894  21.500000   5.160000
7665   19.142857   0.937500
10165  18.000000   0.200000
18809  22.400000  16.833333
8737   17.066667   1.900000


In [70]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalised_df = pd.DataFrame(scaler.fit_transform(df_new), columns=df_new.columns)
features_df = normalised_df.drop(columns=['Appliances'])

In [87]:
target_variable = normalised_df['Appliances']
print(Appliances)

0        0.046729
1        0.046729
2        0.037383
3        0.037383
4        0.046729
           ...   
19730    0.084112
19731    0.074766
19732    0.242991
19733    0.383178
19734    0.392523
Name: Appliances, Length: 19735, dtype: float64


In [88]:
x_train, x_test, y_train, y_test = train_test_split(features_df, target_variable, test_size=0.3, random_state=42)

In [89]:
from sklearn import linear_model
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
predicted_values = linear_model.predict(x_test)

# Mean Absolute Error

In [90]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_test, predicted_values)
round(mae, 2)

0.05

# Residual Sum of Square

In [92]:
rss = np.sum(np.square(y_test - predicted_values))
round(rss, 2)

45.35

# Root Mean Squared Error

In [94]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)

0.088

# Coefficient of Determination

In [96]:
from sklearn.metrics import r2_score

r2_score = r2_score(y_test, predicted_values)
round(r2_score, 2)

0.15

In [95]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=0.4)
ridge_reg.fit(x_train, y_train)

Ridge(alpha=0.4)

In [98]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)

0.088

In [101]:
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=0.001)
lasso_reg.fit(x_train, y_train)

Lasso(alpha=0.001)

In [103]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, predicted_values))
round(rmse, 3)

0.088

In [100]:
def get_weights_df(model, feat, col_name):
    weights = pd.Series(model.coef_,feat.columns).sort_values()
    weights_df = pd.DataFrame(weights).reset_index()
    weights_df.columns = ['Features', col_name]
    weights_df[col_name].round(3)
    return weights_df

In [102]:
linear_model_weight = LinearRegression()

linear_model_weights = get_weights_df(linear_model, x_train, 'Linear_Model_Weight')
ridge_weights_df = get_weights_df(ridge_reg, x_train, 'Ridge_Weight')
lasso_weights_df = get_weights_df(lasso_reg, x_train, 'Lasso_weight')
final_weights = pd.merge(linear_model_weights, ridge_weights_df, on='Features')
final_weights = pd.merge(final_weights, lasso_weights_df, on='Features')
print(final_weights)

       Features  Linear_Model_Weight  Ridge_Weight  Lasso_weight
0          RH_2            -0.456698     -0.411071     -0.000000
1         T_out            -0.321860     -0.262172      0.000000
2            T2            -0.236178     -0.201397      0.000000
3            T9            -0.189941     -0.188916     -0.000000
4          RH_8            -0.157595     -0.156830     -0.000110
5        RH_out            -0.077671     -0.054724     -0.049557
6          RH_7            -0.044614     -0.045977     -0.000000
7          RH_9            -0.039800     -0.041367     -0.000000
8            T5            -0.015657     -0.019853     -0.000000
9            T1            -0.003281     -0.018406      0.000000
10          rv1             0.000770      0.000748     -0.000000
11          rv2             0.000770      0.000748     -0.000000
12  Press_mm_hg             0.006839      0.006584     -0.000000
13           T7             0.010319      0.010098     -0.000000
14   Visibility          