In [159]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

In [123]:
# importing my dataset
df = pd.read_csv('energydata_complete.csv');

In [124]:
# INFORMATIONS ON THE COLUMNS OF THE DATASET


# •⁠  ⁠⁠ date ⁠: Date and time of the observation
# •⁠  ⁠⁠ Appliances ⁠: Energy consumption of appliances (dependent variable)
# •⁠  ⁠⁠ lights ⁠: Energy consumption of lights (dependent variable)
# •⁠  ⁠⁠ T1 ⁠: Temperature in room 1
# •⁠  ⁠⁠ RH_1 ⁠: Relative humidity in room 1
# •⁠  ⁠⁠ T2 ⁠: Temperature in room 2
# •⁠  ⁠⁠ RH_2 ⁠: Relative humidity in room 2
# •⁠  ⁠⁠ T3 ⁠: Temperature in room 3
# •⁠  ⁠⁠ RH_3 ⁠: Relative humidity in room 3
# •⁠  ⁠⁠ T4 ⁠: Temperature in room 4
# •⁠  ⁠⁠ RH_4 ⁠: Relative humidity in room 4
# •⁠  ⁠⁠ T5 ⁠: Temperature in room 5
# •⁠  ⁠⁠ RH_5 ⁠: Relative humidity in room 5
# •⁠  ⁠⁠ T6 ⁠: Temperature outside the building (north side)
# •⁠  ⁠⁠ RH_6 ⁠: Relative humidity outside the building (north side)
# •⁠  ⁠⁠ T7 ⁠: Temperature in ironing room
# •⁠  ⁠⁠ RH_7 ⁠: Relative humidity in ironing room
# •⁠  ⁠⁠ T8 ⁠: Temperature in teenager room 2
# •⁠  ⁠⁠ RH_8 ⁠: Relative humidity in teenager room 2
# •⁠  ⁠⁠ T9 ⁠: Temperature in parent's room
# •⁠  ⁠⁠ RH_9 ⁠: Relative humidity in parent's room
# •⁠  ⁠⁠ T_out ⁠: Temperature outside (from Chievres weather station)
# •⁠  ⁠⁠ Press_mm_hg ⁠: Pressure (from Chievres weather station)
# •⁠  ⁠⁠ RH_out ⁠: Relative humidity outside (from Chievres weather station)
# •⁠  ⁠⁠ Windspeed ⁠: Windspeed (from Chievres weather station)
# •⁠  ⁠⁠ Visibility ⁠: Visibility (from Chievres weather station)
# •⁠  ⁠⁠ Tdewpoint ⁠: Dewpoint temperature (from Chievres weather station)
# •⁠  ⁠⁠ rv1 ⁠: Random variable 1
# •⁠  ⁠⁠ rv2 ⁠: Random variable 2


In [125]:
# EXPLORING MY DATASET
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [126]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         19735 non-null  datetime64[ns]
 1   Appliances   19735 non-null  int64         
 2   lights       19735 non-null  int64         
 3   T1           19735 non-null  float64       
 4   RH_1         19735 non-null  float64       
 5   T2           19735 non-null  float64       
 6   RH_2         19735 non-null  float64       
 7   T3           19735 non-null  float64       
 8   RH_3         19735 non-null  float64       
 9   T4           19735 non-null  float64       
 10  RH_4         19735 non-null  float64       
 11  T5           19735 non-null  float64       
 12  RH_5         19735 non-null  float64       
 13  T6           19735 non-null  float64       
 14  RH_6         19735 non-null  float64       
 15  T7           19735 non-null  float64       
 16  RH_7

In [127]:
df.describe()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,2016-03-20 05:30:00,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,...,19.485828,41.552401,7.411665,755.522602,79.750418,4.039752,38.330834,3.760707,24.988033,24.988033
min,2016-01-11 17:00:00,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,2016-02-14 23:15:00,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,...,18.0,38.5,3.666667,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,2016-03-20 05:30:00,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,...,19.39,40.9,6.916667,756.1,83.666667,3.666667,40.0,3.433333,24.897653,24.897653
75%,2016-04-23 11:45:00,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,...,20.6,44.338095,10.408333,760.933333,91.666667,5.5,40.0,6.566667,37.583769,37.583769
max,2016-05-27 18:00:00,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653
std,,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,...,2.014712,4.151497,5.317409,7.399441,14.901088,2.451221,11.794719,4.194648,14.496634,14.496634


In [128]:
# CHECKING FOR MISSING DATA's

df.isnull().sum()

date           0
Appliances     0
lights         0
T1             0
RH_1           0
T2             0
RH_2           0
T3             0
RH_3           0
T4             0
RH_4           0
T5             0
RH_5           0
T6             0
RH_6           0
T7             0
RH_7           0
T8             0
RH_8           0
T9             0
RH_9           0
T_out          0
Press_mm_hg    0
RH_out         0
Windspeed      0
Visibility     0
Tdewpoint      0
rv1            0
rv2            0
dtype: int64

DATA PROCESSING

In [180]:
# SPLITING THE DATA INTO TARGET AND FEATURES
y = df['Appliances']
x = df.drop(['lights', 'date'], axis=1)

In [198]:
# TRAIN_TEST SPLITS

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)
y_train_scaled = scaler.fit_transform([y_train])
y_test_scaled = scaler.fit_transform([y_test])

MODEL TRAINING


In [199]:
# FIT MODEL

model = LinearRegression()

model.fit(x_train_scaled, y_train)

MODEL EVALUATION

In [200]:
# PREDICTION

y_pred = model.predict(x_train_scaled)

In [196]:
# EVALUATION METRICS

# mse = mean_squared_error(y_test, y_pred, squared=False)
# r2  = r2_score(y_test, y_pred)
mae = mean_absolute_error(x_test_scaled, y_pred)

# print(f"MSE => {round(mse, 2)}")
# print(f"R2 => {round(r2, 2)}")
print(f"MAE => {round(mae, 3)}")

ValueError: Found input variables with inconsistent numbers of samples: [5921, 13814]

EXPORT TRAINED MODEL

In [None]:
# This will be done later