In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import nan

In [2]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [4]:
data = pd.read_csv('household_power_consumption.txt', sep = ';',
                  parse_dates = True,
                  low_memory = False)

In [5]:
data.head()

Unnamed: 0,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,16/12/2006,17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
1,16/12/2006,17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2,16/12/2006,17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
3,16/12/2006,17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
4,16/12/2006,17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [6]:
data['date_time'] = data['Date'].str.cat(data['Time'], sep= ' ')
data.drop(['Date', 'Time'], inplace= True, axis = 1)
data.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,date_time
0,4.216,0.418,234.84,18.4,0.0,1.0,17.0,16/12/2006 17:24:00
1,5.36,0.436,233.63,23.0,0.0,1.0,16.0,16/12/2006 17:25:00
2,5.374,0.498,233.29,23.0,0.0,2.0,17.0,16/12/2006 17:26:00
3,5.388,0.502,233.74,23.0,0.0,1.0,17.0,16/12/2006 17:27:00
4,3.666,0.528,235.68,15.8,0.0,1.0,17.0,16/12/2006 17:28:00


In [7]:
data.set_index(['date_time'], inplace=True)
data.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
16/12/2006 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
16/12/2006 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
16/12/2006 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
16/12/2006 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
16/12/2006 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [8]:
data.replace('?', nan, inplace=True)

In [9]:
data = data.astype('float')

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2075259 entries, 16/12/2006 17:24:00 to 26/11/2010 21:02:00
Data columns (total 7 columns):
Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtypes: float64(7)
memory usage: 126.7+ MB


In [11]:
np.isnan(data).sum()

Global_active_power      25979
Global_reactive_power    25979
Voltage                  25979
Global_intensity         25979
Sub_metering_1           25979
Sub_metering_2           25979
Sub_metering_3           25979
dtype: int64

In [12]:
def fill_missing(data):
    one_day = 24*60
    for row in range(data.shape[0]):
        for col in range(data.shape[1]):
            if np.isnan(data[row, col]):
                data[row, col] = data[row-one_day, col]

In [13]:
fill_missing(data.values)

In [14]:
np.isnan(data).sum()

Global_active_power      0
Global_reactive_power    0
Voltage                  0
Global_intensity         0
Sub_metering_1           0
Sub_metering_2           0
Sub_metering_3           0
dtype: int64

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2075259 entries, 16/12/2006 17:24:00 to 26/11/2010 21:02:00
Data columns (total 7 columns):
Global_active_power      float64
Global_reactive_power    float64
Voltage                  float64
Global_intensity         float64
Sub_metering_1           float64
Sub_metering_2           float64
Sub_metering_3           float64
dtypes: float64(7)
memory usage: 126.7+ MB


In [16]:
data.describe()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
count,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0,2075259.0
mean,1.089418,0.1236871,240.8364,4.618401,1.118474,1.291131,6.448635
std,1.054678,0.1125933,3.240051,4.433165,6.14146,5.796922,8.433584
min,0.076,0.0,223.2,0.2,0.0,0.0,0.0
25%,0.308,0.048,238.99,1.4,0.0,0.0,0.0
50%,0.602,0.1,241.0,2.6,0.0,0.0,1.0
75%,1.526,0.194,242.87,6.4,0.0,1.0,17.0
max,11.122,1.39,254.15,48.4,88.0,80.0,31.0


In [17]:
data.shape

(2075259, 7)

In [18]:
data.to_csv('cleaned_data.csv')

In [19]:
dataset = pd.read_csv('cleaned_data.csv', parse_dates = True, index_col = 'date_time', low_memory = False)


In [20]:
dataset.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.84,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.63,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.29,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.74,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.68,15.8,0.0,1.0,17.0


In [21]:
data = dataset.resample('D').sum()

In [22]:
data.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16,1209.176,34.922,93552.53,5180.8,0.0,546.0,4926.0
2006-12-17,3390.46,226.006,345725.32,14398.6,2033.0,4187.0,13341.0
2006-12-18,2203.826,161.792,347373.64,9247.2,1063.0,2621.0,14018.0
2006-12-19,1666.194,150.942,348479.01,7094.0,839.0,7602.0,6197.0
2006-12-20,2225.748,160.998,348923.61,9313.0,0.0,2648.0,14063.0


In [23]:
data_train = data.loc[:'2009-12-31', :]['Global_active_power']
data_train.head()

date_time
2006-12-16    1209.176
2006-12-17    3390.460
2006-12-18    2203.826
2006-12-19    1666.194
2006-12-20    2225.748
Freq: D, Name: Global_active_power, dtype: float64

In [24]:
data_test = data['2010']['Global_active_power']
data_test.head()

date_time
2010-01-01    1224.252
2010-01-02    1693.778
2010-01-03    1298.728
2010-01-04    1687.440
2010-01-05    1320.158
Freq: D, Name: Global_active_power, dtype: float64

In [25]:
data_train.shape

(1112,)

In [26]:
data_test.shape

(345,)

In [27]:
data_train.head(14)

date_time
2006-12-16    1209.176
2006-12-17    3390.460
2006-12-18    2203.826
2006-12-19    1666.194
2006-12-20    2225.748
2006-12-21    1723.288
2006-12-22    2341.338
2006-12-23    4773.386
2006-12-24    2550.012
2006-12-25    2743.120
2006-12-26    3934.110
2006-12-27    1528.760
2006-12-28    2072.638
2006-12-29    3174.392
Freq: D, Name: Global_active_power, dtype: float64

In [28]:
data_train = np.array(data_train)

In [29]:

X_train, y_train = [], []

for i in range(7, len(data_train)-7):
    X_train.append(data_train[i-7:i])
    y_train.append(data_train[i:i+7])

In [31]:
X_train, y_train = np.array(X_train), np.array(y_train)

In [32]:
X_train.shape, y_train.shape

((1098, 7), (1098, 7))

In [35]:
pd.DataFrame(y_train).head()

Unnamed: 0,0,1,2,3,4,5,6
0,4773.386,2550.012,2743.12,3934.11,1528.76,2072.638,3174.392
1,2550.012,2743.12,3934.11,1528.76,2072.638,3174.392,2796.108
2,2743.12,3934.11,1528.76,2072.638,3174.392,2796.108,3494.196
3,3934.11,1528.76,2072.638,3174.392,2796.108,3494.196,2749.004
4,1528.76,2072.638,3174.392,2796.108,3494.196,2749.004,1824.76


In [36]:
x_scaler = MinMaxScaler()
X_train = x_scaler.fit_transform(X_train)

In [37]:
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train)

In [38]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.211996,0.694252,0.431901,0.313037,0.436748,0.32566,0.462304
1,0.694252,0.431901,0.313037,0.436748,0.32566,0.462304,1.0
2,0.431901,0.313037,0.436748,0.32566,0.462304,1.0,0.508439
3,0.313037,0.436748,0.32566,0.462304,1.0,0.508439,0.551133
4,0.436748,0.32566,0.462304,1.0,0.508439,0.551133,0.814446


In [58]:
pd.DataFrame(X_test.reshape(331,7)).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.215329,0.319136,0.231795,0.317735,0.236533,0.225215,0.24291
1,0.319136,0.231795,0.317735,0.236533,0.225215,0.24291,0.067513
2,0.231795,0.317735,0.236533,0.225215,0.24291,0.067513,0.296925
3,0.317735,0.236533,0.225215,0.24291,0.067513,0.296925,0.27736
4,0.236533,0.225215,0.24291,0.067513,0.296925,0.27736,0.250466


In [39]:
X_train = X_train.reshape(1098, 7, 1)

In [60]:
pd.DataFrame(y_train)

Unnamed: 0,0,1,2,3,4,5,6
0,1.000000,0.608483,0.659577,0.974702,0.338269,0.482174,0.773688
1,0.508439,0.659577,0.974702,0.338269,0.482174,0.773688,0.673597
2,0.551133,0.974702,0.338269,0.482174,0.773688,0.673597,0.858305
3,0.814446,0.338269,0.482174,0.773688,0.673597,0.858305,0.661134
4,0.282652,0.482174,0.773688,0.673597,0.858305,0.661134,0.416588
...,...,...,...,...,...,...,...
1093,0.197623,0.472206,0.433326,0.318806,0.702646,0.496729,0.511538
1094,0.394568,0.433326,0.318806,0.702646,0.496729,0.511538,0.514669
1095,0.362081,0.318806,0.702646,0.496729,0.511538,0.514669,0.487852
1096,0.266389,0.702646,0.496729,0.511538,0.514669,0.487852,0.475647


## MODEL

In [40]:
reg = Sequential()
reg.add(LSTM(units = 200, activation = 'relu', input_shape=(7,1)))
reg.add(Dense(7))

In [41]:
reg.compile(loss='mse', optimizer='adam')

In [42]:
reg.fit(X_train, y_train, epochs = 100)

Train on 1098 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100

Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x28d011fdd88>

## TEST

In [43]:
data_test = np.array(data_test)

In [44]:
X_test, y_test = [], []

for i in range(7, len(data_test)-7):
    X_test.append(data_test[i-7:i])
    y_test.append(data_test[i:i+7])

In [45]:

X_test, y_test = np.array(X_test), np.array(y_test)

In [46]:
X_test = x_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)

In [47]:
X_test = X_test.reshape(331,7,1)

In [48]:
y_pred = reg.predict(X_test)

In [49]:
y_pred = y_scaler.inverse_transform(y_pred)

In [50]:
y_pred

array([[1404.5343 , 1408.0128 , 1468.2506 , ..., 1541.1553 , 1510.3818 ,
        1481.7954 ],
       [1057.0338 , 1191.8494 , 1325.114  , ..., 1459.3998 , 1440.8486 ,
        1420.4543 ],
       [1430.7891 , 1393.8259 , 1453.4998 , ..., 1473.2003 , 1436.5037 ,
        1415.1616 ],
       ...,
       [1274.3502 , 1044.4283 , 1055.2831 , ..., 1000.92175, 1003.24054,
         956.94305],
       [1654.1782 , 1396.8729 , 1242.709  , ..., 1178.6206 , 1142.1666 ,
        1131.79   ],
       [1624.2018 , 1388.6733 , 1222.6599 , ..., 1228.025  , 1220.4954 ,
        1234.009  ]], dtype=float32)

In [51]:
y_true = y_scaler.inverse_transform(y_test)

In [52]:
y_true

array([[ 555.664, 1593.318, 1504.82 , ...,    0.   , 1995.796, 2116.224],
       [1593.318, 1504.82 , 1383.18 , ..., 1995.796, 2116.224, 2196.76 ],
       [1504.82 , 1383.18 ,    0.   , ..., 2116.224, 2196.76 , 2150.112],
       ...,
       [1892.998, 1645.424, 1439.426, ..., 1973.382, 1109.574,  529.698],
       [1645.424, 1439.426, 2035.418, ..., 1109.574,  529.698, 1612.092],
       [1439.426, 2035.418, 1973.382, ...,  529.698, 1612.092, 1579.692]])

## Evaluate

In [53]:
def evaluate_model(y_true, y_predicted):
    scores = []
    
    #calculate scores for each day
    for i in range(y_true.shape[1]):
        mse = mean_squared_error(y_true[:, i], y_predicted[:, i])
        rmse = np.sqrt(mse)
        scores.append(rmse)
    
    #calculate score for whole prediction
    total_score = 0
    for row in range(y_true.shape[0]):
        for col in range(y_predicted.shape[1]):
            total_score = total_score + (y_true[row, col] - y_predicted[row, col])**2
    total_score = np.sqrt(total_score/(y_true.shape[0]*y_predicted.shape[1]))
    
    return total_score, scores

In [54]:

evaluate_model(y_true, y_pred)

(574.9018350292386,
 [576.8782722778151,
  586.3945946580905,
  572.3024810223732,
  565.9585568555709,
  579.9201354085528,
  573.1224599518497,
  569.4925305511084])

In [55]:
np.std(y_true[0])

710.0253857243853