In [1]:
%env KERAS_BACKEND=tensorflow
%matplotlib inline

env: KERAS_BACKEND=tensorflow


In [186]:
import pandas as pd
import numpy as np
import glob
import os

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.losses import mean_squared_error

## Load data

In [3]:
# load all csv files of North-point site. Concatenate them..
data_src = "../data"
data_dst = "../output"
all_files = glob.glob(os.path.join(data_src, "np", "*.csv"))
df = pd.concat([pd.read_csv(f, low_memory=False) for f in all_files], ignore_index=True)

# minor changes
df = df.rename(columns={"Time Stamp": "timestamp"})
df = df.replace("\\N", np.nan)

# update data types. object is taken as default
dtypes = dict([(col, np.float64) for col in df.columns])
dtypes["timestamp"] = "datetime64[ns]"
df = df.astype(dtypes)

# change the index to timestamp.
df.index = df.timestamp

**TODO:** Split the time series into chunks. Each chunk breaks when the field value is null. Train all the chunks separately.

In [4]:
# Sample time series. Let's select a period and model for that.. 
ts = df["2017-01-01": "2017-01-7"]

In [5]:
# Preprocessing: Interpolate null values in the `fields`
# Note: This does not eliminate all null values if their continuous sequence > window size.
fields = ["loadsys", "wetbulb", "ct1kw", "ct2kw", "ct3kw", "cwshdr"]
rolling_avg = ts[fields].rolling(10, min_periods=1).mean()

# the time series after some sanitization
ts = ts[fields].fillna(rolling_avg).fillna(method="ffill").dropna()

In [6]:
# split the dataframe for training and validation
# dataframe  := Pandas dataframe
# ratio := Float, training:validation
def train_validation_split(dataframe, ratio):
    size = len(dataframe)
    train_df = dataframe.iloc[0:int(size*ratio)]
    validation_df = dataframe.iloc[int(size*ratio):]
    return train_df, validation_df

train_df, validation_df = train_validation_split(ts, 0.7)

In [226]:
# prepare feature vectors. the hypothesis is that
# y(t) can be determined using x1(k), x2(k), x3(k).... for all k = {t-1, t-2, t-3, ... t-N}, where 0 <= N <= t-1
def prepare_features(dataframe, target_field, N=1):
    x, y = [], []
    for i in range(len(dataframe)-N-1):
        x.append(dataframe.values[i:i+N])
        y.append(dataframe[target_field].values[i+N])
    x = np.array(x)
    y = np.array(y)
    return x, y

train_x, train_y = prepare_features(train_df, target_field="cwshdr", N=5)
validation_x, validation_y = prepare_features(validation_df, target_field="cwshdr", N=5)

# reshape data. combine all data points corresponding to a y(t)
reshape = lambda a: a.reshape((a.shape[0], a.shape[1] * a.shape[2]))
train_x = reshape(train_x)
validation_x = reshape(validation_x)
train_x.shape

(7049, 30)

## Prepare model

In [228]:
model = Sequential([
    LSTM(10, input_shape=(None, train_x.shape[1]), return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    LSTM(20, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    Dense(1)
])

model.compile(loss=mean_squared_error, optimizer="adam", metrics=["accuracy"])

## Train model

In [None]:
# This is how keras wants!!
reshape_x = lambda a: a.reshape((a.shape[0], 1, a.shape[1]))
reshape_y = lambda a: a.reshape((a.shape[0], 1, 1))

history = model.fit(
    x=reshape_x(train_x),
    y=reshape_y(train_y),
    validation_data=(reshape_x(validation_x), reshape_y(validation_y)),
    epochs=10000,
    shuffle=False)

Train on 7049 samples, validate on 3018 samples
Epoch 1/10000
Epoch 2/10000
Epoch 3/10000