In [3]:
import svgwrite
import numpy as np
import time
import random
import pickle
import codecs
import collections
import os
import math
import json
import tensorflow as tf
from six.moves import xrange
import pandas as pd

In [4]:
data = np.load("cloud.npz", encoding="latin1", allow_pickle=True)
train = data["train"]

In [77]:
from typing import List, Tuple
import numpy as np
from IPython.display import clear_output
from sklearn.preprocessing import MinMaxScaler

def create_window(input_data: np.array, window_size: int) -> np.array:
    data_len = len(input_data)
    result = np.zeros((data_len-window_size+1, window_size, *input_data.shape[1:]))
    for i in range(data_len):
        if i+window_size <= data_len:
            result[i] = input_data[i:i+window_size]
    return result


def create_window_on_multiple_samples(input_data: np.array, window_size: int) -> np.array:
    """
    Similar to create_window, but now can take multiple samples, will output in one
    giant windowed np.array.
    """
    windowed_data = []
    for i, sample in enumerate(input_data):
#         print(f"sample = {sample[:,:2]}")
#         break
#         windowed_data.append(create_window(sample[:,:2], window_size))
        windowed_data.append(create_window(sample, window_size))

        if i % 10000 == 0:
            print(f"Now at {i}")
            clear_output(wait=True)
    result = np.concatenate(windowed_data)
    print(f"Done processing {i} samples, total of {result.shape[0]} windows and {result.shape[0] * result.shape[1]} datapoints")
    return np.concatenate(windowed_data)


def split_train_test(input_data: List) -> Tuple[np.array, np.array]:
#     return input_data[:,0:-1], input_data[:,-1]
    return input_data[:,:-1], input_data[:,-1]

def normalise_windows(window_data):
    # A support function to normalize a dataset
    scaler = MinMaxScaler()
    normalised_data = []
    for window in window_data:
        scaled = scaler.fit_transform(window)
        normalised_data.append(scaled)
    return np.concatenate(normalised_data)

In [78]:
x = create_window_on_multiple_samples(train, 5)

Done processing 69999 samples, total of 3882224 windows and 19411120 datapoints


In [79]:
X, Y = split_train_test(x)

In [85]:
print(f"X = {X[:2]} Y = {Y[:2]} \t X shape = {X.shape[0]}")

X = [[[  1. -42.   0.]
  [  5. -10.   0.]
  [ 28. -21.   0.]
  [ 29.  -8.   0.]]

 [[  5. -10.   0.]
  [ 28. -21.   0.]
  [ 29.  -8.   0.]
  [ 47.   1.   0.]]] Y = [[47.  1.  0.]
 [19. 10.  0.]] 	 X shape = 3882224


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from matplotlib import pyplot

# Initialize LSTM
model = Sequential()
# print (train_set[3].shape)
model.add(LSTM(units=50, return_sequences=True,
     input_shape=(X.shape[0], 3)))

# Adding a second LSTM layer and Dropout regularisation
model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

# Adding a third LSTM layer and Dropout regularisation
model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

# # Adding a fourth LSTM layer and Dropout regularisation
model.add(LSTM(units = 50))
model.add(Dropout(0.2))

# Adding the output layer
model.add(Dense(units = 1))

# Compiling the RNN
model.compile(optimizer = 'adam', loss = 'mse')
# history = model.fit(X, Y, epochs=10, validation_data=(test_set, valid_set), shuffle=False)
print(model.summary())
# model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
history = model.fit(X, Y, epochs = 200)

# plot train and validation loss
pyplot.plot(history.history['loss'])
pyplot.plot(history.history['val_loss'])
pyplot.title('model train vs validation loss')
pyplot.ylabel('loss')
pyplot.xlabel('epoch')
pyplot.legend(['train', 'validation'], loc='upper right')
pyplot.show()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_28 (LSTM)               (None, 3882224, 50)       10800     
_________________________________________________________________
lstm_29 (LSTM)               (None, 3882224, 50)       20200     
_________________________________________________________________
dropout_15 (Dropout)         (None, 3882224, 50)       0         
_________________________________________________________________
lstm_30 (LSTM)               (None, 3882224, 50)       20200     
_________________________________________________________________
dropout_16 (Dropout)         (None, 3882224, 50)       0         
_________________________________________________________________
lstm_31 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dropout_17 (Dropout)         (None, 50)              

In [None]:
print(f"X = {X.shape} \t Y = {Y.shape}")