# Model creation and data split-up

This notebook will:
* Split data into training, validation and test sets
* Train the model
* Save the model

Import packages

In [2]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import pandas as pds
import datetime as dt
from collections import deque
import random

Import data

In [3]:
# Load the dataset from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
data = pds.read_pickle('ANN_dataset.pkl')

In [4]:
# Preview the first rows to see which columns are available.
data.head()

Unnamed: 0.1,Unnamed: 0,timestamp,event,vehicle_id,line,lon,lat,direction,speed,station,journey,seg,time_left,segment_time,tsjs
0,1,2018-02-16T04:48:40.0000000+01:00,ObservedPositionEvent,5432,0,58.414238,15.571015,147.300003,0.0,,1,1,71.0,71.0,0.0
1,3,2018-02-16T04:48:41.0000000+01:00,ObservedPositionEvent,5432,0,58.414246,15.571012,147.300003,0.0,,1,1,70.0,71.0,0.0
2,4,2018-02-16T04:48:42.0000000+01:00,ObservedPositionEvent,5432,0,58.414249,15.571008,147.300003,0.0,,1,1,69.0,71.0,0.0
3,5,2018-02-16T04:48:43.0000000+01:00,ObservedPositionEvent,5432,0,58.414257,15.571004,147.300003,0.0,,1,1,68.0,71.0,0.0
4,6,2018-02-16T04:48:44.0000000+01:00,ObservedPositionEvent,5432,0,58.414257,15.571006,147.300003,0.0,,1,1,67.0,71.0,0.0


One hot encode segments, normalize columns. time_left is the label in this model.

In [5]:
df = data


def _min_max(column):
    """Rescale a numeric Series linearly onto [0, 1].

    A constant column yields NaN because the range (max - min) is zero.
    """
    low, high = column.min(), column.max()
    return (column - low) / (high - low)


# Make segments categorical: one indicator column per distinct segment id
dataset = pds.get_dummies(df['seg'])

# Cyclical time of day.
# Parse the timestamp column once and reuse it; the parsing is by far the
# most expensive step of this cell.
timestamps = pds.to_datetime(df.timestamp)
seconds_of_day = (timestamps.dt.hour*3600
                  + timestamps.dt.minute*60
                  + timestamps.dt.second)
# Map the 86400 seconds of a day onto one full turn so that 23:59:59 and
# 00:00:00 end up next to each other.
day_angle = seconds_of_day*(2.*np.pi/86400)
dataset['hr_sin'] = np.sin(day_angle)
dataset['hr_cos'] = np.cos(day_angle)

# Convert degrees to radians before applying the trigonometric functions
direction_rad = df.direction*(np.pi/180)
dataset['dir_sin'] = np.sin(direction_rad)
dataset['dir_cos'] = np.cos(direction_rad)

# Min-max normalised features.
# 'tsjs' was previously called 'time_since_journey_start'.
for column_name in ('speed', 'tsjs', 'lat', 'lon'):
    dataset[column_name] = _min_max(df[column_name])

# Include journey number to select entire journeys
dataset['journey'] = df['journey']

# The label must stay the LAST column: sequencify() reads it from there.
dataset['time_left'] = df['time_left']

dataset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,hr_sin,hr_cos,dir_sin,dir_cos,speed,tsjs,lat,lon,journey,time_left
0,1,0,0,0,0,0,0,0,0,0,...,0.840251,0.542197,0.54024,-0.841511,0.0,0.0,0.326317,0.924521,1,71.0
1,1,0,0,0,0,0,0,0,0,0,...,0.840291,0.542136,0.54024,-0.841511,0.0,0.0,0.326237,0.925287,1,70.0
2,1,0,0,0,0,0,0,0,0,0,...,0.84033,0.542075,0.54024,-0.841511,0.0,0.0,0.326156,0.92567,1,69.0
3,1,0,0,0,0,0,0,0,0,0,...,0.84037,0.542014,0.54024,-0.841511,0.0,0.0,0.326076,0.926437,1,68.0
4,1,0,0,0,0,0,0,0,0,0,...,0.840409,0.541953,0.54024,-0.841511,0.0,0.0,0.326116,0.926437,1,67.0


Divide training/validation/test data set

In [6]:
# Hold out whole journeys (not individual rows) so that no journey contributes
# rows to more than one of the three sets.
# Roughly 20% test and 20% validation. Despite the variable names, these
# counts are numbers of journeys.
num_test_segments = 700
num_validation_segments = 700
# Fixed seed so the split - and everything saved from it - can be reproduced.
SPLIT_SEED = 42

unique_journeys = dataset.journey.unique()
num_held_out = num_test_segments + num_validation_segments
if len(unique_journeys) <= num_held_out:
    raise ValueError(
        'Need more than {} journeys to leave any training data, found {}'.format(
            num_held_out, len(unique_journeys)))

# A private generator keeps the split deterministic without touching the
# global `random` state used elsewhere in the notebook.
random.Random(SPLIT_SEED).shuffle(unique_journeys)

msk1 = unique_journeys[:num_test_segments]               # test journeys
msk2 = unique_journeys[num_test_segments:num_held_out]   # validation journeys
msk3 = unique_journeys[num_held_out:]                    # training journeys
mask1 = dataset['journey'].isin(msk1)
mask2 = dataset['journey'].isin(msk2)
mask3 = dataset['journey'].isin(msk3)

In [7]:
# Select the rows of each split with its boolean row mask
test_input = dataset.loc[mask1]
validation_input = dataset.loc[mask2]
train_input = dataset.loc[mask3]


In [19]:
# Remember the row labels of the test rows so that columns of the raw frame
# (e.g. the un-encoded segment id) can be looked up for them later.
test_idxs = test_input.index

In [8]:
# The journey id was only needed to build the split; it is not a model input
test_data = test_input.drop('journey', axis=1)
validation_data = validation_input.drop('journey', axis=1)
train_data = train_input.drop('journey', axis=1)

train_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,hr_sin,hr_cos,dir_sin,dir_cos,speed,tsjs,lat,lon,time_left
641,1,0,0,0,0,0,0,0,0,0,0,0.898794,0.438371,0.543174,-0.83962,0.0,0.0,0.326076,0.923372,191.0
642,1,0,0,0,0,0,0,0,0,0,0,0.898826,0.438306,0.543174,-0.83962,0.0,0.0,0.326076,0.923372,190.0
643,1,0,0,0,0,0,0,0,0,0,0,0.898858,0.43824,0.543174,-0.83962,0.0,0.0,0.326096,0.923372,189.0
644,1,0,0,0,0,0,0,0,0,0,0,0.89889,0.438175,0.543174,-0.83962,0.0,0.0,0.326096,0.923755,188.0
645,1,0,0,0,0,0,0,0,0,0,0,0.898922,0.43811,0.543174,-0.83962,0.0,0.0,0.326096,0.923755,187.0


Modify the dataset to contain sequences, as required by the RNN. Each sequence consists of SEQUENCE_LENGTH consecutive data points. Currently a sequence can contain data points from two different journeys, which is not optimal; the code should check that a sequence starts and ends in the same journey. This also uses a lot of RAM, since the data is effectively multiplied by SEQUENCE_LENGTH.

In [9]:
SEQUENCE_LENGTH = 20
def sequencify(df, shuffle, groups=None, seq_len=None):
    """Turn a frame of consecutive samples into sliding windows for an RNN.

    Every column except the last is a feature; the last column is the label.
    A window is emitted for each row that has at least ``seq_len - 1`` rows
    before it, and carries the label of its final row. A frame with ``n``
    rows therefore yields ``n - seq_len + 1`` windows when ``groups`` is not
    given.

    Args:
        df: DataFrame whose last column holds the label.
        shuffle: If true, the (window, label) pairs are shuffled with the
            global ``random`` module before being returned.
        groups: Optional sequence with one entry per row of ``df`` (for
            example the journey id). When given, a window never spans a
            change of value: the buffer is emptied whenever the value differs
            from that of the previous row. When omitted, windows may span
            such boundaries (the original behaviour).
        seq_len: Window length; defaults to ``SEQUENCE_LENGTH``.

    Returns:
        A tuple ``(X, Y)``. ``X`` is an array of shape
        ``(num_windows, seq_len, num_features)`` and ``Y`` is a list holding
        the label of each window.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1, or if ``groups`` does
            not have one entry per row.

    Note:
        Memory use grows by roughly a factor of ``seq_len`` because every
        window stores its own copy of the rows it covers.
    """
    if seq_len is None:
        seq_len = SEQUENCE_LENGTH
    if seq_len < 1:
        raise ValueError('seq_len must be at least 1')

    rows = df.values
    if groups is not None:
        groups = np.asarray(groups)
        if len(groups) != len(rows):
            raise ValueError('groups must contain one entry per row of df')

    sequencial_data = []
    window = deque(maxlen=seq_len)  # drops the oldest row automatically
    for position, row in enumerate(rows):
        # Start from scratch when the group changes so that no window mixes
        # rows of two different groups.
        if groups is not None and position > 0 \
                and groups[position] != groups[position - 1]:
            window.clear()
        window.append(list(row[:-1]))
        if len(window) == seq_len:
            sequencial_data.append((np.array(window), row[-1]))

    if shuffle:
        random.shuffle(sequencial_data)

    X = [features for features, _ in sequencial_data]
    Y = [label for _, label in sequencial_data]
    return np.array(X), Y

Creating the sequences of training data, test data and validation data. Test data is not randomized in order to be able to plot it in a nice way.

In [32]:
# Training and validation windows are shuffled; the test windows keep their
# original order. The call order is kept as is because the shuffles draw from
# the shared `random` state.
seq_train_x, seq_train_y = sequencify(train_data, shuffle=True)
seq_test_x, seq_test_y = sequencify(test_data, shuffle=False)
seq_validation_x, seq_validation_y = sequencify(validation_data, shuffle=True)

Model creation

In [14]:
BATCH_SIZE = 64
EPOCHS = 1

# Shape of one sample: everything after the leading sample axis
window_shape = seq_train_x.shape[1:]

# Three stacked recurrent layers followed by a small dense regression head.
# The first two LSTMs return the full sequence so the next LSTM receives one
# vector per time step; the last one returns only its final output.
# NOTE(review): input_shape is presumably only relevant on the first layer of
# a Sequential model - confirm before removing it from the other two.
model = keras.Sequential([
    keras.layers.CuDNNLSTM(128, input_shape=window_shape, return_sequences=True),
    keras.layers.CuDNNLSTM(128, input_shape=window_shape, return_sequences=True),
    keras.layers.CuDNNLSTM(128, input_shape=window_shape),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1),
])

opt = keras.optimizers.Adadelta()

# Loss and reported metric are both the mean absolute error
model.compile(loss='mae', optimizer=opt, metrics=['mae'])

Fit model

In [15]:
# Train on the training windows and monitor the validation windows after
# each epoch; the returned object is kept in `result`.
result = model.fit(
    x=seq_train_x,
    y=seq_train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(seq_validation_x, seq_validation_y),
)

Train on 2024377 samples, validate on 644473 samples
Epoch 1/1


Evaluate model on all test data

In [16]:
 # Returns [loss, metric]; the model was compiled with loss='mae' and
 # metrics=['mae'], so both entries are the same quantity.
 model.evaluate(seq_test_x, seq_test_y)



[10.033909805049095, 10.033909805049095]

Prepare data for the evaluation step

In [33]:
# One prediction per test window, in the same (unshuffled) order as seq_test_x.
predictions = model.predict(seq_test_x)

### Create dataset for evaluation

In [34]:
# Raw (not one-hot encoded) segment id of every test row, as a one-column
# frame renumbered from zero
segment = df.loc[test_idxs, ['seg']].reset_index(drop=True)

In [37]:
# The first window ends at row SEQUENCE_LENGTH - 1 of the test data, so the
# k-th prediction belongs to test row k + SEQUENCE_LENGTH - 1. Deriving the
# offset from SEQUENCE_LENGTH keeps the columns aligned if the window length
# is ever changed.
first_predicted_row = SEQUENCE_LENGTH - 1

# All columns are passed as plain arrays so they are matched by position;
# the constructor raises if their lengths disagree. The resulting frame is
# indexed from zero.
output = pds.DataFrame({
    'seg': segment['seg'].values[first_predicted_row:],
    'journey': test_input['journey'].values[first_predicted_row:],
    'speed': test_data['speed'].values[first_predicted_row:],
    # model.predict returns one row per sample; flatten to a 1-D column
    'pred': np.ravel(predictions),
    'label': test_data['time_left'].values[first_predicted_row:],
})
output.head()

Unnamed: 0,seg,journey,speed,pred,label
19,1,1,0.080934,58.844574,52.0
20,1,1,0.091798,58.919067,51.0
21,1,1,0.125475,55.777885,50.0
22,1,1,0.139598,54.779861,49.0
23,1,1,0.147746,53.91098,48.0


In [39]:
# Base name shared by every file written for this model
model_name = 'ANN_M4'

# Write the model description as YAML text
model_yaml = model.to_yaml()
yaml_path = '{}.yaml'.format(model_name)
with open(yaml_path, "w") as yaml_file:
    yaml_file.write(model_yaml)

# Write the trained weights as HDF5
weights_path = '{}.h5'.format(model_name)
model.save_weights(weights_path)

In [41]:
# Store the evaluation frame (segment, journey, speed, prediction, label)
# next to the model files, under the same base name.
output.to_pickle(model_name + '.pkl')

Done! Get ready for evaluation