# Model creation and data split-up

This notebook will:
* Split data into test and train data
* Train the model
* Save the model

Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds

Import data

In [2]:
# Load the prepared dataset from the pickle file in the working directory.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
df = pds.read_pickle('ANN_dataset.pkl')
# Preview the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,timestamp,lon,lat,direction,speed,journey,seg,time_left,segment_time,tsjs
0,2018-02-16 04:48:40+01:00,58.414238,15.571015,147.300003,0.0,1,1,71.0,71.0,0.0
1,2018-02-16 04:48:41+01:00,58.414246,15.571012,147.300003,0.0,1,1,70.0,71.0,0.0
2,2018-02-16 04:48:42+01:00,58.414249,15.571008,147.300003,0.0,1,1,69.0,71.0,0.0
3,2018-02-16 04:48:43+01:00,58.414257,15.571004,147.300003,0.0,1,1,68.0,71.0,0.0
4,2018-02-16 04:48:44+01:00,58.414257,15.571006,147.300003,0.0,1,1,67.0,71.0,0.0


Note that `segment_time` is the label in this model 

In [3]:
# Name of the column in `df` that the model is trained to predict.
label_string = "segment_time"

One-hot encode segments, normalize columns and create the final dataset

In [4]:
# One-hot encode the segment id: one indicator column per distinct value of `seg`.
dataset = pds.get_dummies(df['seg'])

# Encode the hour of day as a point on the unit circle (sin/cos pair) so that
# the feature is cyclical over a 24-hour period.
hour_angle = df.timestamp.dt.hour*(2.*np.pi/24)
dataset['hr_sin'] = np.sin(hour_angle)
dataset['hr_cos'] = np.cos(hour_angle)

# Min-max scale `tsjs` (time from journey start) to the [0, 1] range,
# using the minimum and maximum over the whole of `df`.
tsjs_min = df['tsjs'].min()
tsjs_max = df['tsjs'].max()
dataset['tsjs'] = (df['tsjs']-tsjs_min)/(tsjs_max-tsjs_min)

# Carry the journey number along so that whole journeys can be selected later;
# it is removed again before the data is fed to the model.
dataset['journey'] = df['journey']
dataset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,hr_sin,hr_cos,tsjs,journey
0,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
1,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
2,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
3,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
4,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1


Select ~20% test journeys

In [5]:
# Hold out roughly `divider` of the journeys as test data. Whole journeys are
# selected (not individual rows) so a test journey never leaks into training.
divider = 0.2

# Work from the journey ids that actually occur in the data instead of
# assuming they are contiguous, start at 1 and end with the largest id.
journey_ids = dataset['journey'].unique()
num_journeys = len(journey_ids)
num_test_segments = int(round(num_journeys*divider))

# Fixed seed so the train/test split is the same after Restart & Run All.
rng = np.random.RandomState(42)

# Sample WITHOUT replacement: the previous randint() call could draw the same
# journey several times (yielding fewer than the intended number of test
# journeys) and could never draw the last journey because its upper bound is
# exclusive.
msk = rng.choice(journey_ids, size=num_test_segments, replace=False)

# Boolean row mask: True for every row that belongs to a test journey.
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

In [6]:
# Rows flagged by `mask` form the test set; all remaining rows form the training set.
train_input = dataset.loc[~mask]
test_input = dataset.loc[mask]

# Labels come from the original frame and are split with the same row mask.
train_labels = df.loc[~mask, label_string]
test_labels = df.loc[mask, label_string]

# Index labels of the test rows, used later to look up extra columns in `df`.
test_idxs = test_input.index

In [7]:
# The journey number was only needed to build the split; it is not a model
# feature, so remove it from the model inputs. `train_input` / `test_input`
# keep the column.
train_data = train_input.drop('journey', axis=1)
test_data = test_input.drop('journey', axis=1)

Build model

In [8]:
# Fully connected regression network sized relative to the number of input
# features: a tanh layer with 2x the feature count, a ReLU layer with 1x the
# feature count, and a single ReLU output unit (predictions are therefore >= 0).
n_features = train_data.shape[1]
model = keras.Sequential([
    keras.layers.Dense(2*n_features, activation=tf.nn.tanh, input_shape=(n_features,)),
    keras.layers.Dense(n_features, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.relu),
])

optimizer = keras.optimizers.Adadelta()
# 'mae' is both the loss and the only metric, so evaluate() reports the same
# value twice.
model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the cell output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 28)                420       
_________________________________________________________________
dense_1 (Dense)              (None, 14)                406       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 15        
Total params: 841
Trainable params: 841
Non-trainable params: 0
_________________________________________________________________
None


Fit the model; 10% of the training data is held out as validation data

In [9]:
# Number of passes over the training data.
EPOCHS = 5

# Train on the raw numpy arrays; a tenth of the training samples is set aside
# for validation. The returned History object is displayed as the cell output.
model.fit(
    x=train_data.values,
    y=train_labels.values,
    epochs=EPOCHS,
    initial_epoch=0,
    validation_split=0.1,
)

Train on 2433960 samples, validate on 270440 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x191dcac1240>

Evaluate model on all test data

In [10]:
# Score the trained model on the held-out test journeys; returns [loss, mae].
model.evaluate(x=test_data.values, y=test_labels.values)



[17.246211438775454, 17.246211438775454]

In [11]:
# Predicted segment time for every test row (one output unit per row).
predictions = model.predict(x=test_data.values)

### Create dataset for evaluation

This dataset is evaluated in a special way. The time spent from the start of the segment is also needed to predict ETA. `tsss` = time since segment start

In [28]:
# Time since segment start = total segment time - time left in the segment.
# `test_idxs` holds index LABELS (it comes from test_input.index), so the
# lookup must be label-based (.loc), consistent with the other lookups in this
# notebook; positional .iloc is only equivalent for a default RangeIndex.
# The result is re-indexed from 0 so it lines up row-by-row with `predictions`.
tsss = (test_labels - df['time_left'].loc[test_idxs]).reset_index().drop(columns=['index'])

In [29]:
# Predicted remaining time in the segment: predicted segment time minus the
# time already spent in the segment.
pred = predictions - tsss

# Rename the single column through `.columns`. Assigning `pred.names` only
# attached an unrelated attribute to the object and left the column named 0.
pred.columns = ['pred']
pred.head()

Unnamed: 0,0
0,187.499466
1,186.499466
2,185.499466
3,184.499466
4,183.499466


In [30]:
# True remaining time in the segment: segment time minus time since segment
# start. The labels are re-indexed from 0 and kept as a one-column frame so
# they line up row-by-row with `tsss`.
labels_from_zero = test_labels.reset_index(drop=True).to_frame()
labels = labels_from_zero - tsss.values
labels.head()

Unnamed: 0,segment_time
0,187.0
1,186.0
2,185.0
3,184.0
4,183.0


In [39]:
# Min-max scale the speed over the whole dataset to the [0, 1] range.
speed_min = df['speed'].min()
speed_max = df['speed'].max()
speed = (df['speed']-speed_min)/(speed_max-speed_min)

# Keep only the test rows and re-index them from 0 as a one-column frame.
speed = speed.loc[test_idxs].reset_index(drop=True).to_frame()
speed.head()

Unnamed: 0,speed
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [41]:
# Segment number of every test row, re-indexed from 0 as a one-column frame.
segments = df.loc[test_idxs, 'seg'].reset_index(drop=True).to_frame()
segments.head()

Unnamed: 0,seg
0,1
1,1
2,1
3,1
4,1


Prepare data for the evaluation step

In [42]:
# Assemble the per-row table consumed by the evaluation step. Columns are
# added one at a time, in this order, to an initially empty frame; the first
# column ('seg') therefore determines the frame's index.
output = pds.DataFrame()
output_columns = (
    ('seg', segments['seg']),
    ('journey', test_input['journey'].values),
    ('speed', speed.values),
    ('pred', pred.values),
    ('label', labels),
)
for column_name, column_values in output_columns:
    output[column_name] = column_values
output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,4,0.0,187.499466,187.0
1,1,4,0.0,186.499466,186.0
2,1,4,0.0,185.499466,185.0
3,1,4,0.0,184.499466,184.0
4,1,4,0.0,183.499466,183.0


In [19]:
# Common base name shared by every file written for this model.
model_name = 'ANN_M1'
yaml_path = model_name + '.yaml'
weights_path = model_name + '.h5'

# Architecture: serialized to YAML text and written next to the notebook.
# NOTE(review): Model.to_yaml() is not available in newer TensorFlow
# releases - confirm the installed version before re-running.
model_yaml = model.to_yaml()
with open(yaml_path, "w") as yaml_handle:
    yaml_handle.write(model_yaml)

# Weights: stored separately in HDF5 format.
model.save_weights(weights_path)

In [43]:
# Persist the evaluation table under the same base name as the model files.
output.to_pickle('{}.pkl'.format(model_name))

Done! Get ready for evaluation