# Model creation and data split-up

This notebook will:
* Train the model
* Save the model and predictions

Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds

Import data

In [2]:
# Load the pre-split train and test frames and stack them under the outer
# index keys 'train' / 'test' so each partition can be selected by key later.
df_tr, df_te = (
    pds.read_pickle(pickle_path)
    for pickle_path in ('ANN_data_train_203.pkl', 'ANN_data_test_203.pkl')
)
df = pds.concat(
    [df_tr, df_te],
    keys=['train', 'test'],
)
df.head()

Unnamed: 0,Unnamed: 1,timestamp,lon,lat,direction,speed,journey,seg,time_left,segment_time,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
train,0,2018-02-16 04:48:40+01:00,0.924492,0.326468,147.300003,0.0,1,1,71.0,71.0,0.0,0.326468,0.924492,0.326468,0.924492,0.326468,0.924492,0.326468,0.326468,0.326468
train,1,2018-02-16 04:48:41+01:00,0.925259,0.326388,147.300003,0.0,1,1,70.0,71.0,0.0,0.326468,0.924492,0.326468,0.924492,0.326468,0.924492,0.326468,0.326468,0.326468
train,2,2018-02-16 04:48:42+01:00,0.925642,0.326307,147.300003,0.0,1,1,69.0,71.0,0.0,0.326388,0.925259,0.326468,0.924492,0.326468,0.924492,0.326388,0.326468,0.326468
train,3,2018-02-16 04:48:43+01:00,0.926409,0.326227,147.300003,0.0,1,1,68.0,71.0,0.0,0.326307,0.925642,0.326388,0.925259,0.326468,0.924492,0.326307,0.326388,0.326468
train,4,2018-02-16 04:48:44+01:00,0.926409,0.326267,147.300003,0.0,1,1,67.0,71.0,0.0,0.326227,0.926409,0.326307,0.925642,0.326388,0.925259,0.326227,0.326307,0.326388


In [3]:
# Alternative input (disabled): load one unsplit dataset pickle instead of the
# separate train/test pickles.
#df = pds.read_pickle('ANN_dataset.pkl')
#df.head()

Note that `segment_time` is the label in this model 

In [4]:
# Name of the column in `df` that holds the regression target.
label_string = "segment_time"

One hot encode segments, normalize columns and create final dataset

In [5]:
# One-hot encode the segment id.
# The dtype is pinned explicitly: the default dtype of get_dummies differs
# between pandas versions (uint8 in older releases, bool in newer ones), and
# bool columns next to the float columns added below would turn `.values` into
# an object array that cannot be fed to the network.
dataset = pds.get_dummies(df['seg'], dtype=np.uint8)

# Cyclical encoding of the hour of day, so that 23h and 0h end up close together
hour_angle = df.timestamp.dt.hour*(2.*np.pi/24)
dataset['hr_sin'] = np.sin(hour_angle)
dataset['hr_cos'] = np.cos(hour_angle)

# Min-max scale the time from journey start to [0, 1]
tsjs_min, tsjs_max = df['tsjs'].min(), df['tsjs'].max()
dataset['tsjs'] = (df['tsjs'] - tsjs_min)/(tsjs_max - tsjs_min)

# Include journey number to select entire journeys
dataset['journey'] = df['journey']
dataset.head()

Unnamed: 0,Unnamed: 1,1,2,3,4,5,6,7,8,9,10,11,hr_sin,hr_cos,tsjs,journey
train,0,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,1,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,2,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,3,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,4,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1


Select ~20% test journeys

### If the data is not split into train and test, this chunk should be run
# Fraction of journeys held out for testing
divider = 0.2
# Sample from the journey ids that actually occur, so the split does not
# depend on the ids being sorted or contiguous.
journey_ids = dataset['journey'].unique()
num_journeys = len(journey_ids)
num_test_segments = np.int_(np.round(num_journeys*divider))
# replace=False guarantees `num_test_segments` distinct journeys; drawing with
# replacement would yield duplicates and therefore a smaller test set.
msk = np.random.choice(journey_ids, size=num_test_segments, replace=False)
# True for every row that belongs to a held-out journey
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

### If the data is not split into train and test, this chunk should be run
# Rows of held-out journeys form the test set, everything else the training set.
train_mask = ~mask

train_input, test_input = dataset[train_mask], dataset[mask]

label_column = df[label_string]
train_labels, test_labels = label_column[train_mask], label_column[mask]

test_idxs = test_input.index

In [6]:
# The pre-split frames were concatenated under the outer index keys
# 'train' and 'test'; select each partition by its key.
train_input, test_input = (dataset.loc[part] for part in ('train', 'test'))

label_column = df[label_string]
train_labels = label_column.loc['train']
test_labels = label_column.loc['test']

test_idxs = test_labels.index

In [7]:
# Journey are not needed up until this point
train_data = train_input.drop(columns=['journey'])
test_data = test_input.drop(columns=['journey'])

Build model

In [8]:
# Fully connected regression network. Hidden-layer widths are multiples of the
# number of input features.
n_features = train_data.shape[1]
model = keras.Sequential([
    keras.layers.Dense(2*n_features, activation=tf.nn.relu, input_shape=(n_features,)),
    keras.layers.Dense(n_features, activation=tf.nn.tanh),
    keras.layers.Dense(n_features),
    # ReLU on the single output unit keeps the prediction non-negative
    keras.layers.Dense(1, activation=tf.nn.relu)
])

optimizer = keras.optimizers.Adadelta()
model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 28)                420       
_________________________________________________________________
dense_1 (Dense)              (None, 14)                406       
_________________________________________________________________
dense_2 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 15        
Total params: 1,051
Trainable params: 1,051
Non-trainable params: 0
_________________________________________________________________
None


Fit model, 10% of all training data is used as validation data

In [9]:
# Number of passes over the training data
EPOCHS = 10

# Train on the feature matrix, holding back 10% of the training rows for validation.
model.fit(
    x=train_data.values,
    y=train_labels.values,
    batch_size=32,
    epochs=EPOCHS,
    validation_split=0.1,
    initial_epoch=0,
)

Train on 2382606 samples, validate on 264734 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1e9814ee0b8>

Evaluate model on all test data

In [10]:
# Returns [loss, mae]; both are the mean absolute error, since 'mae' is the loss and the only metric.
model.evaluate(x=test_data.values, y=test_labels.values)



[17.951296374361785, 17.951296374361785]

In [11]:
# Predicted segment_time for every test row
predictions = model.predict(x=test_data.values)

### Create dataset for evaluation

This dataset is evaluated in a special way: the time already spent since the start of the segment is also needed to turn a predicted segment time into an ETA. `tsss` = time since segment start, computed as `segment_time - time_left`.

In [12]:
# Time since segment start = total segment time - time left in the segment.
# The index is reset so later steps can combine it positionally with arrays.
time_left_test = df['time_left'].loc['test']
elapsed_in_segment = test_labels - time_left_test
tsss = elapsed_in_segment.reset_index(drop=True)

In [13]:
# Predicted time left in the segment: predicted total segment time minus the
# time already spent in the segment.
pred = predictions.flatten() - tsss
# A Series is labelled through `.name`; assigning to `.names` only creates an
# unrelated attribute and leaves the Series unnamed.
pred.name = 'pred'
pred.head()

0    185.085464
1    184.085464
2    183.085464
3    182.085464
4    181.085464
dtype: float64

In [14]:
# True time left in the segment: segment time minus time since segment start.
# Both operands are made positional so no index alignment takes place.
positional_labels = test_labels.reset_index(drop=True)
labels = positional_labels - tsss.values
labels.head()

0    188.0
1    187.0
2    186.0
3    185.0
4    184.0
Name: segment_time, dtype: float64

In [15]:
# Min-max scale speed over the full (train + test) frame, then keep only the
# test rows with a fresh positional index.
speed_min, speed_max = df['speed'].min(), df['speed'].max()
speed_scaled = (df['speed'] - speed_min) / (speed_max - speed_min)
speed = speed_scaled.loc['test'].reset_index(drop=True)
speed.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: speed, dtype: float64

In [16]:
# Segment id of every test row, re-indexed positionally
test_segments = df['seg'].loc['test']
segment = test_segments.reset_index(drop=True)
segment.head()

0    1
1    1
2    1
3    1
4    1
Name: seg, dtype: int64

Prepare data for the evaluation step

In [17]:
# Assemble the evaluation table in one go. `segment` and `labels` carry the
# positional index; the remaining columns are plain arrays of the same length.
output = pds.DataFrame(
    {
        'seg': segment,
        'journey': test_input['journey'].values,
        'speed': speed.values,
        'pred': pred.values,
        'label': labels,
    },
    columns=['seg', 'journey', 'speed', 'pred', 'label'],
)
output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,12,0.0,185.085464,188.0
1,1,12,0.0,184.085464,187.0
2,1,12,0.0,183.085464,186.0
3,1,12,0.0,182.085464,185.0
4,1,12,0.0,181.085464,184.0


In [19]:
# Common stem shared by every artefact written by this notebook
model_name = 'ANN_M1_203_benchmark'
architecture_path = model_name + '.yaml'
weights_path = model_name + '.h5'

# Architecture goes to YAML ...
# NOTE(review): to_yaml() depends on the installed TensorFlow version - confirm
# it is available in the environment this notebook is run with.
model_yaml = model.to_yaml()
with open(architecture_path, "w") as yaml_file:
    yaml_file.write(model_yaml)

# ... and the trained weights to HDF5
model.save_weights(weights_path)

In [20]:
# Persist the evaluation table next to the model files, using the same stem
predictions_path = model_name + '.pkl'
output.to_pickle(predictions_path)

Done! Get ready for evaluation