# Model creation and data split-up

This notebook will:
* Split data into test and train data
* Train the model
* Save the model

Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds

Import data

In [2]:
# Read the pre-split train and test frames, then stack them into a single
# frame whose outer index level ('train' / 'test') records where a row came from.
df_tr, df_te = (
    pds.read_pickle(pickle_path)
    for pickle_path in ('ANN_train_203_downsampled.pkl', 'ANN_test_203_downsampled.pkl')
)
df = pds.concat([df_tr, df_te], keys=['train', 'test'])
df.head()

Unnamed: 0,Unnamed: 1,direction,journey,lat,lon,seg,speed,timestamp,time_left,segment_time,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
train,0,129.852944,1,15.571012,58.414258,1,0.0,2018-02-16 04:48:46+01:00,65.0,65.0,0.0,15.571012,58.414258,15.571012,58.414258,15.571012,58.414258,15.571012,15.571012,15.571012
train,1,142.963046,1,15.571246,58.41428,1,3.08,2018-02-16 04:49:05+01:00,46.0,65.0,0.0,15.571012,58.414258,15.571012,58.414258,15.571012,58.414258,15.571012,15.571012,15.571012
train,2,287.399994,1,15.571079,58.414436,1,4.37,2018-02-16 04:49:17+01:00,34.0,65.0,0.0,15.571246,58.41428,15.571012,58.414258,15.571012,58.414258,15.571246,15.571012,15.571012
train,3,289.790009,1,15.570891,58.414459,1,6.73,2018-02-16 04:49:18+01:00,33.0,65.0,0.0,15.571079,58.414436,15.571246,58.41428,15.571012,58.414258,15.571079,15.571246,15.571012
train,4,288.200012,1,15.570773,58.414478,1,7.3,2018-02-16 04:49:19+01:00,32.0,65.0,0.0,15.570891,58.414459,15.571079,58.414436,15.571246,58.41428,15.570891,15.571079,15.571246


In [3]:
# Disabled alternative input: reads one combined frame from 'ANN_dataset.pkl'
# instead of the two pre-split pickles loaded in the previous cell.
# NOTE(review): presumably meant for the random journey split shown further
# down -- confirm whether this cell is still needed or can be deleted.
#df = pds.read_pickle('ANN_dataset.pkl')
#df.head()

Note that `segment_time` is the label in this model 

In [4]:
# Column of `df` that holds the regression target for this model.
label_string = "segment_time"

One hot encode segments, normalize columns and create final dataset

In [5]:
# One-hot encode the segment id: one indicator column per distinct segment.
dataset = pds.get_dummies(df['seg'])

# Cyclical encoding of the hour of day, so that hour 23 and hour 0 end up
# next to each other in feature space.
hour_angle = df.timestamp.dt.hour * (2. * np.pi / 24)
dataset['hr_sin'] = np.sin(hour_angle)
dataset['hr_cos'] = np.cos(hour_angle)

# Min-max scale `tsjs` (time from journey start) with statistics taken from
# the training rows only. Using min/max over train+test would leak test-set
# information into the model inputs. Test values outside the training range
# simply land outside [0, 1].
tsjs_train = df['tsjs'].loc['train']
tsjs_min, tsjs_max = tsjs_train.min(), tsjs_train.max()
dataset['tsjs'] = (df['tsjs'] - tsjs_min) / (tsjs_max - tsjs_min)

# Keep the journey number so whole journeys can be selected later; it is not
# meant as a model feature.
dataset['journey'] = df['journey']
dataset.head()

Unnamed: 0,Unnamed: 1,1,2,3,4,5,6,7,8,9,10,11,hr_sin,hr_cos,tsjs,journey
train,0,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,1,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,2,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,3,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1
train,4,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.0,1


Select ~20% test journeys

# Fraction of journeys to hold out as test data.
divider = 0.2

# Work on the journey ids that actually occur, rather than assuming the ids
# run contiguously from 1 up to the last value returned by unique().
journey_ids = dataset['journey'].unique()
num_journeys = len(journey_ids)
num_test_segments = int(round(num_journeys * divider))

# Sample without replacement so exactly `num_test_segments` distinct journeys
# are held out. The previous randint(1, num_journeys, ...) draw could repeat
# ids (shrinking the test set) and could never select the last journey,
# because randint's upper bound is exclusive.
# NOTE(review): no random seed is set in the visible cells, so the selection
# differs from run to run.
msk = np.random.choice(journey_ids, size=num_test_segments, replace=False)

# Row mask: True for every sample that belongs to a held-out journey.
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

# Rows of the selected journeys become the test split; all other rows train.
train_input, test_input = dataset[~mask], dataset[mask]

# Pick the label column for the same two row sets.
train_labels = df.loc[~mask, label_string]
test_labels = df.loc[mask, label_string]

# Index of the test rows.
test_idxs = test_input.index

In [6]:
# Use the split that is already encoded in the outer index level
# ('train' / 'test') created when the two pickles were concatenated.
train_input, test_input = dataset.loc['train'], dataset.loc['test']

# Select the label column first, then the split.
train_labels = df[label_string].loc['train']
test_labels = df[label_string].loc['test']

# Index of the test rows.
test_idxs = test_labels.index

In [7]:
# Journey are not needed up until this point
train_data = train_input.drop(columns=['journey'])
test_data = test_input.drop(columns=['journey'])

Build model

In [8]:
# Fully connected regression network whose layer widths are derived from the
# number of input features.
n_features = train_data.shape[1]

model = keras.Sequential([
    keras.layers.Dense(2 * n_features, activation=tf.nn.relu, input_shape=(n_features,)),
    keras.layers.Dense(n_features, activation=tf.nn.tanh),
    keras.layers.Dense(n_features),                # no activation argument
    keras.layers.Dense(1, activation=tf.nn.relu),  # single ReLU output unit
])

# MAE is used both as the loss and as the reported metric, which is why
# evaluate() later returns the same number twice.
optimizer = keras.optimizers.Adadelta()
model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 28)                420       
_________________________________________________________________
dense_1 (Dense)              (None, 14)                406       
_________________________________________________________________
dense_2 (Dense)              (None, 14)                210       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 15        
Total params: 1,051
Trainable params: 1,051
Non-trainable params: 0
_________________________________________________________________
None


Fit the model; 10% of the training data is held out as validation data (Keras takes it from the end of the arrays passed to `fit`).

In [9]:
# Number of passes over the training data.
EPOCHS = 10

# Train on the raw numpy arrays; 10% of the samples are set aside for
# validation. The returned History object is left as the cell's output.
model.fit(
    x=train_data.values,
    y=train_labels.values,
    batch_size=32,
    epochs=EPOCHS,
    initial_epoch=0,
    validation_split=0.1,
)

Train on 1354825 samples, validate on 150537 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1870f39ebe0>

Evaluate model on all test data

In [10]:
# Score the trained model on the full test split (returns loss and metric).
model.evaluate(x=test_data.values, y=test_labels.values)



[12.566030320766558, 12.566030320766558]

In [11]:
# Model output for every test row; flattened to 1-D further down.
predictions = model.predict(x=test_data.values)

### Create dataset for evaluation

This dataset is evaluated in a special way. The time spent from the start of the segment is also needed to predict ETA. `tsss` = time since segment start

In [12]:
# Time since segment start = total segment time minus the time still left,
# re-indexed 0..n-1 so it lines up positionally with the predictions.
tsss = test_labels.sub(df['time_left'].loc['test']).reset_index(drop=True)

In [13]:
# Predicted time left in the segment: predicted total segment time minus the
# time already spent in it. The result is a Series carrying `tsss`'s index.
pred = predictions.flatten() - tsss

# Series expose `name`, not `names`; the old `pred.names = 'pred'` only set an
# unrelated attribute and left the Series unnamed.
pred.name = 'pred'
pred.head()

0    58.444424
1    49.444424
2    40.444424
3    39.444424
4    38.444424
dtype: float64

In [14]:
# Ground-truth counterpart of `pred`: segment time minus time since segment
# start. `.values` makes the subtraction purely positional.
labels = test_labels.reset_index(drop=True).sub(tsss.values)
labels.head()

0    55.0
1    46.0
2    37.0
3    36.0
4    35.0
Name: segment_time, dtype: float64

In [15]:
# Min-max scale speed over the whole frame, then keep only the test rows with
# a fresh 0..n-1 index.
speed_min, speed_max = df['speed'].min(), df['speed'].max()
speed = (df['speed'] - speed_min) / (speed_max - speed_min)
speed = speed.loc['test'].reset_index(drop=True)
speed.head()

0    0.199897
1    0.250902
2    0.289026
3    0.329212
4    0.371458
Name: speed, dtype: float64

In [16]:
# Segment id of every test row, re-indexed 0..n-1.
segment = df['seg'].loc['test']
segment = segment.reset_index(drop=True)
segment.head()

0    1
1    1
2    1
3    1
4    1
Name: seg, dtype: int64

Prepare data for the evaluation step

In [17]:
# Collect everything the evaluation step needs into one frame, one row per
# test sample. `segment` and `labels` share the same 0..n-1 index; the other
# columns are plain arrays and are placed positionally.
output = pds.DataFrame(
    {
        'seg': segment,
        'journey': test_input['journey'].values,
        'speed': speed.values,
        'pred': pred.values,
        'label': labels,
    },
    columns=['seg', 'journey', 'speed', 'pred', 'label'],
)
output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,12,0.199897,58.444424,55.0
1,1,12,0.250902,49.444424,46.0
2,1,12,0.289026,40.444424,37.0
3,1,12,0.329212,39.444424,36.0
4,1,12,0.371458,38.444424,35.0


In [18]:
# Common base name shared by every file written for this model.
model_name = 'ANN_M1_downsampled'
yaml_path = model_name + '.yaml'
weights_path = model_name + '.h5'

# Architecture -> YAML text file.
# NOTE(review): newer TensorFlow releases no longer support Model.to_yaml();
# confirm the installed version before re-running this cell.
model_yaml = model.to_yaml()
with open(yaml_path, mode='w') as yaml_file:
    yaml_file.write(model_yaml)

# Weights -> HDF5 file.
model.save_weights(weights_path)

In [19]:
# Store the evaluation frame under the same base name as the model files.
evaluation_path = model_name + '.pkl'
output.to_pickle(evaluation_path)

Done! Get ready for evaluation