# Model creation and data split-up

This notebook will:
* Split data into test and train data
* Train the model
* Save the model

Import packages

In [16]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds
from sklearn import preprocessing as pp

Import data

In [2]:
# Read the pre-split training and test frames from disk
df_tr = pds.read_pickle('ANN_data_train.pkl')
df_te = pds.read_pickle('ANN_data_test.pkl')

# Stack both splits into one frame; the outer index level ('train'/'test')
# records which split each row came from so they can be separated again later
split_frames = [df_tr, df_te]
split_keys = ['train', 'test']
df = pds.concat(split_frames, keys=split_keys)

df.head()

Unnamed: 0,Unnamed: 1,timestamp,lon,lat,direction,speed,journey,seg,time_left,segment_time,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
train,0,2018-02-16 04:48:40+01:00,58.414238,15.571015,147.300003,0.0,1,1,71.0,71.0,0.0,15.571015,58.414238,15.571015,58.414238,15.571015,58.414238,15.571015,15.571015,15.571015
train,1,2018-02-16 04:48:41+01:00,58.414246,15.571012,147.300003,0.0,1,1,70.0,71.0,0.0,15.571015,58.414238,15.571015,58.414238,15.571015,58.414238,15.571015,15.571015,15.571015
train,2,2018-02-16 04:48:42+01:00,58.414249,15.571008,147.300003,0.0,1,1,69.0,71.0,0.0,15.571012,58.414246,15.571015,58.414238,15.571015,58.414238,15.571012,15.571015,15.571015
train,3,2018-02-16 04:48:43+01:00,58.414257,15.571004,147.300003,0.0,1,1,68.0,71.0,0.0,15.571008,58.414249,15.571012,58.414246,15.571015,58.414238,15.571008,15.571012,15.571015
train,4,2018-02-16 04:48:44+01:00,58.414257,15.571006,147.300003,0.0,1,1,67.0,71.0,0.0,15.571004,58.414257,15.571008,58.414249,15.571012,58.414246,15.571004,15.571008,15.571012


In [3]:
# Disabled alternative, kept for reference: presumably loads one unsplit
# dataset instead of the separate train/test pickles — TODO confirm or remove.
#df = pds.read_pickle('ANN_dataset.pkl')
#df.head()

Note that `time_left` is the label in this model 

In [4]:
# Name of the column the network is trained to predict
label_string = "time_left"

One-hot encode segments, normalize columns and create the final dataset

In [26]:
# One-hot encode the segment id. The cast pins the indicator columns to a
# small integer dtype: newer pandas releases return booleans from
# get_dummies, and mixing bool with float columns turns `.values` into an
# object array that cannot be fed to the network.
dataset = pds.get_dummies(df['seg']).astype(np.uint8)

# Scaler reused (re-fitted) for every lagged feature column below
mm = pp.MinMaxScaler()

# Cyclical encoding of the hour of day, so that 23h and 0h end up close
hour_angle = df.timestamp.dt.hour*(2.*np.pi/24)
dataset['hr_sin'] = np.sin(hour_angle)
dataset['hr_cos'] = np.cos(hour_angle)

# Convert degrees to radians before the trigonometric functions
direction_rad = df.direction*(np.pi/180)
dataset['dir_sin'] = np.sin(direction_rad)
dataset['dir_cos'] = np.cos(direction_rad)

# NOTE(review): every min/max below is taken over all rows of `df`. If `df`
# holds both training and test rows, test-set statistics leak into the
# features — consider fitting the scaling on the training rows only.

# Min-max scale the current-sample features to [0, 1]
for col in ['speed', 'tsjs', 'lat', 'lon']:
    col_min = df[col].min()
    col_max = df[col].max()
    dataset[col] = (df[col]-col_min)/(col_max-col_min)

# Min-max scale the lagged features; the scaler is fitted per column
lagged_columns = [
    'lat_one', 'lon_one',
    'lat_two', 'lon_two',
    'lat_three', 'lon_three',
    'speed_one', 'speed_two', 'speed_three',
]
for col in lagged_columns:
    dataset[col] = mm.fit_transform(df[col].values.reshape(-1,1))

# Include journey number to select entire journeys
dataset['journey'] = df['journey']

dataset.head()

Unnamed: 0,Unnamed: 1,1,2,3,4,5,6,7,8,9,10,...,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three,journey
train,0,1,0,0,0,0,0,0,0,0,0,...,0.326606,0.924521,0.327034,0.924521,0.327489,0.924521,0.326606,0.327034,0.327489,1
train,1,1,0,0,0,0,0,0,0,0,0,...,0.326606,0.924521,0.327034,0.924521,0.327489,0.924521,0.326606,0.327034,0.327489,1
train,2,1,0,0,0,0,0,0,0,0,0,...,0.326526,0.925287,0.327034,0.924521,0.327489,0.924521,0.326526,0.327034,0.327489,1
train,3,1,0,0,0,0,0,0,0,0,0,...,0.326445,0.92567,0.326953,0.925287,0.327489,0.924521,0.326445,0.326953,0.327489,1
train,4,1,0,0,0,0,0,0,0,0,0,...,0.326365,0.926437,0.326873,0.92567,0.327408,0.925287,0.326365,0.326873,0.327408,1


Select a random subset of journeys (a fraction `divider` of all journeys) to use as test data

# Fraction of the journeys that is held out for testing
divider = 0.2

# Work on the journey ids that actually occur instead of assuming they are
# the consecutive, ordered integers 1..N
journey_ids = dataset.journey.unique()
num_journeys = len(journey_ids)
num_test_segments = np.int_(np.round(num_journeys*divider))

# Draw distinct journey ids. The previous randint call sampled with
# replacement (duplicates shrank the test set below `divider`) and its
# exclusive upper bound meant the last journey could never be selected.
msk = np.random.choice(journey_ids, size=num_test_segments, replace=False)

# True for every row that belongs to one of the selected test journeys
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

# Rows flagged by `mask` form the test set, all remaining rows the training set
is_test = mask
train_input = dataset.loc[~is_test]
test_input = dataset.loc[is_test]

# Split the label column with the same row selection
labels = df[label_string]
train_labels = labels[~is_test]
test_labels = labels[is_test]

# Index of the rows that ended up in the test set
test_idxs = test_input.index

In [27]:
# Select each split by its key on the outer index level
train_input, test_input = dataset.loc['train'], dataset.loc['test']

# Same selection for the label column
labels = df[label_string]
train_labels, test_labels = labels.loc['train'], labels.loc['test']

# Index of the test rows
test_idxs = test_labels.index

In [28]:
# Journey numbers were needed up until this point
train_data = train_input.drop(columns=['journey'])
test_data = test_input.drop(columns=['journey'])

Build model

In [29]:
# Fully connected regression network whose hidden-layer widths are derived
# from the number of input features
n_features = train_data.shape[1]
model = keras.Sequential([
    keras.layers.Dense(2*n_features, activation=tf.nn.relu, input_shape=(n_features,)),
    keras.layers.Dense(n_features, activation=tf.nn.tanh),
    # No activation is given here, so this layer is linear
    keras.layers.Dense(n_features),
    # ReLU on the single output unit keeps predictions non-negative
    keras.layers.Dense(1, activation=tf.nn.relu),
])

optimizer = keras.optimizers.Adadelta()
model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 56)                1624      
_________________________________________________________________
dense_5 (Dense)              (None, 28)                1596      
_________________________________________________________________
dense_6 (Dense)              (None, 28)                812       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 29        
Total params: 4,061
Trainable params: 4,061
Non-trainable params: 0
_________________________________________________________________
None


Fit the model; 10% of the training data is used as the validation set

In [30]:
# Number of passes over the training data
EPOCHS = 30

# Training settings; validation_split=0.1 holds out 10% of the training rows
fit_options = dict(
    epochs=EPOCHS,
    validation_split=0.1,
    initial_epoch=0,
    batch_size=32,
)

# Keras is given plain NumPy arrays rather than pandas objects
result = model.fit(train_data.values, train_labels.values, **fit_options)

Train on 2384639 samples, validate on 264960 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Evaluate model on test data

In [31]:
# Score the trained network on the test split; the result is shown as the
# cell output
test_features = test_data.values
test_targets = test_labels.values
model.evaluate(test_features, test_targets)



[11.294563378343492, 11.294563378343492]

Prepare data for evaluation step

In [32]:
# Run the trained network on the test features
test_matrix = test_data.values
predictions = model.predict(test_matrix)

### Create dataset for evaluation

In [37]:
# Segment ids of the test rows, re-indexed from 0 so they can be combined
# positionally with the other per-row results
test_rows = df.loc['test']
segment = test_rows['seg'].reset_index(drop=True)

In [40]:
# Per-row results handed over to the evaluation step
output = pds.DataFrame({'seg': segment})

# Remaining columns are added in a fixed order, as plain arrays
result_columns = [
    ('journey', test_input['journey'].values),
    ('speed', test_data.speed.values),
    ('pred', predictions),
    ('label', test_labels.values),
]
for column_name, column_values in result_columns:
    output[column_name] = column_values

output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,14,0.0,127.827164,209.0
1,1,14,0.0,127.827164,208.0
2,1,14,0.0,127.827164,207.0
3,1,14,0.0,127.827164,206.0
4,1,14,0.0,127.827164,205.0


In [41]:
# All files written for this model share one base name
model_name = 'ANN_M3'
yaml_path = model_name + '.yaml'
weights_path = model_name + '.h5'

# Store the architecture as YAML
# NOTE(review): `to_yaml` may be unavailable in recent TensorFlow releases —
# confirm against the version this notebook is pinned to.
model_yaml = model.to_yaml()
with open(yaml_path, 'w') as yaml_file:
    yaml_file.write(model_yaml)

# Store the trained weights as HDF5
model.save_weights(weights_path)

In [42]:
# Persist the per-row results next to the model files
pickle_path = model_name + '.pkl'
output.to_pickle(pickle_path)

Done! Get ready for evaluation