# Model creation and data split-up

This notebook will:
* Split data into test and train data
* Train the model
* Save the model

Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds
from sklearn import preprocessing as pp

Import data

In [2]:
# Locations of the pre-split, downsampled pickles (relative to the notebook)
TRAIN_PICKLE = 'ANN_train_203_downsampled.pkl'
TEST_PICKLE = 'ANN_test_203_downsampled.pkl'

df_tr = pds.read_pickle(TRAIN_PICKLE)
df_te = pds.read_pickle(TEST_PICKLE)

# Stack both splits into one frame; the outer index level ('train' / 'test')
# records which file each row came from so the split can be recovered later.
df = pds.concat([df_tr, df_te], keys=['train', 'test'])
df.head()

Unnamed: 0,Unnamed: 1,direction,journey,lat,lon,seg,speed,timestamp,time_left,segment_time,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
train,0,129.852944,1,0.32503,0.926476,1,0.05152,2018-02-16 04:48:46+01:00,65.0,65.0,0.0,0.32503,0.926476,0.32503,0.926476,0.32503,0.926476,0.32503,0.32503,0.32503
train,1,142.963046,1,0.329938,0.928692,1,0.210201,2018-02-16 04:49:05+01:00,46.0,65.0,0.0,0.32503,0.926476,0.32503,0.926476,0.32503,0.926476,0.32503,0.32503,0.32503
train,2,287.399994,1,0.326437,0.944423,1,0.276662,2018-02-16 04:49:17+01:00,34.0,65.0,0.0,0.329938,0.928692,0.32503,0.926476,0.32503,0.926476,0.329938,0.32503,0.32503
train,3,289.790009,1,0.322491,0.946723,1,0.398248,2018-02-16 04:49:18+01:00,33.0,65.0,0.0,0.326437,0.944423,0.329938,0.928692,0.32503,0.926476,0.326437,0.329938,0.32503
train,4,288.200012,1,0.320008,0.948639,1,0.427615,2018-02-16 04:49:19+01:00,32.0,65.0,0.0,0.322491,0.946723,0.326437,0.944423,0.329938,0.928692,0.322491,0.326437,0.329938


In [3]:
# NOTE(review): disabled alternative that would load a single 'ANN_dataset.pkl'
# into df; the active load cell reads separate train/test pickles instead.
# Presumably superseded -- confirm and delete this cell.
#df = pds.read_pickle('ANN_dataset.pkl')
#df.head()

Note that `time_left` is the label in this model 

In [3]:
# Name of the df column that the split cells read to build train_labels / test_labels
label_string = 'time_left'

One-hot encode the segments, normalize columns and create the final dataset

In [4]:
# Start the feature matrix from the one-hot encoded segment id (one column per segment)
dataset = pds.get_dummies(df['seg'])

# NOTE(review): this scaler is created but not used by any visible cell
mm = pp.MinMaxScaler()

# Cyclical hour-of-day: map the hour onto the unit circle so 23h and 0h are neighbours
hour_angle = df.timestamp.dt.hour*(2.*np.pi/24)
dataset['hr_sin'] = np.sin(hour_angle)
dataset['hr_cos'] = np.cos(hour_angle)

# Direction: convert degrees to radians before the trigonometric functions
direction_rad = df.direction*(np.pi/180)
dataset['dir_sin'] = np.sin(direction_rad)
dataset['dir_cos'] = np.cos(direction_rad)

# Min-max scale tsjs to [0, 1]; min and max are taken over the whole frame,
# i.e. over the 'train' and 'test' rows together.
tsjs_min = df['tsjs'].min()
tsjs_max = df['tsjs'].max()
dataset['tsjs'] = (df['tsjs']-tsjs_min)/(tsjs_max-tsjs_min)

# Columns copied over unchanged. The list order is the resulting column order.
# NOTE(review): in the df.head() output the speed_one/two/three values coincide
# with lat_one/two/three -- looks like an upstream data issue; verify.
# 'journey' comes last: it is only kept to select entire journeys and is
# dropped again before training.
passthrough_columns = [
    'lat', 'lon', 'speed',
    'lat_one', 'lon_one',
    'lat_two', 'lon_two',
    'lat_three', 'lon_three',
    'speed_one', 'speed_two', 'speed_three',
    'journey',
]
for column in passthrough_columns:
    dataset[column] = df[column]

dataset.head()

Unnamed: 0,Unnamed: 1,1,2,3,4,5,6,7,8,9,10,...,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three,journey
train,0,1,0,0,0,0,0,0,0,0,0,...,0.32503,0.926476,0.32503,0.926476,0.32503,0.926476,0.32503,0.32503,0.32503,1
train,1,1,0,0,0,0,0,0,0,0,0,...,0.32503,0.926476,0.32503,0.926476,0.32503,0.926476,0.32503,0.32503,0.32503,1
train,2,1,0,0,0,0,0,0,0,0,0,...,0.329938,0.928692,0.32503,0.926476,0.32503,0.926476,0.329938,0.32503,0.32503,1
train,3,1,0,0,0,0,0,0,0,0,0,...,0.326437,0.944423,0.329938,0.928692,0.32503,0.926476,0.326437,0.329938,0.32503,1
train,4,1,0,0,0,0,0,0,0,0,0,...,0.322491,0.946723,0.326437,0.944423,0.329938,0.928692,0.322491,0.326437,0.329938,1


Randomly select a fraction (`divider`) of the journeys as test journeys

# Fraction of the journeys that should end up in the test split
divider = 0.2
# Fixed seed so the same journeys are selected on every run
SPLIT_SEED = 42

# Work with the journey ids that actually occur in the data. The previous code
# used the last-seen id as if it were the journey count and drew ids with
# replacement from [1, last_id), which could repeat ids, pick ids that do not
# exist and never pick the upper id -- so the test share was not really `divider`.
journey_ids = dataset['journey'].unique()
num_journeys = len(journey_ids)
num_test_segments = int(np.round(num_journeys*divider))

# A local generator keeps the global NumPy random state untouched
split_rng = np.random.RandomState(SPLIT_SEED)
msk = split_rng.choice(journey_ids, size=num_test_segments, replace=False)

# True for every row that belongs to one of the selected test journeys
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

# Rows of the selected journeys form the test split, all remaining rows the training split
train_input, test_input = dataset[~mask], dataset[mask]

# Labels are read from the source frame and filtered with the same row mask
all_labels = df[label_string]
train_labels, test_labels = all_labels[~mask], all_labels[mask]

# Index of the test rows
test_idxs = test_input.index

In [5]:
# Split along the outer index level ('train' / 'test') set when the two pickles
# were concatenated. These assignments replace any train_input / test_input /
# train_labels / test_labels / test_idxs produced by the mask-based split.
train_input, test_input = dataset.loc['train'], dataset.loc['test']

train_frame, test_frame = df.loc['train'], df.loc['test']
train_labels = train_frame[label_string]
test_labels = test_frame[label_string]

# Index of the test rows (inner index level of the 'test' part)
test_idxs = test_labels.index

In [6]:
# Journey numbers were needed up until this point
train_data = train_input.drop(columns=['journey'])
test_data = test_input.drop(columns=['journey'])

Build model

In [7]:
# Fully connected regression network; hidden-layer widths are multiples of the
# number of input features.
n_features = train_data.shape[1]

model = keras.Sequential([
    keras.layers.Dense(2*n_features, activation=tf.nn.relu, input_shape=(n_features,)),
    keras.layers.Dense(n_features, activation=tf.nn.tanh),
    keras.layers.Dense(n_features),                # no activation given -> linear layer
    keras.layers.Dense(1, activation=tf.nn.relu),  # ReLU output is never negative
])

optimizer = keras.optimizers.Adadelta()
# 'mae' is both the loss and the only metric, so fit/evaluate report the same value twice
model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 56)                1624      
_________________________________________________________________
dense_1 (Dense)              (None, 28)                1596      
_________________________________________________________________
dense_2 (Dense)              (None, 28)                812       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 29        
Total params: 4,061
Trainable params: 4,061
Non-trainable params: 0
_________________________________________________________________
None


Fit the model; 10% of the training data is held out as the validation set

In [8]:
# Number of passes over the training data
EPOCHS = 10

# validation_split=0.1 holds out 10% of the rows for validation.
# NOTE(review): per the Keras docs the held-out part is taken from the END of
# the arrays before shuffling, so it is not a random sample -- confirm this is intended.
result = model.fit(
    x=train_data.values,
    y=train_labels.values,
    batch_size=32,
    epochs=EPOCHS,
    validation_split=0.1,
    initial_epoch=0,
)

Train on 1354825 samples, validate on 150537 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Evaluate model on test data

In [9]:
# Loss and metric on the test split (both are MAE, see the compile call)
model.evaluate(x=test_data.values, y=test_labels.values)



[3.50613035198306, 3.50613035198306]

Prepare data for evaluation step

In [10]:
# Model output for every row of the test split
predictions = model.predict(x=test_data.values)

### Create dataset for evaluation

In [11]:
# Segment id of every test row, re-indexed 0..n-1 so it lines up with the
# plain arrays used for the other evaluation columns
test_rows = df.loc['test']
segment = test_rows['seg'].reset_index(drop=True)

In [12]:
# Evaluation table: one row per test sample, built in a single constructor call.
# `segment` supplies the index; the other columns are plain arrays aligned by position.
output_columns = ['seg', 'journey', 'speed', 'pred', 'label']
output = pds.DataFrame(
    {
        'seg': segment,
        'journey': test_input['journey'].values,
        'speed': test_data['speed'].values,
        # flatten the model output to one value per row
        'pred': predictions.ravel(),
        'label': test_labels.values,
    },
    columns=output_columns,  # pin the column order explicitly
)
output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,12,0.213421,50.560806,55.0
1,1,12,0.267877,47.710388,46.0
2,1,12,0.308581,35.432022,37.0
3,1,12,0.351485,34.829105,36.0
4,1,12,0.39659,33.883369,35.0


In [13]:
# Same name for all files
model_name = 'ANN_M3_203_downsampled_benchmark'

# serialize model to YAML
model_yaml = model.to_yaml()
with open(model_name + '.yaml', 'w') as yaml_file:
    yaml_file.write(model_yaml)

# Serialize weights to HDF5
model.save_weights(model_name + '.h5')

In [14]:
# Store the evaluation table under the same file stem as the model files
predictions_path = model_name + '.pkl'
output.to_pickle(predictions_path)

Done! Get ready for evaluation