# Model creation and data split-up

This notebook will:
* Split data ito test and train data
* Train the model
* Save the model

Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds

Import data

In [2]:
df = pds.read_pickle('ANN_dataset.pkl')
df.head()

Unnamed: 0,timestamp,lon,lat,direction,speed,journey,seg,time_left,segment_time,tsjs
0,2018-02-16 04:48:40+01:00,58.414238,15.571015,147.300003,0.0,1,1,71.0,71.0,0.0
1,2018-02-16 04:48:41+01:00,58.414246,15.571012,147.300003,0.0,1,1,70.0,71.0,0.0
2,2018-02-16 04:48:42+01:00,58.414249,15.571008,147.300003,0.0,1,1,69.0,71.0,0.0
3,2018-02-16 04:48:43+01:00,58.414257,15.571004,147.300003,0.0,1,1,68.0,71.0,0.0
4,2018-02-16 04:48:44+01:00,58.414257,15.571006,147.300003,0.0,1,1,67.0,71.0,0.0


Note that `time_left` is the label in this model 

In [3]:
label_string = 'time_left'

One hot encode segments, normalize columns and create final dataset

In [4]:
# Make segments categorical
dataset = pds.get_dummies(df['seg'])

# Combine segments, timestamps and time from journey start

# Cyclical timestamps 
dataset['hr_sin'] = np.sin(df.timestamp.dt.hour*(2.*np.pi/24))
dataset['hr_cos'] = np.cos(df.timestamp.dt.hour*(2.*np.pi/24))

# Convert to radians befor trigonometric functions
dataset['dir_sin'] = np.sin(df.direction*(np.pi/180))
dataset['dir_cos'] = np.cos(df.direction*(np.pi/180))

dataset['speed'] = (df['speed']-df['speed'].min())/(df['speed'].max()-df['speed'].min())
dataset['tsjs'] = (df['tsjs']-df['tsjs'].min())/(df['tsjs'].max()-df['tsjs'].min())

dataset['lat'] = (df['lat']-df['lat'].min())/(df['lat'].max()-df['lat'].min())
dataset['lon'] = (df['lon']-df['lon'].min())/(df['lon'].max()-df['lon'].min())

# Include journey number to select entire journeys
dataset['journey'] = df['journey']

dataset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,hr_sin,hr_cos,dir_sin,dir_cos,speed,tsjs,lat,lon,journey
0,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.54024,-0.841511,0.0,0.0,0.326317,0.924521,1
1,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.54024,-0.841511,0.0,0.0,0.326237,0.925287,1
2,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.54024,-0.841511,0.0,0.0,0.326156,0.92567,1
3,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.54024,-0.841511,0.0,0.0,0.326076,0.926437,1
4,1,0,0,0,0,0,0,0,0,0,0,0.866025,0.5,0.54024,-0.841511,0.0,0.0,0.326116,0.926437,1


Add columns for the three latest positions for each sample. Replace the first sample from each journey with itself.

In [5]:
history_speed = pds.DataFrame(0, index=np.arange(len(dataset)), columns=['speed_one', 'speed_two', 'speed_three'])
history_pos = pds.DataFrame(0, index=np.arange(len(dataset)), columns=['lat_one', 'lon_one', 'lat_two', 'lon_two', 'lat_three', 'lon_three'])

In [6]:
for i, a in dataset.groupby('journey'):
    idxs = a.index

    history_pos.lat_one.iloc[idxs] = a.lat.shift(1)
    history_pos.lat_one.iloc[idxs[0]] = history_pos.lat_one.iloc[1]

    history_pos.lon_one.iloc[idxs] = a.lon.shift(1)
    history_pos.lon_one.iloc[idxs[0]] = history_pos.lon_one.iloc[1]

    history_pos.lat_two.iloc[idxs] = a.lat.shift(2)
    history_pos.lat_two.iloc[idxs[0]] = history_pos.lat_two.iloc[2]
    history_pos.lat_two.iloc[idxs[1]] = history_pos.lat_two.iloc[2]

    history_pos.lon_two.iloc[idxs] = a.lon.shift(2)
    history_pos.lon_two.iloc[idxs[0]] = history_pos.lon_two.iloc[2]
    history_pos.lon_two.iloc[idxs[1]] = history_pos.lon_two.iloc[2]

    history_pos.lat_three.iloc[idxs] = a.lat.shift(3)
    history_pos.lat_three.iloc[idxs[0]] = history_pos.lat_three.iloc[3]
    history_pos.lat_three.iloc[idxs[1]] = history_pos.lat_three.iloc[3]
    history_pos.lat_three.iloc[idxs[2]] = history_pos.lat_three.iloc[3]

    history_pos.lon_three.iloc[idxs] = a.lon.shift(3)
    history_pos.lon_three.iloc[idxs[0]] = history_pos.lon_three.iloc[3]
    history_pos.lon_three.iloc[idxs[1]] = history_pos.lon_three.iloc[3]
    history_pos.lon_three.iloc[idxs[2]] = history_pos.lon_three.iloc[3]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
for i, a in dataset.groupby('journey'):
    idxs = a.index

    history_speed.speed_one.iloc[idxs] = a.lat.shift(1)
    history_speed.speed_one.iloc[idxs[0]] = history_speed.speed_one.iloc[1]

    history_speed.speed_two.iloc[idxs] = a.lat.shift(2)
    history_speed.speed_two.iloc[idxs[0]] = history_speed.speed_two.iloc[2]
    history_speed.speed_two.iloc[idxs[1]] = history_speed.speed_two.iloc[2]

    history_speed.speed_three.iloc[idxs] = a.lat.shift(3)
    history_speed.speed_three.iloc[idxs[0]] = history_speed.speed_three.iloc[3]
    history_speed.speed_three.iloc[idxs[1]] = history_speed.speed_three.iloc[3]
    history_speed.speed_three.iloc[idxs[2]] = history_speed.speed_three.iloc[3]

Make sure there are no NaN:s produced by the shifting

In [8]:
history_pos.isna().sum()

lat_one      0
lon_one      0
lat_two      0
lon_two      0
lat_three    0
lon_three    0
dtype: int64

In [9]:
history_speed.isna().sum()

speed_one      0
speed_two      0
speed_three    0
dtype: int64

The two cells above should contain only zeros, otherwise, model training will fail!

In [10]:
history_pos.head()

Unnamed: 0,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three
0,0.326317,0.924521,0.326317,0.924521,0.326317,0.924521
1,0.326317,0.924521,0.326317,0.924521,0.326317,0.924521
2,0.326237,0.925287,0.326317,0.924521,0.326317,0.924521
3,0.326156,0.92567,0.326237,0.925287,0.326317,0.924521
4,0.326076,0.926437,0.326156,0.92567,0.326237,0.925287


In [11]:
history_speed.head()

Unnamed: 0,speed_one,speed_two,speed_three
0,0.326317,0.326317,0.326317
1,0.326317,0.326317,0.326317
2,0.326237,0.326317,0.326317
3,0.326156,0.326237,0.326317
4,0.326076,0.326156,0.326237


Looking good! Add to dataset

In [14]:
new_data = pds.concat([history_pos, history_speed], axis=1)
dataset = pds.concat([dataset, new_data], axis=1)
dataset.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,speed_three,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three.1
0,1,0,0,0,0,0,0,0,0,0,...,0.326317,0.326317,0.924521,0.326317,0.924521,0.326317,0.924521,0.326317,0.326317,0.326317
1,1,0,0,0,0,0,0,0,0,0,...,0.326317,0.326317,0.924521,0.326317,0.924521,0.326317,0.924521,0.326317,0.326317,0.326317
2,1,0,0,0,0,0,0,0,0,0,...,0.326317,0.326237,0.925287,0.326317,0.924521,0.326317,0.924521,0.326237,0.326317,0.326317
3,1,0,0,0,0,0,0,0,0,0,...,0.326317,0.326156,0.92567,0.326237,0.925287,0.326317,0.924521,0.326156,0.326237,0.326317
4,1,0,0,0,0,0,0,0,0,0,...,0.326237,0.326076,0.926437,0.326156,0.92567,0.326237,0.925287,0.326076,0.326156,0.326237


The column names can also be listed as:

In [15]:
list(dataset)

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 'hr_sin',
 'hr_cos',
 'dir_sin',
 'dir_cos',
 'speed',
 'tsjs',
 'lat',
 'lon',
 'journey',
 'lat_one',
 'lon_one',
 'lat_two',
 'lon_two',
 'lat_three',
 'lon_three',
 'speed_one',
 'speed_two',
 'speed_three',
 'lat_one',
 'lon_one',
 'lat_two',
 'lon_two',
 'lat_three',
 'lon_three',
 'speed_one',
 'speed_two',
 'speed_three']

Select an arbitrary number of test journeys

In [16]:
divider = 0.2
num_journeys = dataset.journey.unique()[-1]
num_test_segments = np.int_(np.round(num_journeys*divider))
msk = np.random.randint(1, num_journeys, num_test_segments)
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

In [32]:
train_input = dataset[~mask]
test_input = dataset[mask]

train_labels = df[label_string][~mask]
test_labels = df[label_string][mask]

test_idxs = test_input.index

In [18]:
# Journey numbers were needed up until this point
train_data = train_input.drop(columns=['journey'])
test_data = test_input.drop(columns=['journey'])

Build model

In [25]:
model = keras.Sequential([
	keras.layers.Dense(2*len(train_data.columns), input_shape=(train_data.shape[1],)),
    keras.layers.Dense(2*len(train_data.columns), activation=tf.nn.tanh),
    keras.layers.Dense(1*len(train_data.columns)),
	keras.layers.Dense(1, activation=tf.nn.relu)
	])

optimizer = keras.optimizers.Adadelta()
model.compile(loss='mae', optimizer = optimizer, metrics=['mae'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 74)                2812      
_________________________________________________________________
dense_8 (Dense)              (None, 74)                5550      
_________________________________________________________________
dense_9 (Dense)              (None, 37)                2775      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 38        
Total params: 11,175
Trainable params: 11,175
Non-trainable params: 0
_________________________________________________________________
None


Fit model, 10% of the training data is used as validation set

In [26]:
EPOCHS=20
result = model.fit(train_data.values, 
                   train_labels.values, 
                   epochs = EPOCHS, 
                   validation_split=0.1, 
                   initial_epoch=0,
                   batch_size=64
                  )

Train on 2432621 samples, validate on 270292 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Evaluate model on test data

In [27]:
model.evaluate(test_data.values, test_labels.values)



[11.088943287898411, 11.088943287898411]

Prepare data for evaluation step

In [29]:
predictions = model.predict(test_data.values)

### Create dataset for evaluation

In [38]:
segment = df['seg'].loc[test_idxs].reset_index().drop(columns=['index'])

In [39]:
output = pds.DataFrame()
output['seg'] = segment['seg']
output['journey'] = test_input['journey'].values
output['speed'] = test_data.speed.values
output['pred'] = predictions
output['label'] = test_labels.values
output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,5,0.0,127.258392,193.0
1,1,5,0.0,127.29985,192.0
2,1,5,0.0,127.319183,191.0
3,1,5,0.0,127.328079,190.0
4,1,5,0.0,127.330658,189.0


In [40]:
# Same name for all files
model_name = 'ANN_M3'

# serialize model to YAML
model_yaml = model.to_yaml()
with open(model_name + '.yaml', 'w') as yaml_file:
    yaml_file.write(model_yaml)

# Serialize weights to HDF5
model.save_weights(model_name + '.h5')

In [41]:
output.to_pickle(model_name + '.pkl')

Done! Get ready for evaluation