# Model creation and data split-up

This notebook will:
* Split data into test and train data
* Train the model
* Save the model

Import packages

In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pds
from sklearn import preprocessing as pp

Import data

In [2]:
# Locations of the pre-split, downsampled train and test frames.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
train_path = 'ANN_train_203_downsampled.pkl'
test_path = 'ANN_test_203_downsampled.pkl'
df_tr, df_te = (pds.read_pickle(path) for path in (train_path, test_path))

# Stack both frames under an outer index level ('train' / 'test') so that
# either split can later be pulled back out with .loc
df = pds.concat([df_tr, df_te], keys=['train', 'test'])
df.head()

Unnamed: 0,Unnamed: 1,direction,journey,lat,lon,seg,speed,timestamp,time_left,segment_time,tsjs,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three
train,0,129.852944,1,15.571012,58.414258,1,0.0,2018-02-16 04:48:46+01:00,65.0,65.0,0.0,15.571012,58.414258,15.571012,58.414258,15.571012,58.414258,15.571012,15.571012,15.571012
train,1,142.963046,1,15.571246,58.41428,1,3.08,2018-02-16 04:49:05+01:00,46.0,65.0,0.0,15.571012,58.414258,15.571012,58.414258,15.571012,58.414258,15.571012,15.571012,15.571012
train,2,287.399994,1,15.571079,58.414436,1,4.37,2018-02-16 04:49:17+01:00,34.0,65.0,0.0,15.571246,58.41428,15.571012,58.414258,15.571012,58.414258,15.571246,15.571012,15.571012
train,3,289.790009,1,15.570891,58.414459,1,6.73,2018-02-16 04:49:18+01:00,33.0,65.0,0.0,15.571079,58.414436,15.571246,58.41428,15.571012,58.414258,15.571079,15.571246,15.571012
train,4,288.200012,1,15.570773,58.414478,1,7.3,2018-02-16 04:49:19+01:00,32.0,65.0,0.0,15.570891,58.414459,15.571079,58.414436,15.571246,58.41428,15.570891,15.571079,15.571246


In [3]:
# Disabled: loads a single combined pickle into `df` instead of the separate
# train/test pickles. NOTE(review): presumably meant to be used together with
# the random journey split further down -- confirm or delete this cell.
#df = pds.read_pickle('ANN_dataset.pkl')
#df.head()

Note that `time_left` is the label in this model 

In [4]:
# Name of the column in `df` that is used as the regression label
label_string = 'time_left'

One hot encode segments, normalize columns and create final dataset

In [5]:
# One-hot encode the segment id.
# The dtype is pinned explicitly: pandas >= 2.0 produces bool dummies by
# default, and bool columns mixed with float columns turn `.values` into an
# object array that Keras cannot train on. uint8 matches the old default.
dataset = pds.get_dummies(df['seg'], dtype=np.uint8)

# Cyclical encoding of the hour of day, so 23:00 and 00:00 end up close together
hour_angle = df.timestamp.dt.hour * (2. * np.pi / 24)
dataset['hr_sin'] = np.sin(hour_angle)
dataset['hr_cos'] = np.cos(hour_angle)

# Direction is converted from degrees to radians before the trigonometric functions
direction_rad = df.direction * (np.pi / 180)
dataset['dir_sin'] = np.sin(direction_rad)
dataset['dir_cos'] = np.cos(direction_rad)

# Columns that are min-max scaled to [0, 1], in the order they are appended
scaled_columns = [
    'speed', 'tsjs',
    'lat', 'lon',
    'lat_one', 'lon_one',
    'lat_two', 'lon_two',
    'lat_three', 'lon_three',
    'speed_one', 'speed_two', 'speed_three',
]

# NOTE(review): the scaler is fitted on train and test rows together, so the
# test set's min/max leak into the training features. Left unchanged here to
# keep the features comparable with earlier runs; consider fitting on
# df.loc['train'] only.
# NOTE(review): in the df.head() output the speed_one/two/three columns carry
# the same values as the latitude columns -- looks like an upstream data
# preparation problem, confirm.
mm = pp.MinMaxScaler()
for column in scaled_columns:
    # fit_transform expects and returns a 2-D array; flatten the result so a
    # plain 1-D column is stored. Using the scaler for every column also
    # avoids the 0/0 -> NaN a hand-written formula gives for a constant column.
    dataset[column] = mm.fit_transform(df[column].values.reshape(-1, 1)).ravel()

# Include journey number (unscaled) to be able to select entire journeys
dataset['journey'] = df['journey']

dataset.head()

Unnamed: 0,Unnamed: 1,1,2,3,4,5,6,7,8,9,10,...,lat_one,lon_one,lat_two,lon_two,lat_three,lon_three,speed_one,speed_two,speed_three,journey
train,0,1,0,0,0,0,0,0,0,0,0,...,0.327986,0.926504,0.328617,0.926504,0.329531,0.926504,0.327986,0.328617,0.329531,1
train,1,1,0,0,0,0,0,0,0,0,0,...,0.327986,0.926504,0.328617,0.926504,0.329531,0.926504,0.327986,0.328617,0.329531,1
train,2,1,0,0,0,0,0,0,0,0,0,...,0.332938,0.928719,0.328617,0.926504,0.329531,0.926504,0.332938,0.328617,0.329531,1
train,3,1,0,0,0,0,0,0,0,0,0,...,0.329406,0.944444,0.333579,0.928719,0.329531,0.926504,0.329406,0.333579,0.329531,1
train,4,1,0,0,0,0,0,0,0,0,0,...,0.325424,0.946743,0.330039,0.944444,0.334507,0.928719,0.325424,0.330039,0.334507,1


Select an arbitrary number of test journeys

# Fraction of the journeys that is held out for testing
divider = 0.2
# Fixed seed so the same journeys are selected on every run
SPLIT_SEED = 42

# Work with the journey ids that actually occur instead of assuming that they
# are the contiguous range 1..N and that the last id seen is the largest one.
journey_ids = dataset['journey'].unique()
num_journeys = len(journey_ids)
num_test_segments = int(np.round(num_journeys * divider))

# Draw without replacement: duplicate draws would silently shrink the test
# set, and randint's exclusive upper bound could never pick the last journey.
rng = np.random.RandomState(SPLIT_SEED)
msk = rng.choice(journey_ids, size=num_test_segments, replace=False)

# Boolean row mask: True for every row that belongs to a held-out journey
mask = dataset['journey'].isin(msk)

Divide training and test data, where data is a sequence from a journey

# Rows of the selected journeys form the test set, all remaining rows the training set
is_test = mask
is_train = ~mask

train_input, test_input = dataset[is_train], dataset[is_test]

# Labels are taken from the unscaled source frame using the same row masks
all_labels = df[label_string]
train_labels, test_labels = all_labels[is_train], all_labels[is_test]

# Index of the test rows
test_idxs = test_input.index

In [6]:
# Split along the outer index level ('train' / 'test') that was attached when
# the two input frames were concatenated. These assignments replace any
# earlier values bound to the same names.
train_input, test_input = (dataset.loc[part] for part in ('train', 'test'))

label_column = df[label_string]
train_labels, test_labels = (label_column.loc[part] for part in ('train', 'test'))

# Index of the test rows
test_idxs = test_labels.index

In [7]:
# Journey numbers were needed up until this point
train_data = train_input.drop(columns=['journey'])
test_data = test_input.drop(columns=['journey'])

Build model

In [8]:
# Fully connected regression network; the layer widths are derived from the
# number of input features.
n_features = train_data.shape[1]

model = keras.Sequential([
    keras.layers.Dense(2 * n_features, activation=tf.nn.relu, input_shape=(n_features,)),
    keras.layers.Dense(n_features, activation=tf.nn.tanh),
    keras.layers.Dense(n_features),  # no activation passed
    keras.layers.Dense(1, activation=tf.nn.relu),  # single output value, clipped at zero by the ReLU
])

optimizer = keras.optimizers.Adadelta()
# 'mae' is both the loss and the only metric, so evaluate() returns the same number twice
model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 56)                1624      
_________________________________________________________________
dense_1 (Dense)              (None, 28)                1596      
_________________________________________________________________
dense_2 (Dense)              (None, 28)                812       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 29        
Total params: 4,061
Trainable params: 4,061
Non-trainable params: 0
_________________________________________________________________
None


Fit the model; 10% of the training data is used as the validation set

In [9]:
# Number of passes over the training data
EPOCHS = 10

# Training settings; validation_split=0.1 makes Keras set aside 10% of the
# rows passed to fit() for validation.
training_options = dict(
    epochs=EPOCHS,
    validation_split=0.1,
    initial_epoch=0,
    batch_size=32,
)

# `result` holds the object returned by fit()
result = model.fit(train_data.values, train_labels.values, **training_options)

Train on 1354825 samples, validate on 150537 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Evaluate model on test data

In [10]:
# Score the trained model on the held-out test rows; the displayed result is [loss, mae]
model.evaluate(x=test_data.values, y=test_labels.values)



[3.301505398718031, 3.301505398718031]

Prepare data for evaluation step

In [11]:
# Model output for every test row, stored as the 'pred' column of the evaluation frame below
predictions = model.predict(x=test_data.values)

### Create dataset for evaluation

In [12]:
# Original (not one-hot encoded) segment id of every test row, renumbered from 0
test_rows = df.loc['test']
segment = test_rows['seg'].reset_index(drop=True)

In [13]:
# Assemble the evaluation frame column by column. 'seg' is added first so
# the frame takes over its 0..n-1 index; the remaining columns are plain
# arrays and are therefore matched by position.
output_columns = [
    ('seg', segment),
    ('journey', test_input['journey'].values),
    ('speed', test_data.speed.values),
    ('pred', predictions),
    ('label', test_labels.values),
]

output = pds.DataFrame()
for column_name, column_values in output_columns:
    output[column_name] = column_values

output.head()

Unnamed: 0,seg,journey,speed,pred,label
0,1,12,0.199897,50.963905,55.0
1,1,12,0.250902,49.40942,46.0
2,1,12,0.289026,38.02076,37.0
3,1,12,0.329212,37.518921,36.0
4,1,12,0.371458,36.415882,35.0


In [14]:
# Common file stem shared by every file written for this model
model_name = 'ANN_M3_downsampled'

# Architecture is written as YAML text.
# NOTE(review): to_yaml() is reportedly unavailable in newer TensorFlow
# releases -- confirm before upgrading; the format is kept here so existing
# loaders keep working.
model_yaml = model.to_yaml()
yaml_path = model_name + '.yaml'
with open(yaml_path, 'w') as architecture_file:
    architecture_file.write(model_yaml)

# Trained weights are written to an HDF5 file next to it
weights_path = model_name + '.h5'
model.save_weights(weights_path)

In [15]:
# Persist the evaluation frame under the same file stem as the model files
output_path = ''.join([model_name, '.pkl'])
output.to_pickle(output_path)

Done! Get ready for evaluation