In [1]:
# Setup of libraries, mounting the Google Drive etc.

import os
import sys
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir('gdrive/My Drive/Colab Notebooks')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
tf.enable_eager_execution()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Read the dataset from `.pkl` file


In [2]:
df = pd.read_pickle('cabspotting.pkl')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,occupied
user,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abboip,2008-05-17 14:12:10,37.75153,-122.39447,0
abboip,2008-05-17 14:13:34,37.75149,-122.39447,0
abboip,2008-05-17 14:14:34,37.75149,-122.39447,0
abboip,2008-05-17 14:15:35,37.75149,-122.39446,0
abboip,2008-05-17 14:41:43,37.75144,-122.39449,0


## Convert the dataset to a mapping of users to the strings of their movements

In [6]:
import datetime as dt

df.reset_index(inplace=True)

df['rebased_time'] = df['time'] - df['time'].min()
df['rebased_time'] = df['rebased_time'].dt.total_seconds()

df = df[['user', 'rebased_time', 'longitude', 'latitude']]

def extract_sequence(df):
    df.drop('user', axis=1, inplace=True)
    df.sort_values(by='rebased_time', inplace=True)
    return df.values

df = df.groupby('user').apply(extract_sequence)

df.head()

user
abboip      [[15126.000000000002, -122.39447, 37.751529999...
abcoij      [[47486.0, -122.41466000000001, 37.80346], [47...
abdremlu    [[11949.0, -122.39093000000001, 37.75521], [12...
abgibo      [[16.0, -122.4374, 37.7733], [61.0000000000000...
abjoolaw    [[13856.0, -122.39747, 37.75159], [13916.0, -1...
dtype: object

## Concatenate all user strings of data to one complete string of all

In [7]:
def to_single_sequence(series):
    data = None
    for _, item in series.iteritems():
        if data is None:
            data = item
        else:
            data = np.concatenate((data, item), axis=0)
    
    return data

data = to_single_sequence(df)
del df

print(data.shape)
print()
print(data)

(11219955, 3)

[[ 1.5126000e+04 -1.2239447e+02  3.7751530e+01]
 [ 1.5210000e+04 -1.2239447e+02  3.7751490e+01]
 [ 1.5270000e+04 -1.2239447e+02  3.7751490e+01]
 ...
 [ 2.0172080e+06 -1.2244239e+02  3.7756860e+01]
 [ 2.0172640e+06 -1.2244215e+02  3.7760470e+01]
 [ 2.0173250e+06 -1.2243687e+02  3.7760750e+01]]


## Standardisation
All three features in the data are scaled to have a `mean = 0` and a `standard deviation = 1`.

In [8]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(data)

data = scaler.transform(data)

print('Means: %f %f %f' % (scaler.mean_[0], scaler.mean_[1], scaler.mean_[2]))
print('Standard deviations: %f %f %f' % (scaler.var_[0], scaler.var_[1], scaler.var_[2]))
print()
print(data)

Means: 1017983.546543 -122.412432 37.763601
Standard deviations: 346406383099.499146 0.001280 0.002901

[[-1.70390864  0.50197561 -0.22410877]
 [-1.70376591  0.50197561 -0.22485143]
 [-1.70366397  0.50197561 -0.22485143]
 ...
 [ 1.69773582 -0.83724035 -0.12514886]
 [ 1.69783096 -0.83053309 -0.05812348]
 [ 1.69793461 -0.6829734  -0.05292484]]


## Convert the data to 32-bit integers
This is to correct an error I got earlier. Apparently the GRU layers can't handle 64-bit integers (doubles) as input.

In [11]:
print('Before: ', data.dtype)

Before:  float64


In [0]:
data = data.astype(np.float32)

In [14]:
print('After: ', data.dtype)

After:  float32


## Split the data into a set for training and one for testing
The split happens along the complete string of all concatenated user movement data.

In [15]:
TRAIN_TEST_SPLIT = 0.8

n_train = int(np.floor(TRAIN_TEST_SPLIT * data.shape[0]))

train_data = data[0:n_train]
test_data = data[n_train:]

train_data.shape, test_data.shape

((8975964, 3), (2243991, 3))

## Instantiate TensorFlow Dataset from the data

In [16]:
SEQ_LENGTH = 100
examples_per_epoch = len(train_data) // SEQ_LENGTH

# Create training examples / targets
dataset = tf.data.Dataset.from_tensor_slices(train_data)

for rec in dataset.take(5):
    print(rec.numpy())

[-1.7039087   0.5019756  -0.22410877]
[-1.7037659   0.5019756  -0.22485143]
[-1.703664    0.5019756  -0.22485143]
[-1.7035604   0.5022551  -0.22485143]
[-1.7008963   0.5014167  -0.22577976]


## Split into sequences

In [17]:
sequences = dataset.batch(SEQ_LENGTH + 1, drop_remainder=True)

for seq in sequences.take(5):
    print(seq.numpy()[0], seq.numpy()[1], seq.numpy()[2],' ...')

[-1.7039087   0.5019756  -0.22410877] [-1.7037659   0.5019756  -0.22485143] [-1.703664    0.5019756  -0.22485143]  ...
[-1.6920493  0.661273  -1.6295995] [-1.6919813  0.6120864 -1.3839635] [-1.691898   0.5290841 -1.1390702]  ...
[-1.6829762  -0.03460507 -0.27702355] [-1.6828743  -0.0415918  -0.23710538] [-1.6827741  -0.05277057 -0.20127188]  ...
[-1.6731405  0.4949889  0.2482252] [-1.6730623   0.49331206  0.24673986] [-1.6730607   0.49359155  0.24673986]  ...
[-1.6638908  -0.6522318   0.48513484] [-1.6638585 -0.6519523  0.4816072] [-1.6637549 -0.6418914  0.451715 ]  ...


## Make input and output sequences
... by dropping the first or the last record respectively.

In [0]:
def split_input_target(chunk):
    input_seq = chunk[:-1]
    target_seq = chunk[1:]
    
    return input_seq, target_seq

dataset = sequences.map(split_input_target)

In [19]:
for input_example, target_example in dataset.take(1):
    print('Input data: ', input_example.numpy()[0], input_example.numpy()[1], input_example.numpy()[2],' ...')
    print('Target data:', target_example.numpy()[0], target_example.numpy()[1], target_example.numpy()[2], ' ...')

Input data:  [-1.7039087   0.5019756  -0.22410877] [-1.7037659   0.5019756  -0.22485143] [-1.703664    0.5019756  -0.22485143]  ...
Target data: [-1.7037659   0.5019756  -0.22485143] [-1.703664    0.5019756  -0.22485143] [-1.7035604   0.5022551  -0.22485143]  ...


In [20]:
for i, (input_rec, target_rec) in enumerate(zip(input_example[:5], target_example[:5])):
    print('Step {:4d}'.format(i))
    print('  input: ', input_rec.numpy())
    print('  expected output: ', target_rec.numpy())

Step    0
  input:  [-1.7039087   0.5019756  -0.22410877]
  expected output:  [-1.7037659   0.5019756  -0.22485143]
Step    1
  input:  [-1.7037659   0.5019756  -0.22485143]
  expected output:  [-1.703664    0.5019756  -0.22485143]
Step    2
  input:  [-1.703664    0.5019756  -0.22485143]
  expected output:  [-1.7035604   0.5022551  -0.22485143]
Step    3
  input:  [-1.7035604   0.5022551  -0.22485143]
  expected output:  [-1.7008963   0.5014167  -0.22577976]
Step    4
  input:  [-1.7008963   0.5014167  -0.22577976]
  expected output:  [-1.7007756  0.5002988 -0.2244801]


## Shuffle and split the dataset into batches

In [21]:
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences, 
# so it doesn't attempt to shuffle the entire sequence in memory. Instead, 
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<BatchDataset shapes: ((64, 100, 3), (64, 100, 3)), types: (tf.float32, tf.float32)>

## Definition of how to build the model
The first cell selects the type of GRU layer to use depending on wether the algorithm is run on a CPU or GPU.

In [22]:
if tf.test.is_gpu_available():
    my_gru = tf.keras.layers.CuDNNGRU
    print('Selected GRU layer for GPU')
else:
    import functools
    my_gru = functools.partial(
        tf.keras.layers.GRU, recurrent_activation='sigmoid')
    print('Selected GRU layer for CPU')

Selected GRU layer for GPU


In [0]:
def build_model(internal_units, batch_size):
    model = tf.keras.Sequential([
        my_gru(internal_units, return_sequences=True, recurrent_initializer='glorot_uniform', stateful=True, batch_input_shape=[batch_size, None, 3]),
        tf.keras.layers.Dense(3)
    ])
    return model

## Build the model

In [24]:
INTERNAL_UNITS = 1024

model = build_model(internal_units=INTERNAL_UNITS, batch_size=BATCH_SIZE)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnngru (CuDNNGRU)         (64, None, 1024)          3161088   
_________________________________________________________________
dense (Dense)                (64, None, 3)             3075      
Total params: 3,164,163
Trainable params: 3,164,163
Non-trainable params: 0
_________________________________________________________________


## Test the model's output (before training)

In [25]:
model.input

<DeferredTensor 'cu_dnngru_input' shape=(64, ?, 3) dtype=float32>

In [27]:
for input_example_batch, target_example_batch in dataset.take(1): 
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, n_features)")

(64, 100, 3) # (batch_size, sequence_length, n_features)


In [32]:
scaler.inverse_transform(example_batch_predictions[0].numpy()[:5])

array([[ 1.01066512e+06, -1.22413895e+02,  3.77626343e+01],
       [ 1.01131806e+06, -1.22413834e+02,  3.77628174e+01],
       [ 1.01227206e+06, -1.22413780e+02,  3.77629433e+01],
       [ 1.01372819e+06, -1.22413666e+02,  3.77630997e+01],
       [ 1.01519075e+06, -1.22413635e+02,  3.77631416e+01]], dtype=float32)

In [37]:
print('Loss: ', tf.keras.losses.mean_absolute_error(target_example_batch, example_batch_predictions).numpy().mean())

Loss:  0.6429231


## Training

In [0]:
model.compile(optimizer=tf.train.AdamOptimizer(),
              loss=tf.keras.losses.mean_absolute_error)

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './onegru_training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}_{loss}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix,
                                                         save_weights_only=True)

In [42]:
EPOCHS = 30

history = model.fit(dataset.repeat(),
                    epochs=EPOCHS,
                    steps_per_epoch=steps_per_epoch,
                    val
                    callbacks=[checkpoint_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30

KeyboardInterrupt: ignored