In [1]:
# Setup of libraries, mounting the Google Drive etc.
# Mounts Google Drive in Colab and makes the notebook folder on the Drive the
# working directory, so the relative paths used later ('datasets/...',
# 'generated/...') resolve against it.

import os
import sys
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/Colab Notebooks')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf

# NOTE(review): presumably `utils` is a local module stored in the notebook
# folder on the Drive, which would be why it is imported only after os.chdir —
# confirm before moving this import above the mount/chdir lines.
import utils

print('The current working directory is:', os.getcwd())

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
The current working directory is: /content/gdrive/My Drive/Colab Notebooks


## Read the dataset from `.pkl` file


In [2]:
# Load the raw records from the pickled file (path is relative to the working
# directory set in the setup cell) and preview the first rows.
# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
df = pd.read_pickle('datasets/cabspotting.pkl')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,occupied
user,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abboip,2008-05-17 14:12:10,37.75153,-122.39447,0
abboip,2008-05-17 14:13:34,37.75149,-122.39447,0
abboip,2008-05-17 14:14:34,37.75149,-122.39447,0
abboip,2008-05-17 14:15:35,37.75149,-122.39446,0
abboip,2008-05-17 14:41:43,37.75144,-122.39449,0


## Convert the dataset to a mapping of users to the strings of their movements

In [3]:
# Convert the raw records with the project helper and preview the result.
# NOTE(review): `df` is rebound to the converted result, so the raw frame is no
# longer reachable under this name. Re-running this cell on its own would pass
# the already-converted data back into the helper — re-run the load cell first.
df = utils.records_to_user_geolcation(df)
df.head()

user
abboip      [[15126.000000000002, -122.39447, 37.751529999...
abcoij      [[47486.0, -122.41466000000001, 37.80346], [47...
abdremlu    [[11949.0, -122.39093000000001, 37.75521], [12...
abgibo      [[16.0, -122.4374, 37.7733], [61.0000000000000...
abjoolaw    [[13856.0, -122.39747, 37.75159], [13916.0, -1...
dtype: object

## Split the data into training and test/validation sets
The split has to be done per user so that the test data is truly unknown to both the generator and the discriminator.

In [5]:
# Number of entries in the converted data (presumably one per user — confirm
# against utils.records_to_user_geolcation).
len(df)

536

In [9]:
import sklearn.model_selection

# Fixed seed so the split (and everything derived from it, such as the training
# sequence built in the next section) is identical on every run. Without it,
# train_test_split draws a new random split on each execution.
SPLIT_SEED = 42

# Split the entries of `df` so that 80 % of them end up in the training set and
# the remainder in the test set.
df_train, df_test = sklearn.model_selection.train_test_split(
    df, train_size=0.8, random_state=SPLIT_SEED
)

print('Train:', len(df_train))
print('Test:', len(df_test))

Train: 428
Test: 108




## Prepare the training data for the generator
The generator gets one sequence of all training trajectories concatenated, in the same way the `citybased_rnn` does. The only difference from the `citybased_rnn` is that no validation data is set aside for validation during training.

In [10]:
# Make the single sequence of the training data
# The project helper turns the training split into one array; its shape and
# contents are printed below as a sanity check.
data = utils.user_geolocation_to_single_sequence(df_train)

print(data.shape)
print()
print(data)

(9001949, 3)

[[ 7.9940000e+03 -1.2239491e+02  3.7751930e+01]
 [ 8.0550000e+03 -1.2239512e+02  3.7751340e+01]
 [ 8.1150000e+03 -1.2239527e+02  3.7749900e+01]
 ...
 [ 9.3709900e+05 -1.2239495e+02  3.7751830e+01]
 [ 9.3716500e+05 -1.2239496e+02  3.7751830e+01]
 [ 9.3721900e+05 -1.2239499e+02  3.7751830e+01]]


## Build the generator

In [0]:
def build_generator():
    """Build the (uncompiled) generator network.

    The model is a single stateful GRU layer with 1024 units that returns an
    output for every time step, followed by a dense projection to 3 values that
    is applied to each time step independently. Inputs are sequences of
    arbitrary length with 3 features per step.

    The batch size is fixed to 1: a stateful recurrent layer carries its state
    from one batch to the next and therefore needs a known, constant batch size.

    Returns:
        A `tf.keras.Sequential` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(1024, return_sequences=True, stateful=True, input_shape=(None, 3), batch_size=1),
        # Use the Keras Dense layer here, not the legacy `tf.layers.Dense`:
        # it keeps all layers on the same API and `tf.layers` is not available
        # in TensorFlow 2.x.
        tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(3))
    ])
    return model

In [0]:
# Instantiate the generator and print its layer/parameter overview.
generator = build_generator()
generator.summary()

## Build the discriminator

In [0]:
def build_discriminator():
    """Build the (uncompiled) discriminator network.

    The model is a GRU layer with 1024 units that consumes a sequence of
    arbitrary length with 3 features per step and returns only its final
    output, followed by a single sigmoid unit, so the model emits one value in
    (0, 1) per input sequence.

    Returns:
        A `tf.keras.Sequential` model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(1024, input_shape=(None, 3)),
        # `Dense` lives in `tf.keras.layers`; `tf.keras.Dense` does not exist
        # and raised an AttributeError when this function was called.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

In [0]:
# Instantiate the discriminator and print its layer/parameter overview.
discriminator = build_discriminator()
discriminator.summary()

## Put the adversarial model together

In [0]:
# Compile the discriminator on its own: Adam optimizer, binary cross-entropy
# loss, and accuracy reported as an additional metric.
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
# Mark the discriminator as non-trainable before it is stacked behind the
# generator and the combined model is compiled.
# NOTE(review): presumably the intent is that training `adversarial` updates
# only the generator, while the discriminator compiled in the previous cell
# remains trainable on its own. That relies on Keras capturing `trainable` at
# compile time — confirm for the TensorFlow version in use.
discriminator.trainable = False

# Combined model: the generator's output is fed directly into the discriminator.
adversarial = tf.keras.Sequential([
    generator,
    discriminator
])

adversarial.compile(optimizer='adam', loss='binary_crossentropy')

## Training

## Save and plot the history
The history holds information on loss and validation loss during training.

## Load and prepare the trained model for generating data

## Generate geolocation data of one entity using the trained model
In the end the generated data is saved to disk in order to be reused from a different notebook.

In [0]:
N_GENERATE = 1000

# NOTE(review): `model` and `scaler` are not defined in any cell visible in this
# notebook (the "Load and prepare the trained model" section is empty). They
# must be created before this cell can run on a fresh kernel.

# Seed row for the generation; the remaining N_GENERATE - 1 rows are predicted.
start = [324.0, -122.390085, 37.789910]    # Google Offices in San Francisco: 37.789910, -122.390085

# Collect the rows in a list and concatenate once at the end, instead of
# re-allocating the whole array with np.concatenate on every iteration.
rows = [np.array([start]).astype(np.float32)]

# Start from a clean recurrent state.
model.reset_states()

for _ in range(1, N_GENERATE):
    # Feed only the most recent row back into the model. The name `step_input`
    # avoids shadowing the builtin `input`.
    step_input = scaler.transform(rows[-1])
    step_input = np.array([step_input])    # add the leading batch axis

    prediction = model.predict(step_input, batch_size=1)

    # Drop the batch axis again and map the prediction back to the unscaled
    # value range before storing it.
    prediction = np.squeeze(prediction, axis=0)
    prediction = scaler.inverse_transform(prediction)

    rows.append(prediction)

generated = np.concatenate(rows)

In [0]:
# Shape of the generated array (expected to be (N_GENERATE, 3) if every
# prediction step yields exactly one row — confirm on a real run).
generated.shape

In [0]:
# Display the generated array for a visual sanity check.
generated

In [0]:
# Persist the generated array so it can be loaded from another notebook.
output_path = 'generated/citybased_gan/test_00.pkl'

# Create the target folder if it is missing; otherwise open() raises
# FileNotFoundError and the freshly generated data would have to be recomputed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as file:
    pickle.dump(generated, file)

# Report success only after the file has been closed (and thus flushed).
print('Generated data saved')