In [1]:
# Setup of libraries, mounting the Google Drive etc.

import os
import sys
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir('/content/gdrive/My Drive/Colab Notebooks')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

import tensorflow as tf

import utils

print('The current working directory is:', os.getcwd())

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
The current working directory is: /content/gdrive/My Drive/Colab Notebooks


## Read the dataset from `.pkl` file


In [2]:
df = pd.read_pickle('datasets/cabspotting.pkl')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,occupied
user,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abboip,2008-05-17 14:12:10,37.75153,-122.39447,0
abboip,2008-05-17 14:13:34,37.75149,-122.39447,0
abboip,2008-05-17 14:14:34,37.75149,-122.39447,0
abboip,2008-05-17 14:15:35,37.75149,-122.39446,0
abboip,2008-05-17 14:41:43,37.75144,-122.39449,0


## Convert the dataset to a mapping of users to the strings of their movements

In [3]:
df = utils.records_to_user_geolcation(df)
df.head()

user
abboip      [[15126.000000000002, -122.39447, 37.751529999...
abcoij      [[47486.0, -122.41466000000001, 37.80346], [47...
abdremlu    [[11949.0, -122.39093000000001, 37.75521], [12...
abgibo      [[16.0, -122.4374, 37.7733], [61.0000000000000...
abjoolaw    [[13856.0, -122.39747, 37.75159], [13916.0, -1...
dtype: object

## Split the data into training and test/validation sets
This has to be done by user in order for the training data to be truly unknown to both the generator and the discriminator.

In [4]:
len(df)

536

In [5]:
import sklearn.model_selection

df_train, df_test = sklearn.model_selection.train_test_split(df, train_size=0.8)

print('Train:', len(df_train))
print('Test:', len(df_test))

Train: 428
Test: 108




## Prepare the training data for the generator
The generator gets any one randomly selected point from the training data as the starting point for the trajectory it generates during training. This may later be replaced by a point on a 3-dimensional distribution fitted to the distribution of the real data.

In [7]:
# Make the single sequence of the training data
data_train = utils.user_geolocation_to_single_sequence(df_train)

print(data_train.shape)
print()
print(data_train)

(9000109, 3)

[[ 7.9000000e+01 -1.2243825e+02  3.7777420e+01]
 [ 1.4600000e+02 -1.2243874e+02  3.7779740e+01]
 [ 1.9900000e+02 -1.2244096e+02  3.7785140e+01]
 ...
 [ 2.0187740e+06 -1.2240682e+02  3.7785480e+01]
 [ 2.0188350e+06 -1.2240654e+02  3.7785650e+01]
 [ 2.0188950e+06 -1.2240344e+02  3.7783600e+01]]


## Create the scaler on the training data
The sequence for the generator is used to fit a `StandardScaler` to the training data.
This scaler should later be saved alongside the trained model, as the scaler should only be fitted to the training data and the training data changes on every execution of this Notebook.

In [8]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
scaler.fit(data_train)

print('Means: %f %f %f' % (scaler.mean_[0], scaler.mean_[1], scaler.mean_[2]))
print('Standard deviations: %f %f %f' % (scaler.var_[0], scaler.var_[1], scaler.var_[2]))

Means: 1019312.130780 -122.412544 37.763994
Standard deviations: 345493530496.693298 0.001275 0.002786


## Build the generator

In [0]:
def build_generator():
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(256, return_sequences=True, stateful=True, input_shape=(None, 3), batch_size=1),
        tf.keras.layers.TimeDistributed(tf.layers.Dense(3))
    ])
    return model

In [13]:
generator = build_generator()
generator.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (1, None, 256)            199680    
_________________________________________________________________
time_distributed_1 (TimeDist (1, None, 3)              771       
Total params: 200,451
Trainable params: 200,451
Non-trainable params: 0
_________________________________________________________________


## Build the discriminator

In [0]:
def build_discriminator():
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(128, input_shape=(None, 3)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    return model

In [17]:
discriminator = build_discriminator()
discriminator.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_3 (GRU)                  (None, 128)               50688     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 50,817
Trainable params: 50,817
Non-trainable params: 0
_________________________________________________________________


## Put the adversarial model together

In [0]:
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [0]:
discriminator.trainable = False

adversarial = tf.keras.Sequential([
    generator,
    discriminator
])

adversarial.compile(optimizer='adam', loss='binary_crossentropy')

In [20]:
adversarial.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_1 (Sequential)    (1, None, 3)              200451    
_________________________________________________________________
sequential_2 (Sequential)    (None, 1)                 50817     
Total params: 251,268
Trainable params: 200,451
Non-trainable params: 50,817
_________________________________________________________________


## Training

## Save and plot the history
The history holds information on loss and validiation loss during training.

## Load and prepare the trained model for generating data

## Generate geolocation data of one entity using the trained model
In the end the generated data is saved to disk in order to be reused from a different notebook.

In [0]:
N_GENERATE = 1000

start = [324.0, -122.390085, 37.789910]    # Google Offices in San Francisco: 37.789910, -122.390085
generated = np.array([start]).astype(np.float32)

model.reset_states()

for i in range(1, N_GENERATE):
    input = generated[-1:]
    input = scaler.transform(input)
    input = np.array([input])
    
    prediction = model.predict(input, batch_size=1)
    
    prediction = np.squeeze(prediction, axis=0)
    prediction = scaler.inverse_transform(prediction)
    
    generated = np.concatenate([generated, prediction])

In [0]:
generated.shape

In [0]:
generated

In [0]:
with open('generated/citybased_gan/test_00.pkl', 'wb') as file:
    pickle.dump(generated, file)
    print('Generated data saved')