# BigQuery-Geotab exploration and experimentation
*Anders Poirel*

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.linear_model import LinearRegression

In [2]:
# Locations of the raw input CSV files, relative to the notebook.
TRAIN_PATH, TEST_PATH = "../data/raw/train.csv", "../data/raw/test.csv"
# Output location; left empty here.
OUTPUT_PATH = ""

# Data Exploration

In [3]:
# Read the training CSV into a DataFrame.
train = pd.read_csv(filepath_or_buffer=TRAIN_PATH)

In [4]:
# Read the test CSV into a DataFrame.
test = pd.read_csv(filepath_or_buffer=TEST_PATH)

Take a first look at the data

In [31]:
# Preview the first ten training rows.
train.head(n=10)

Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,...,TimeFromFirstStop_p40,TimeFromFirstStop_p50,TimeFromFirstStop_p60,TimeFromFirstStop_p80,DistanceToFirstStop_p20,DistanceToFirstStop_p40,DistanceToFirstStop_p50,DistanceToFirstStop_p60,DistanceToFirstStop_p80,City
0,1920335,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
1,1920336,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
2,1920337,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
3,1920338,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
4,1920339,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,2,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
5,1920340,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,2,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
6,1920341,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,3,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
7,1920342,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,3,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
8,1920343,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,4,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
9,1920344,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,4,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta


In [None]:
# Column labels of the training frame, shown as an array.
train.columns.values

In [None]:
# Column labels of the test frame, for comparison with the training columns.
test.columns.values

In [None]:
# Distinct intersection identifiers present in the training data.
train.IntersectionId.unique()

In [None]:
# Distinct cities present in the training data.
train.City.unique()

In [None]:
# Distinct values of the `Path` column in the training data.
train.Path.unique()

## Pre-processing

We need to pre-process features in a way which will be understood by TensorFlow.
i.e. `feature_columns` in our model declaration should be an iterable containing all the feature columns used by the model

In [32]:
# Feature matrix: positional columns 1..11 of `train`.
# Take an explicit copy so that later edits to X_train never touch (or warn
# about touching) a slice of `train`.
X_train = train.iloc[:, 1:12].copy()

In [36]:
# Preview the first ten rows of the feature matrix.
X_train.head(n=10)

Unnamed: 0,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,Month,Path
0,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,0,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
1,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,0,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...
2,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,1,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
3,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,1,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...
4,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,2,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
5,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,2,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...
6,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,3,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
7,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,3,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...
8,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,4,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
9,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,4,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...


We fill in the missing values

In [12]:
# Replace missing street names with explicit placeholder labels.
# The frame is reassigned rather than calling `fillna(..., inplace=True)` on a
# selected column: that chained form acts on an intermediate object and is not
# guaranteed to update X_train (it is a no-op under pandas copy-on-write).
X_train = X_train.fillna({'EntryStreetName': 'Entry NA',
                          'ExitStreetName': 'Exit NA'})

In [14]:
# Per-column count of missing values remaining in the feature matrix.
X_train.isna().sum()

IntersectionId     0
Latitude           0
Longitude          0
EntryStreetName    0
ExitStreetName     0
EntryHeading       0
ExitHeading        0
Hour               0
Weekend            0
Month              0
Path               0
dtype: int64

In [15]:
# Feature names grouped by the kind of feature column built for them.
NUMERIC_COLUMNS = ['Latitude', 'Longitude']
CATEGORICAL_COLUMNS = [
    'EntryStreetName',
    'ExitStreetName',
    'EntryHeading',
    'ExitHeading',
    'Hour',
    'Month',
    'Path',
    'Weekend',
]

We define the feature columns for use with the TensorFlow `estimator` API

In [24]:
def one_hot_cat_column(feature_name, vocab):
    """Return an indicator (one-hot) feature column for a categorical feature.

    `feature_name` is the feature's key and `vocab` lists the category values
    used as the column's vocabulary.
    """
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(
        feature_name, vocab)
    one_hot = tf.feature_column.indicator_column(categorical)
    return one_hot

# One indicator column per categorical feature, using the distinct values seen
# in X_train as its vocabulary, followed by the float32 numeric columns.
feature_columns = [
    one_hot_cat_column(name, X_train[name].unique())
    for name in CATEGORICAL_COLUMNS
]
feature_columns.extend(
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
)

In [17]:
# Regression targets: the three `TotalTimeStopped` percentile columns.
y_train_1, y_train_2, y_train_3 = (
    train[target]
    for target in ('TotalTimeStopped_p20',
                   'TotalTimeStopped_p50',
                   'TotalTimeStopped_p80')
)

We build a dataloader for use by tensorflow:

In [18]:
def make_input(X, y, n_epochs = None, shuffle = False, batch_size = None):
    """Build an input function that feeds (features, labels) to an estimator.

    Args:
        X: mapping of feature name -> values (e.g. a DataFrame); it is turned
            into a dict before being sliced into examples.
        y: labels, one per example.
        n_epochs: number of passes over the data; None repeats indefinitely.
        shuffle: whether to shuffle the examples, using a buffer that spans
            the whole dataset.
        batch_size: number of examples per batch. None (the default) keeps
            the previous behaviour of a single batch holding every example.
            Pass a smaller value when the full batch does not fit in memory.

    Returns:
        A zero-argument function returning the configured dataset.

    Raises:
        ValueError: if `batch_size` is given and is not positive.
    """
    n_rows = len(y)
    if batch_size is None:
        batch_size = n_rows
    elif batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    def input_f():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(n_rows)
        dataset = dataset.repeat(n_epochs)
        dataset = dataset.batch(batch_size)
        return dataset
    return input_f

# Initial model

In [19]:
import tensorflow as tf
import hyperopt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.estimator import BoostedTreesRegressor

We can train a basic linear regression model in tensorflow:

In [27]:
# Linear regression estimator over the feature columns defined earlier.
model_linear = tf.estimator.LinearRegressor(feature_columns=feature_columns)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Anders\\AppData\\Local\\Temp\\tmpdgepkkwv', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001CD9DBD4A88>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [28]:
# Input function for the first target (TotalTimeStopped_p20).
input_fn = make_input(X=X_train, y=y_train_1)

In [29]:
# Train the linear model, stopping after at most 10 steps.
model_linear.train(input_fn=input_fn, max_steps=10)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
Use `tf.cast` instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:D

ResourceExhaustedError: OOM when allocating tensor with shape[857409,1,15111] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[node linear/linear_model/linear/linear_model/linear/linear_model/Path_indicator/one_hot (defined at C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\framework\ops.py:1751) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Original stack trace for 'linear/linear_model/linear/linear_model/linear/linear_model/Path_indicator/one_hot':
  File "C:\Users\Anders\Miniconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Anders\Miniconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\Anders\Miniconda3\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
    app.start()
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel\kernelapp.py", line 563, in start
    self.io_loop.start()
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\platform\asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\Anders\Miniconda3\lib\asyncio\base_events.py", line 534, in run_forever
    self._run_once()
  File "C:\Users\Anders\Miniconda3\lib\asyncio\base_events.py", line 1771, in _run_once
    handle._run()
  File "C:\Users\Anders\Miniconda3\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\gen.py", line 787, in inner
    self.run()
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel\kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel\kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel\ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2855, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2881, in _run_cell
    return runner(coro)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3058, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3249, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "C:\Users\Anders\Miniconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-29-d3a66fd3aab5>", line 1, in <module>
    model_linear.train(input_fn, max_steps = 10)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 370, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1160, in _train_model
    return self._train_model_default(input_fn, hooks, saving_listeners)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1190, in _train_model_default
    features, labels, ModeKeys.TRAIN, self.config)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\estimator.py", line 1148, in _call_model_fn
    model_fn_results = self._model_fn(features=features, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py", line 1347, in _model_fn
    sparse_combiner=sparse_combiner)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py", line 674, in _linear_model_fn_v2
    features=features)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_estimator\python\estimator\canned\linear.py", line 607, in _linear_model_fn_builder_v2
    logits = linear_model(features)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py", line 847, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\autograph\impl\api.py", line 234, in wrapper
    return converted_call(f, options, args, kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\autograph\impl\api.py", line 439, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\autograph\impl\api.py", line 330, in _call_unconverted
    return f(*args, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 706, in call
    return self.layer(features)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\keras\engine\base_layer.py", line 847, in __call__
    outputs = call_fn(cast_inputs, *args, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\autograph\impl\api.py", line 234, in wrapper
    return converted_call(f, options, args, kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\autograph\impl\api.py", line 439, in converted_call
    return _call_unconverted(f, args, kwargs, options)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\autograph\impl\api.py", line 330, in _call_unconverted
    return f(*args, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 540, in call
    weight_var=weight_var)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 2411, in _create_weighted_sum
    weight_var=weight_var)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 2417, in _create_dense_column_weighted_sum
    tensor = column.get_dense_tensor(transformation_cache, state_manager)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 4360, in get_dense_tensor
    return transformation_cache.get(self, state_manager)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 2608, in get
    transformed = column.transform_feature(self, state_manager)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 4300, in transform_feature
    return self._transform_id_weight_pair(id_weight_pair)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\feature_column\feature_column_v2.py", line 4278, in _transform_id_weight_pair
    off_value=0.0)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\ops\array_ops.py", line 3514, in one_hot
    name)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\ops\gen_array_ops.py", line 6137, in one_hot
    off_value=off_value, axis=axis, name=name)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\framework\op_def_library.py", line 793, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3360, in create_op
    attrs, op_def, compute_device)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\framework\ops.py", line 3429, in _create_op_internal
    op_def=op_def)
  File "C:\Users\Anders\Miniconda3\lib\site-packages\tensorflow_core\python\framework\ops.py", line 1751, in __init__
    self._traceback = tf_stack.extract_stack()


The initial one-hot encoding results in an out-of-memory error (the `Path` feature alone expands to 15,111 one-hot columns), so we need some other way of encoding our features to make the problem manageable on my laptop's 16 GB of memory.

### What to do for HyperOpt
Describe
- objective to minimize
- space over which to search
- database to store point evaluations
- search algorithm to use

#### Objective function

We now need to define the objective for `hyperopt` to optimize.
We only build one objective function, but we will be optimizing 3 models overall: one each for the 20th, 50th and 80th percentiles of waiting times.

In [None]:
def rmse_cv_score(X, y, n_folds, kwargs, feature_columns, max_steps = 1):
    """Cross-validated RMSE of a boosted-trees regressor.

    Args:
        X: pandas DataFrame of features.
        y: pandas Series of targets, aligned row-for-row with X.
        n_folds: number of cross-validation folds.
        kwargs: dict of extra keyword arguments forwarded to
            BoostedTreesRegressor (None is treated as empty).
        feature_columns: feature columns passed to the regressor.
        max_steps: training step limit for each fold (defaults to 1).

    Returns:
        Mean over the folds of the validation RMSE, computed as the square
        root of the `average_loss` metric returned by `evaluate` (assumed to
        be the mean squared error, as for the estimator's default loss).
    """
    scores = []

    for train_index, val_index in KFold(n_splits = n_folds).split(X):
        # KFold yields positional row indices, so rows must be taken with
        # .iloc; X[train_index] would look the integers up as column labels.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        train_input = make_input(X_train, y_train)
        # A single pass over the validation fold; the default (repeat
        # forever) would keep evaluate() from ever finishing.
        val_input = make_input(X_val, y_val, n_epochs = 1)

        # A fresh model per fold so nothing learned on one fold leaks into
        # the next fold's validation score.
        model = BoostedTreesRegressor(feature_columns, n_batches_per_layer = 1,
                                      **(kwargs or {}))
        model.train(train_input, max_steps = max_steps)
        metrics = model.evaluate(val_input)
        scores.append(np.sqrt(metrics['average_loss']))

    return np.mean(scores)

## Cross - Validation test

In [None]:
# Small regression problem for trying out the CV loop: predict `sepal_width`
# from the remaining iris columns, with `species` one-hot encoded.
iris = sns.load_dataset('iris')
# Name the column to encode via `columns=`; a list passed positionally is taken
# as the `prefix` argument instead.
iris_r = pd.get_dummies(iris, columns = ['species'])
X = iris_r.drop(columns = 'sepal_width')
y = iris['sepal_width']

In [None]:
# k-fold CV template when not using a scikit-learn estimator
# (if we had access to a scikit-learn estimator we could directly use cross_validate)

scores = []
model = LinearRegression()
for train_index, val_index in KFold(n_splits = 5).split(X, y):
    # KFold yields positional row indices, so rows are selected with .iloc;
    # X[train_index] on a DataFrame would be treated as a column lookup.
    # Fold-local names keep the notebook-level X_train from being overwritten.
    X_fold_train, X_fold_val = X.iloc[train_index], X.iloc[val_index]
    y_fold_train, y_fold_val = y.iloc[train_index], y.iloc[val_index]

    model.fit(X_fold_train, y_fold_train)
    y_pred = model.predict(X_fold_val)

    # Per-fold RMSE on the held-out rows.
    scores.append(np.sqrt(mean_squared_error(y_fold_val, y_pred)))
                                                        