# BigQuery-Geotab exploaration and experimentation
*Anders Poirel*

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
TRAIN_PATH = '../data/raw/train.csv'
TEST_PATH = '../data/raw/test.csv'
OUTPUT_PATH = ''

# Data Exploration

In [3]:
train = pd.read_csv(TRAIN_PATH)

In [8]:
test = pd.read_csv(TEST_PATH)

Take a first look at the data

In [4]:
train.head(10)

Unnamed: 0,RowId,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,...,TimeFromFirstStop_p40,TimeFromFirstStop_p50,TimeFromFirstStop_p60,TimeFromFirstStop_p80,DistanceToFirstStop_p20,DistanceToFirstStop_p40,DistanceToFirstStop_p50,DistanceToFirstStop_p60,DistanceToFirstStop_p80,City
0,1920335,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
1,1920336,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,0,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
2,1920337,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
3,1920338,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,1,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
4,1920339,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,2,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
5,1920340,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,2,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
6,1920341,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,3,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
7,1920342,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,3,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
8,1920343,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,4,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta
9,1920344,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,4,0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,Atlanta


In [5]:
train.columns.values

array(['RowId', 'IntersectionId', 'Latitude', 'Longitude',
       'EntryStreetName', 'ExitStreetName', 'EntryHeading', 'ExitHeading',
       'Hour', 'Weekend', 'Month', 'Path', 'TotalTimeStopped_p20',
       'TotalTimeStopped_p40', 'TotalTimeStopped_p50',
       'TotalTimeStopped_p60', 'TotalTimeStopped_p80',
       'TimeFromFirstStop_p20', 'TimeFromFirstStop_p40',
       'TimeFromFirstStop_p50', 'TimeFromFirstStop_p60',
       'TimeFromFirstStop_p80', 'DistanceToFirstStop_p20',
       'DistanceToFirstStop_p40', 'DistanceToFirstStop_p50',
       'DistanceToFirstStop_p60', 'DistanceToFirstStop_p80', 'City'],
      dtype=object)

In [9]:
test.columns.values

array(['RowId', 'IntersectionId', 'Latitude', 'Longitude',
       'EntryStreetName', 'ExitStreetName', 'EntryHeading', 'ExitHeading',
       'Hour', 'Weekend', 'Month', 'Path', 'City'], dtype=object)

In [9]:
train['IntersectionId'].unique()

array([   0,    1,    2, ..., 1951, 1953, 1974], dtype=int64)

In [10]:
train['City'].unique()

array(['Atlanta', 'Boston', 'Chicago', 'Philadelphia'], dtype=object)

In [11]:
train['Path'].unique()

array(['Marietta Boulevard Northwest_NW_Marietta Boulevard Northwest_NW',
       'Marietta Boulevard Northwest_SE_Marietta Boulevard Northwest_SE',
       'Unknown_NE_Marietta Boulevard Northwest_SE', ...,
       'Crescent Drive_N_League Island Boulevard_W',
       'South 4th Street_S_Catharine Street_W',
       'Catharine Street_W_South 4th Street_S'], dtype=object)

## Pre-processing

## Some time-related features

In [14]:
X_train =  train.iloc[:, 1:12]

In [33]:
X_train.iloc[1000:1040, :]

Unnamed: 0,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,Month,Path
1000,1,33.75094,-84.39303,Peachtree Street Southwest,Peachtree Street Southwest,NE,NE,15,1,12,Peachtree Street Southwest_NE_Peachtree Street...
1001,1,33.75094,-84.39303,Peachtree Street Southwest,Mitchell Street Southwest,SW,SE,15,1,12,Peachtree Street Southwest_SW_Mitchell Street ...
1002,1,33.75094,-84.39303,Peachtree Street Southwest,Mitchell Street Southwest,NE,SE,16,1,12,Peachtree Street Southwest_NE_Mitchell Street ...
1003,1,33.75094,-84.39303,Mitchell Street Southwest,Mitchell Street Southwest,SE,SE,17,1,12,Mitchell Street Southwest_SE_Mitchell Street S...
1004,1,33.75094,-84.39303,Peachtree Street Southwest,Peachtree Street Southwest,SW,SW,17,1,12,Peachtree Street Southwest_SW_Peachtree Street...
1005,1,33.75094,-84.39303,Peachtree Street Southwest,Peachtree Street Southwest,NE,NE,18,1,12,Peachtree Street Southwest_NE_Peachtree Street...
1006,1,33.75094,-84.39303,Peachtree Street Southwest,Peachtree Street Southwest,SW,SW,20,1,12,Peachtree Street Southwest_SW_Peachtree Street...
1007,1,33.75094,-84.39303,Mitchell Street Southwest,Mitchell Street Southwest,SE,SE,21,1,12,Mitchell Street Southwest_SE_Mitchell Street S...
1008,1,33.75094,-84.39303,Mitchell Street Southwest,Mitchell Street Southwest,SE,SE,23,1,12,Mitchell Street Southwest_SE_Mitchell Street S...
1009,1,33.75094,-84.39303,Peachtree Street Southwest,Mitchell Street Southwest,SW,SE,23,1,12,Peachtree Street Southwest_SW_Mitchell Street ...


In [34]:
X_train.fillna('None')

Unnamed: 0,IntersectionId,Latitude,Longitude,EntryStreetName,ExitStreetName,EntryHeading,ExitHeading,Hour,Weekend,Month,Path
0,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,0,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
1,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,0,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...
2,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,1,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
3,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,SE,SE,1,0,6,Marietta Boulevard Northwest_SE_Marietta Boule...
4,0,33.79166,-84.43003,Marietta Boulevard Northwest,Marietta Boulevard Northwest,NW,NW,2,0,6,Marietta Boulevard Northwest_NW_Marietta Boule...
...,...,...,...,...,...,...,...,...,...,...,...
857404,1990,39.93823,-75.14993,South 4th Street,South 4th Street,S,S,19,1,12,South 4th Street_S_South 4th Street_S
857405,1990,39.93823,-75.14993,Catharine Street,Catharine Street,W,W,20,1,12,Catharine Street_W_Catharine Street_W
857406,1990,39.93823,-75.14993,South 4th Street,Catharine Street,S,W,20,1,12,South 4th Street_S_Catharine Street_W
857407,1990,39.93823,-75.14993,South 4th Street,South 4th Street,S,S,20,1,12,South 4th Street_S_South 4th Street_S


In [None]:
has_nans = ['EntryStreetName', 'ExitStreetName']

In [36]:
X_train_r = pd.get_dummies(X_train, 
                           columns = ['EntryStreetName', 'ExitStreetName', 'EntryHeading',
                                      'ExitHeading', 'Hour', 'Month', 'Path'],
                           drop_first = True)

In [50]:
y_train_1 = train['TotalTimeStopped_p20']
y_train_2 = train['TotalTimeStopped_p50']
y_train_3 = train['TotalTimeStopped_p80']

In [38]:
X_train_r.shape

(857409, 18557)

In [42]:
len(X_train)

857409

We build a dataloader for use by tensorflow:

In [55]:
def make_input(X, y, n_epochs = None, shuffle = True):
    def input_f():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y)) 
        if shuffle:
            dataset = dataset.shuffle(len(y))
        dataset = dataset.repeat(n_epochs)
        dataset = dataset.batch(len(y))
    
        return dataset
    return input_f

In [45]:
feature_columns = X_train_r.columns.values

In [59]:
train_input_1 = make_input(X_train_r, y_train_1)
train_input_2 = make_input(X_train_r, y_train_2)
train_input_3 = make_input(X_train_r, y_train_2)

# Initial model

In [61]:
import tensorflow as tf
import hyperopt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.estimator import BoostedTreesRegressor

We can train a basic boosted trees model in tensorflow:

In [None]:
model_1 = BoostedTreeRegressor()
model_1.train(train_input_fn, max_steps = 100)

### What to do for HyperOpt
Describe
- objective to minimize
- space over which to search
- database to store point evaluations
- search algorithm to use