# Extracting external features from Google Maps

# Import Libraries

In [1]:
import os
import pandas as pd
import datetime as dt
import numpy as np
import datetime as dt
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics import mean_squared_error

import responses
import googlemaps
import yaml

# Load API key
See https://developers.google.com/maps/get-started for details on getting an API key. The key is read from the `api_key` entry of a local `config.yml` file.

In [2]:
def load_api_key(config_path='config.yml'):
    """Read the Google Maps API key from a YAML config file.

    Parameters
    ----------
    config_path : str, optional
        Path of the YAML file to read. Defaults to ``'config.yml'``.

    Returns
    -------
    object or None
        The value stored under the ``api_key`` entry, or ``None`` when the
        file is not valid YAML or has no ``api_key`` entry (a message is
        printed in both cases).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    with open(config_path, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as e:
            print(f"Error loading YAML file: {e}")
            return None

    # safe_load gives None for an empty file and may give a non-mapping;
    # both used to surface as an opaque TypeError/KeyError on the lookup.
    if not isinstance(config, dict) or 'api_key' not in config:
        print(f"No 'api_key' entry found in {config_path}")
        return None
    return config['api_key']


# Build the Google Maps client that the API helper functions rely on.
api_key = load_api_key()
if api_key is None:
    # Fail here with a clear message instead of handing None to the client.
    raise ValueError("Google Maps API key could not be loaded from config.yml")
client = googlemaps.Client(api_key)

Load the training and test datasets.

In [3]:
# Read the train and test CSV files into DataFrames.
df_train, df_test = map(pd.read_csv, ('../data/train.csv', '../data/test.csv'))

Separate the features from the labels.

In [4]:
# Every label column in the training file: three metrics, five percentiles each.
full_training_label_cols = [
    'TotalTimeStopped_p20', 'TotalTimeStopped_p40', 'TotalTimeStopped_p50',
    'TotalTimeStopped_p60', 'TotalTimeStopped_p80',
    'TimeFromFirstStop_p20', 'TimeFromFirstStop_p40', 'TimeFromFirstStop_p50',
    'TimeFromFirstStop_p60', 'TimeFromFirstStop_p80',
    'DistanceToFirstStop_p20', 'DistanceToFirstStop_p40', 'DistanceToFirstStop_p50',
    'DistanceToFirstStop_p60', 'DistanceToFirstStop_p80',
]

# Labels actually kept as targets, together with the intersection identifier.
rel_training_label_cols = [
    'IntersectionId',
    'TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80',
    'DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80',
]

# Targets for training.
df_train_y = df_train[rel_training_label_cols]

# Features: the row id is dropped everywhere; the training features
# additionally lose every label column.
df_train_X = df_train.drop(columns=['RowId', *full_training_label_cols])
df_test_X = df_test.drop(columns=['RowId'])

Split the data by city.

In [5]:
cities = ['Atlanta','Boston','Chicago','Philadelphia']

# Per-city frames, keyed by city name.
city_training_datasets_X = {}
city_training_datasets_y = {}
city_testing_datasets_X = {}

for city in cities:
    # Select rows with boolean masks. The previous version fed positional
    # indices from nonzero() to the label-based .loc, which only picks the
    # right rows while the frames carry the default 0..n-1 index.
    train_mask = (df_train_X['City'] == city).values
    test_mask = (df_test_X['City'] == city).values

    # Features lose the now-constant City column; labels are taken from the
    # same training rows.
    city_training_datasets_X[city] = df_train_X.loc[train_mask].drop(columns=['City'])
    city_training_datasets_y[city] = df_train_y.loc[train_mask]
    city_testing_datasets_X[city] = df_test_X.loc[test_mask].drop(columns=['City'])

# Functions to get data from google maps

In [6]:
def get_elev(long_lat_tuples, batch_size=32):
    """Look up the elevation of each point through the Google Maps client.

    Points are sent to ``client.elevation`` in consecutive batches and the
    ``'elevation'`` field of each returned entry is stored in input order.

    Parameters
    ----------
    long_lat_tuples : sequence
        Coordinate pairs, passed to the client unchanged.
        NOTE(review): despite the name, the caller in this notebook builds
        these from the ('Latitude', 'Longitude') columns, i.e. (lat, lng).
    batch_size : int, optional
        Maximum number of points per request (default 32).

    Returns
    -------
    numpy.ndarray
        Float array with one elevation per input point. Empty input returns
        an empty array without contacting the API.
    """
    n_points = len(long_lat_tuples)
    elevs = np.zeros(n_points, dtype=float)

    # A single loop covers full batches and the final partial one; with no
    # points it does not run, so no empty request is sent.
    for start in range(0, n_points, batch_size):
        stop = min(start + batch_size, n_points)
        response = client.elevation(long_lat_tuples[start:stop])
        elevs[start:stop] = [entry['elevation'] for entry in response]

    return elevs

def get_dist(long_lat_tuples, destination, batch_size=16):
    """Get the distance from each point to ``destination`` via the Distance Matrix API.

    Origins are sent to ``client.distance_matrix`` in consecutive batches.
    For every returned row the ``distance.value`` of its first element is
    stored in input order.

    Parameters
    ----------
    long_lat_tuples : sequence
        Origin coordinate pairs, passed to the client unchanged.
        NOTE(review): despite the name, the caller in this notebook builds
        these from the ('Latitude', 'Longitude') columns, i.e. (lat, lng).
    destination : object
        Destination argument forwarded as-is to ``client.distance_matrix``;
        only the first element of each row is read, so a single destination
        is expected.
    batch_size : int, optional
        Maximum number of origins per request (default 16).

    Returns
    -------
    numpy.ndarray
        Float array with one distance per origin (meters, per the API docs).
        An origin whose element carries no ``distance`` field (the API omits
        it when no route was found) gets ``NaN`` instead of raising
        ``KeyError``. Empty input returns an empty array without contacting
        the API.
    """
    n_points = len(long_lat_tuples)
    dist_meters = np.zeros(n_points, dtype=float)

    for start in range(0, n_points, batch_size):
        stop = min(start + batch_size, n_points)
        response = client.distance_matrix(long_lat_tuples[start:stop], destination)

        batch = []
        for row in response['rows']:
            element = row['elements'][0]
            # 'distance' is only present for elements that resolved to a route.
            if 'distance' in element:
                batch.append(element['distance']['value'])
            else:
                batch.append(np.nan)
        dist_meters[start:stop] = batch

    return dist_meters

def get_num_places_50(long_lat_tuples):
    """Count the places returned by a nearby search of radius 50 around each point.

    One ``client.places_nearby`` request is made per point, with a tqdm
    progress bar over the points.

    Parameters
    ----------
    long_lat_tuples : sequence
        Coordinate pairs, each passed unchanged as the ``location`` argument.
        NOTE(review): despite the name, the caller in this notebook builds
        these from the ('Latitude', 'Longitude') columns, i.e. (lat, lng).

    Returns
    -------
    numpy.ndarray
        Float array holding, for each point, the number of entries in the
        response's ``'results'`` list.

    Notes
    -----
    The count is now the length of the results list. The earlier approach
    counted the substring 'geometry' in the stringified response, which
    over-counts whenever any text field of a place contains that word.
    NOTE(review): only the first response page is counted; the Places API
    paginates its results, so large counts are presumably capped — confirm.
    """
    num_places_50 = np.zeros(len(long_lat_tuples), dtype=float)
    for i, place in enumerate(tqdm(long_lat_tuples)):
        response = client.places_nearby(location=place, radius=50)
        num_places_50[i] = len(response['results'])

    return num_places_50

In [7]:
# Reference point used as each city's center: one column per city, holding
# the two coordinates in (latitude, longitude) order.
center_by_city = {
    "Atlanta": [33.753746, -84.386330],
    "Boston": [42.361145, -71.057083],
    "Chicago": [41.881832, -87.623177],
    "Philadelphia": [39.952583, -75.165222],
}
df_center = pd.DataFrame(center_by_city)

Now fetch the Google Maps features for every intersection in each city, using the functions defined above.

In [8]:
def _unique_intersections(city_df):
    """Return one row per IntersectionId with its first Latitude and Longitude."""
    return (
        city_df.groupby('IntersectionId')
        .agg({'Latitude': 'first', 'Longitude': 'first'})
        .reset_index()
    )


def _lat_lng_tuples(grouped):
    """Return the (Latitude, Longitude) pairs of ``grouped`` as a list of tuples."""
    return [tuple(pair) for pair in grouped[['Latitude', 'Longitude']].values]


# For each city: query the features once per unique intersection, then join
# them back onto every row of that city's train and test frames.
for city in cities:
    print(city)

    # This cell overwrites its own inputs, so a second run would fail on the
    # columns renamed by the merge below. Skip cities that were already
    # processed; this also avoids repeating the API calls.
    if ('elevation' in city_training_datasets_X[city].columns
            and 'elevation' in city_testing_datasets_X[city].columns):
        print('Google Maps features already present, skipping')
        continue

    grouped_train = _unique_intersections(city_training_datasets_X[city])
    grouped_test = _unique_intersections(city_testing_datasets_X[city])

    train_long_lat_tuples = _lat_lng_tuples(grouped_train)
    test_long_lat_tuples = _lat_lng_tuples(grouped_test)

    # Single-destination list holding this city's center coordinates.
    city_center = [tuple(df_center[city])]

    print('elevation')
    grouped_train['elevation'] = get_elev(train_long_lat_tuples)
    grouped_test['elevation'] = get_elev(test_long_lat_tuples)

    print('distance')
    grouped_train['cent_dist'] = get_dist(train_long_lat_tuples, city_center)
    grouped_test['cent_dist'] = get_dist(test_long_lat_tuples, city_center)

    print('num_places')
    grouped_train['num_places_50'] = get_num_places_50(train_long_lat_tuples)
    grouped_test['num_places_50'] = get_num_places_50(test_long_lat_tuples)

    # NOTE(review): grouped_* still carries Latitude/Longitude, so these
    # merges produce Latitude_x/Latitude_y and Longitude_x/Longitude_y in the
    # result. Left unchanged because consumers of the saved pickles may rely
    # on those names — confirm, then consider merging only the new columns.
    city_training_datasets_X[city] = pd.merge(
        city_training_datasets_X[city], grouped_train, on='IntersectionId', how='left')
    city_testing_datasets_X[city] = pd.merge(
        city_testing_datasets_X[city], grouped_test, on='IntersectionId', how='left')

Atlanta
elevation
distance
num_places


100%|██████████| 377/377 [01:06<00:00,  5.67it/s]
100%|██████████| 468/468 [00:51<00:00,  9.09it/s]


Boston
elevation
distance
num_places


100%|██████████| 975/975 [03:05<00:00,  5.26it/s]
100%|██████████| 1192/1192 [02:56<00:00,  6.76it/s]


Chicago
elevation
distance
num_places


100%|██████████| 2135/2135 [07:04<00:00,  5.03it/s]
100%|██████████| 2571/2571 [07:00<00:00,  6.12it/s]


Philadelphia
elevation
distance
num_places


100%|██████████| 1318/1318 [04:00<00:00,  5.49it/s]
100%|██████████| 1716/1716 [03:47<00:00,  7.55it/s]


Save the data to disk. The resulting pickle files still need to be uploaded to Google Drive.

In [10]:
# Write the per-city training and testing feature dictionaries to disk,
# one pickle file each.
for pickle_path, city_datasets in (
    ('../data/gmaps_training_X.pickle', city_training_datasets_X),
    ('../data/gmaps_testing_X.pickle', city_testing_datasets_X),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(city_datasets, handle, protocol=pickle.HIGHEST_PROTOCOL)