# Extracting Data from Foursquare
06/5/19

* Refactor code from "Exploring Foursquare"

In [1]:
import numpy as np # library to handle data in a vectorized manner
import math   # for ceil function

# from time import sleep        # For Foursquare queries per second limit if any

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# #!conda install -c conda-forge geopy --yes # uncomment this line if needed
# from geopy.geocoders import Nominatim # convert address into latitude and longitude

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
# import matplotlib.cm as cm
# import matplotlib.colors as colors

# import k-means from clustering stage
# from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if needed
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [0]:
# @hidden cell
# Foursquare information

CLIENT_ID = 'CWCPNAVXDH3TI1BGS4VED4ANSUKEFGHBA4511GRPYPKPNJRD' # your Foursquare ID
CLIENT_SECRET = 'ALMFEYIENSPH3RV3TQB1NGWTKTANVJ5QTHAVZ5B1GRJWIP21' # your Foursquare Secret

# Foursquare query information that will be constant during this project
VERSION = '20190427' # Foursquare API version
LIMIT = 120 

# del venues
# del box_venues
# del query_list

box_venues = pd.DataFrame(columns=['name', 'categories', 'postalCode', 'lat', 'lng', 'state'])
query_list = []  # boxes and number of venues in each, for visualization


In [0]:
# define function that extracts the category of the venue
# used in Foursquare section

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [0]:
def split_box(box, purpose="number"):
    MAX_DELTA_LAT = 0.8
    MAX_DELTA_LNG = 0.9
    delta_lat = abs(box[0][0] - box[1][0])
    delta_lng = abs(box[0][1] - box[1][1])
    
    boxes = []
    
    if purpose == "number":  # Splitting box because too many results returned
        lat_divisions = lng_divisions = 2
    elif purpose == "size":  # Splitting because box is too large for Foursquare
        
        lat_divisions = math.ceil(delta_lat / MAX_DELTA_LAT)
        lng_divisions = math.ceil(delta_lng / MAX_DELTA_LNG)
        
    else:  # This is an error.
        raise ValueError("This function only supports 'size' and 'number' box splitting")
    
    for i in range(lat_divisions):
        for j in range(lng_divisions):
            lat_side = delta_lat / lat_divisions
            lng_side = delta_lng / lng_divisions
            swij = (box[0][0] + i * lat_side, box[0][1] + j * lng_side)
            neij = (swij[0] + lat_side, swij[1] + lng_side)
            boxes.append((swij, neij))            
    
    return boxes
        
    

In [0]:
def shrink_box(box, codes):
   
    
    box_results = query_box(box, codes)

    try:
        num_clients, num_columns = box_results.shape

        if num_clients < 100:

            box_venues.append(box_results, ignore_index = True)
            query_list.append([box, num_clients], ignore_index = True)

        else:

            for sub_box in split_box(box, purpose="number"):
                shrink_box(sub_box, codes)

    except:

        query_list.append((box, 0))

    return

In [0]:
def query_box(box, codes):
    
  # Foursquare category ID requires comma-separated text string
  code_string = ",".join(codes)
  
  # define the search URL
  url = f'https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}' + \
      f'&client_secret={CLIENT_SECRET}&v={VERSION}&sw={box[0][0]},{box[0][1]}' + \
      f'&ne={box[1][0]},{box[1][1]}' + \
      f'&categoryId={code_string}&limit={LIMIT}'
  
  # submit the url and capture the returned text
  venues = requests.get(url).json()["response"]['groups'][0]['items']
  
  # if any venues are captured, process and return dataframe. Otherwise, return None. 
  if venues:
  
    venues = json_normalize(venues) # flatten JSON

    # filter columns
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.postalCode', 
                        'venue.location.lat', 'venue.location.lng', 'venue.location.state']
    venues = venues.loc[:, filtered_columns]

    # filter the category for each row
    venues['venue.categories'] = venues.apply(get_category_type, axis=1)

    # clean columns
    venues.columns = [col.split(".")[-1] for col in venues.columns]
  
    return venues
  
  else:
    
    return None  # no venues in box meet criteria    

In [0]:
def GetVenuesByBox(start_box,  # a tuple containing sw & ne corners of box
                   codes):

    
    # Create list of boxes from major box, obtain venues
    
    for box in split_box(start_box, purpose="size"):
        shrink_box(box, codes)
        
    return # box_venues, query_list

In [33]:
box_venues

Unnamed: 0,name,categories,postalCode,lat,lng,state


In [34]:
query_list

[]

In [0]:
fsw = (33.913157, -84.220630)  #The big box
fne = (34.025437, -84.062058)

two_codes =  ['4bf58dd8d48988d142941735', '4bf58dd8d48988d10f941735']



# test_results = query_box((fsw,fne), two_codes)

GetVenuesByBox((fsw,fne), two_codes)

In [36]:
box_venues

Unnamed: 0,name,categories,postalCode,lat,lng,state


In [38]:
query_list

[(((33.913157, -84.22063), (33.969297, -84.141344)), 0),
 (((33.913157, -84.141344), (33.941227, -84.101701)), 0),
 (((33.913157, -84.101701), (33.941227, -84.06205800000001)), 0),
 (((33.941227, -84.141344), (33.955262, -84.1215225)), 0),
 (((33.941227, -84.1215225), (33.955262, -84.10170099999999)), 0),
 (((33.955262, -84.141344), (33.962279499999994, -84.13143325)), 0),
 (((33.955262, -84.13143325), (33.962279499999994, -84.1215225)), 0),
 (((33.962279499999994, -84.141344), (33.969297, -84.13143325)), 0),
 (((33.962279499999994, -84.13143325), (33.969297, -84.1215225)), 0),
 (((33.955262, -84.1215225), (33.969297, -84.10170099999999)), 0),
 (((33.941227, -84.101701), (33.969297, -84.06205800000001)), 0),
 (((33.969297, -84.22063), (34.025437, -84.141344)), 0),
 (((33.969297, -84.141344), (34.025437, -84.06205800000001)), 0)]

In [19]:
split_box([(0,0),(2.5,3.6)], "size")
    

[((0.0, 0.0), (0.625, 0.9)),
 ((0.0, 0.9), (0.625, 1.8)),
 ((0.0, 1.8), (0.625, 2.7)),
 ((0.0, 2.7), (0.625, 3.6)),
 ((0.625, 0.0), (1.25, 0.9)),
 ((0.625, 0.9), (1.25, 1.8)),
 ((0.625, 1.8), (1.25, 2.7)),
 ((0.625, 2.7), (1.25, 3.6)),
 ((1.25, 0.0), (1.875, 0.9)),
 ((1.25, 0.9), (1.875, 1.8)),
 ((1.25, 1.8), (1.875, 2.7)),
 ((1.25, 2.7), (1.875, 3.6)),
 ((1.875, 0.0), (2.5, 0.9)),
 ((1.875, 0.9), (2.5, 1.8)),
 ((1.875, 1.8), (2.5, 2.7)),
 ((1.875, 2.7), (2.5, 3.6))]

In [0]:
venues

Unnamed: 0,name,categories,postalCode,lat,lng,state
