# Extracting Data from Foursquare
06/5/19

* Refactor code from "Exploring Foursquare"

In [1]:
import numpy as np # library to handle data in a vectorized manner
import math   # for ceil function

# from time import sleep        # For Foursquare queries per second limit if any

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# #!conda install -c conda-forge geopy --yes # uncomment this line if needed
# from geopy.geocoders import Nominatim # convert address into latitude and longitude

import requests # library to handle requests
# NOTE(review): this import path is deprecated since pandas 1.0 in favour of
# pandas.json_normalize - confirm the pandas version before switching.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
# import matplotlib.cm as cm
# import matplotlib.colors as colors

# import k-means from clustering stage
# from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if needed
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [0]:
# @hidden cell
# Foursquare information
#
# Credentials are read from the environment rather than being written into the
# notebook, so they are never saved or shared together with the code.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the ID/secret that used to be hard-coded in this cell should be
# treated as leaked and regenerated.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret

if not (CLIENT_ID and CLIENT_SECRET):
    # Fail loudly here instead of with an opaque API error several cells later
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare queries will not authenticate.')

# Foursquare query information that will be constant during this project
VERSION = '20190214' # Foursquare API version
LIMIT = 120          # value sent as the `limit` parameter of every venue query

In [0]:
# define function that extracts the category of the venue
# This function comes from Coursera Applied Data Science Capstone class
# used in Foursquare section

def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide either a 'categories' or a 'venue.categories' entry
        holding a list of category dicts, each with a 'name' key.

    Returns
    -------
    The 'name' of the first category, or None when the category list is
    empty or missing (None / NaN).

    Raises
    ------
    KeyError
        If the row has neither 'categories' nor 'venue.categories'.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be hidden.
        categories_list = row['venue.categories']

    try:
        if len(categories_list) == 0:
            return None
    except TypeError:
        # None / NaN (a venue without category data) has no length
        return None

    return categories_list[0]['name']

In [0]:
def split_box(box, purpose="number"):
    """Break a map "box" into a grid of smaller boxes.

    Parameters
    ----------
    box : sequence of two (lat, lng) pairs
        Opposite corners of the box. They are normally given as (sw, ne) but
        are normalised here, so either order produces the same grid.
    purpose : {"number", "size"}
        "number" - the number of returned results is 100 or more, indicating
                   that the query "maxed out": split into 2 x 2 sub-boxes.
        "size"   - the size of the box may exceed Foursquare's max size: split
                   into as many rows/columns as needed to stay within it
                   (always at least one, so the box is never dropped).

    Returns
    -------
    list of ((sw_lat, sw_lng), (ne_lat, ne_lng)) tuples, ordered by latitude
    band and then by longitude within each band.

    Raises
    ------
    ValueError
        If `purpose` is neither "size" nor "number".
    """
    MAX_DELTA_LAT = 0.8   # Max allowable difference in latitude between corners
    MAX_DELTA_LNG = 0.9   # Max difference in longitude between corners

    # Normalise the corners so the grid always grows from the south-west corner
    sw_lat, ne_lat = sorted((box[0][0], box[1][0]))
    sw_lng, ne_lng = sorted((box[0][1], box[1][1]))
    delta_lat = ne_lat - sw_lat
    delta_lng = ne_lng - sw_lng

    if purpose == "number":  # Splitting box because too many results returned
        lat_divisions = lng_divisions = 2  # Break the box into 4 sub-boxes

    elif purpose == "size":  # Splitting because box is too large for Foursquare
        # max(1, ...) keeps a zero-width/zero-height box as a single box
        # instead of silently returning no boxes at all.
        lat_divisions = max(1, math.ceil(delta_lat / MAX_DELTA_LAT))
        lng_divisions = max(1, math.ceil(delta_lng / MAX_DELTA_LNG))

    else:
        raise ValueError("This function only supports 'size' and 'number' box splitting")

    lat_side = delta_lat / lat_divisions
    lng_side = delta_lng / lng_divisions

    # Compute every grid line once so neighbouring sub-boxes share exactly the
    # same edge value, and pin the last line to the original corner to avoid
    # floating-point drift on the outer boundary.
    lat_edges = [sw_lat + i * lat_side for i in range(lat_divisions)] + [ne_lat]
    lng_edges = [sw_lng + j * lng_side for j in range(lng_divisions)] + [ne_lng]

    # Create sub-boxes based on number of lat & lng divisions
    return [((lat_edges[i], lng_edges[j]), (lat_edges[i + 1], lng_edges[j + 1]))
            for i in range(lat_divisions)
            for j in range(lng_divisions)]

In [0]:
def query_box(box, codes):
    """Query Foursquare's venues/explore endpoint for venues inside `box`.

    Parameters
    ----------
    box : ((sw_lat, sw_lng), (ne_lat, ne_lng))
        South-west and north-east corners of the search rectangle.
    codes : iterable of str
        Foursquare category IDs; they are sent as one comma-separated string.

    Returns
    -------
    pandas.DataFrame or None
        One row per venue with columns name, categories (first category name
        only), postalCode, lat, lng and state; None when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.
    requests.RequestException
        On connection problems or timeout.
    """
    # Let requests assemble and escape the query string
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'sw': f'{box[0][0]},{box[0][1]}',
        'ne': f'{box[1][0]},{box[1][1]}',
        # Foursquare category ID requires comma-separated text string
        'categoryId': ",".join(codes),
        'limit': LIMIT,
    }

    # submit the request; the timeout keeps one stalled call from hanging the crawl
    response = requests.get('https://api.foursquare.com/v2/venues/explore',
                            params=params, timeout=30)
    response.raise_for_status()
    venues = response.json()["response"]['groups'][0]['items']

    # if any venues are captured, process and return dataframe. Otherwise, return None.
    if not venues:
        return None  # no venues in box meet criteria

    venues = json_normalize(venues) # flatten JSON

    # select desired columns; reindex (unlike .loc) fills a column with NaN when
    # no venue in this response carries that field (e.g. no postal codes)
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.postalCode',
                        'venue.location.lat', 'venue.location.lng', 'venue.location.state']
    venues = venues.reindex(columns=filtered_columns)

    # filter the category for each row
    venues['venue.categories'] = venues.apply(get_category_type, axis=1)

    # clean column names
    venues.columns = [col.split(".")[-1] for col in venues.columns]

    return venues

In [0]:
def shrink_box(box, codes, venue_list, query_list, max_depth=25, _depth=0):
    """Collect the venues in `box`, subdividing it while queries come back full.

    The box is queried once. If fewer than 100 venues are returned the results
    are accepted; otherwise the query is assumed to have "maxed out" and the
    box is split into four sub-boxes that are processed recursively.

    Parameters
    ----------
    box : ((sw_lat, sw_lng), (ne_lat, ne_lng))
        Rectangle to query.
    codes : list of str
        Foursquare category IDs passed through to query_box.
    venue_list : list
        Extended in place with one list of values per venue found.
    query_list : list
        Extended in place with a (box, number_of_venues) tuple for every box
        whose results were accepted (0 for boxes without venues).
    max_depth : int, optional
        Maximum number of successive subdivisions. When reached, the (possibly
        truncated) results of the box are accepted instead of recursing
        forever on an area that stays saturated.
    _depth : int
        Internal recursion counter; callers should not pass it.

    Returns
    -------
    None. venue_list and query_list are mutable, no need to return.

    Errors raised by query_box or by a nested call propagate to the caller
    rather than being recorded as an empty box.
    """
    RESULT_CAP = 100  # a query returning this many venues is treated as truncated

    # Get venues for box (if any) in Pandas dataframe
    box_results = query_box(box, codes)

    if box_results is None:  # query returned no results
        query_list.append((box, 0))
        return

    num_clients = box_results.shape[0]

    if num_clients < RESULT_CAP or _depth >= max_depth:
        # Case where results did not "max out" (or the box cannot sensibly be
        # subdivided any further)
        venue_list.extend(box_results.values.tolist())
        query_list.append((box, num_clients))
        return

    # This is the case where num_clients >= 100
    for sub_box in split_box(box, purpose="number"):
        shrink_box(sub_box, codes, venue_list, query_list, max_depth, _depth + 1)

In [0]:
def GetVenuesByBox(start_box,  # a tuple containing sw & ne corners of box
                   codes):     # a list of Foursquare category strings
    """Gather every venue matching `codes` inside `start_box`.

    The start box is first cut into pieces by split_box(purpose="size"); each
    piece is then handed to shrink_box, which fills the two accumulators.

    Returns a tuple (venues, query_list):
      venues     - DataFrame with columns name, categories, postalCode, lat,
                   lng, state (one row per collected venue)
      query_list - the (box, venue_count) entries recorded by shrink_box,
                   for visualization
    """
    collected_rows, box_log = [], []

    # One or more boxes come back; obtain the venues for each of them
    size_limited_boxes = split_box(start_box, purpose="size")
    for piece in size_limited_boxes:
        shrink_box(piece, codes, collected_rows, box_log)

    column_names = ['name', 'categories', 'postalCode', 'lat', 'lng', 'state']
    venue_frame = pd.DataFrame(collected_rows, columns=column_names)
    return venue_frame, box_log

In [0]:
# Corners of the search rectangle, passed below as (sw, ne).
# The rectangle is larger than the state itself: the recorded `state` values
# of the results also include AL, FL, NC and SC.
fsw = (30.356590, -85.468937)  # All Georgia
fne = (34.987081, -80.687209)

# Foursquare category IDs to search for.
# NOTE(review): presumably the Asian / Indian restaurant categories - confirm
# against the Foursquare category list.
two_codes =  ['4bf58dd8d48988d142941735', '4bf58dd8d48988d10f941735']

# bx_test_results = query_box((fsw,fne), two_codes)

# Runs the full crawl (many API calls). query_list holds one
# ((sw, ne), venue_count) entry per accepted box and is reused for the map.
df_venues, query_list = GetVenuesByBox((fsw,fne), two_codes)


In [53]:
df_venues.shape

(4345, 6)

In [55]:
set(df_venues.state)

{'AL', 'FL', 'Florida', 'GA', 'Georgia', 'NC', 'SC', 'South Carolina'}

In [0]:
# Keep only Georgia venues. Foursquare reports the state both abbreviated and
# spelled out (see the set of `state` values above), so both forms are matched.
df_georgia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]

In [111]:
# Count the Georgia venues whose category appears in bad_list.
# NOTE(review): bad_list is assigned in a later cell, so on a fresh kernel run
# top-to-bottom this line raises NameError - move it below that cell.
df_georgia[df_georgia['categories'].isin(bad_list)].shape

(118, 6)

In [71]:
print(len(set(df_georgia['categories'])))
set(df_georgia['categories'])

75


{'American Restaurant',
 'Asian Restaurant',
 'BBQ Joint',
 'Bakery',
 'Bar',
 'Breakfast Spot',
 'Bubble Tea Shop',
 'Buffet',
 'Café',
 'Cantonese Restaurant',
 'Caribbean Restaurant',
 'Chaat Place',
 'Chinese Restaurant',
 'Cocktail Bar',
 'Coffee Shop',
 'Convenience Store',
 'Deli / Bodega',
 'Dim Sum Restaurant',
 'Dumpling Restaurant',
 'Farmers Market',
 'Fast Food Restaurant',
 'Filipino Restaurant',
 'Food & Drink Shop',
 'Food Court',
 'Food Truck',
 'Fried Chicken Joint',
 'Garden',
 'Gas Station',
 'Grocery Store',
 'Hot Dog Joint',
 'Hotel Bar',
 'Hotpot Restaurant',
 'Hunan Restaurant',
 'Indian Chinese Restaurant',
 'Indian Restaurant',
 'Indian Sweet Shop',
 'Indonesian Restaurant',
 'Italian Restaurant',
 'Japanese Curry Restaurant',
 'Japanese Restaurant',
 'Jiangsu Restaurant',
 'Juice Bar',
 'Karaoke Bar',
 'Korean Restaurant',
 'Malay Restaurant',
 'Mexican Restaurant',
 'Middle Eastern Restaurant',
 'Mongolian Restaurant',
 'New American Restaurant',
 'Noodle Ho

In [0]:
# Category names used to flag rows of df_georgia via `.isin(bad_list)`.
# The entries mirror the category names found in df_georgia; commented-out
# names are deliberately left out of the list.
# NOTE(review): presumably the listed categories are the ones judged
# off-target for this project - confirm before using the list as a filter.
bad_list = [ 'American Restaurant',
 # 'Asian Restaurant',
 'BBQ Joint',
 'Bakery',
 'Bar',
 'Breakfast Spot',
 # 'Bubble Tea Shop',
 'Buffet',
 'Café',
 # 'Cantonese Restaurant',
 'Caribbean Restaurant',
 'Chaat Place',
 # 'Chinese Restaurant',
 'Cocktail Bar',
 'Coffee Shop',
 'Convenience Store',
 'Deli / Bodega',
 # 'Dim Sum Restaurant',
 # 'Dumpling Restaurant',
 'Farmers Market',
 'Fast Food Restaurant',
 # 'Filipino Restaurant',
 'Food & Drink Shop',
 'Food Court',
 'Food Truck',
 'Fried Chicken Joint',
 'Garden',
 'Gas Station',
 'Grocery Store',
 'Hot Dog Joint',
 'Hotel Bar',
 # 'Hotpot Restaurant',
 # 'Hunan Restaurant',
 # 'Indian Chinese Restaurant',
 # 'Indian Restaurant',
 # 'Indian Sweet Shop',
 # 'Indonesian Restaurant',
 'Italian Restaurant',
 # 'Japanese Curry Restaurant',
 # 'Japanese Restaurant',
 # 'Jiangsu Restaurant',
 'Juice Bar',
 # 'Karaoke Bar',
 # 'Korean Restaurant',
 # 'Malay Restaurant',
 'Mexican Restaurant',
 'Middle Eastern Restaurant',
 'Mongolian Restaurant',
 'New American Restaurant',
 # 'Noodle House',
 # 'North Indian Restaurant',
 'Pizza Place',
 # 'Poke Place',
 # 'Ramen Restaurant',
 # 'Restaurant',
 'Salon / Barbershop',
 # 'Sandwich Place',
 # 'Seafood Restaurant',
 # 'Shabu-Shabu Restaurant',
 # 'Shanghai Restaurant',
 'Snack Place',
 # 'Soup Place',
 # 'South Indian Restaurant',
 'Sports Bar',
 'Steakhouse',
 'Supermarket',
 # 'Sushi Restaurant',
 # 'Szechuan Restaurant',
 'Taco Place',
 # 'Taiwanese Restaurant',
 'Tea Room',
 # 'Thai Restaurant',
 'Vegetarian / Vegan Restaurant',
 # 'Vietnamese Restaurant',
 'Wings Joint']

In [109]:
len(bad_list)

40

In [41]:
# Overall bounding box of all queried boxes and the range of venue counts.
# Every query_list entry has the form ((sw_corner, ne_corner), venue_count),
# with each corner given as (lat, lng).
sw_min_lat = min(sw[0] for (sw, ne), count in query_list)
sw_min_lng = min(sw[1] for (sw, ne), count in query_list)
ne_max_lat = max(ne[0] for (sw, ne), count in query_list)
ne_max_lng = max(ne[1] for (sw, ne), count in query_list)
min_clients = min(count for (sw, ne), count in query_list)
max_clients = max(count for (sw, ne), count in query_list)

# Midpoint of the bounding box, used to centre the map
center = [(sw_min_lat + ne_max_lat)/2, (sw_min_lng + ne_max_lng)/2]

print(sw_min_lat, sw_min_lng, ne_max_lat, ne_max_lng, min_clients, max_clients)
print(center)

30.35659 -85.468937 34.987081 -80.687209 0 97
[32.6718355, -83.07807299999999]


In [0]:
# Draw every queried box on a map; the fill gets more opaque the more venues
# the box returned (relative to the fullest box).
map_georgia = folium.Map(location=center, zoom_start=7)

# Avoid dividing by zero when every queried box came back empty
opacity_scale = max_clients if max_clients else 1

for box in query_list:
    (sw, ne), num_clients = box
    opacity = num_clients / opacity_scale

    folium.Rectangle([sw, ne],
                     popup=str(num_clients), color="black", opacity=0.99, fill=True,
                     fill_color='orange', weight = 2,
                     fill_opacity=(opacity)).add_to(map_georgia)

map_georgia

In [19]:
query_list[1]

(((33.913157, -84.141344), (33.941227, -84.101701)), 2)

In [9]:
print(df_venues.shape)
df_venues.head()


(374, 6)


Unnamed: 0,name,categories,postalCode,lat,lng,state
0,Ashiana,Indian Restaurant,30071.0,33.914464,-84.207432,GA
1,I Luv Pho 2,Asian Restaurant,30096.0,33.942934,-84.159522,GA
2,Seo Ra Beol Restaurant,Asian Restaurant,30096.0,33.963876,-84.141964,GA
3,SunO Dessert,Asian Restaurant,30096.0,33.96918,-84.14504,GA
4,Chow King,Asian Restaurant,,33.932097,-84.179774,GA


In [10]:
query_list

[(((33.913157, -84.22063), (33.969297, -84.141344)), 66),
 (((33.913157, -84.141344), (33.941227, -84.101701)), 2),
 (((33.913157, -84.101701), (33.941227, -84.06205800000001)), 3),
 (((33.941227, -84.141344), (33.955262, -84.1215225)), 31),
 (((33.941227, -84.1215225), (33.955262, -84.10170099999999)), 2),
 (((33.955262, -84.141344), (33.962279499999994, -84.13143325)), 51),
 (((33.955262, -84.13143325), (33.962279499999994, -84.1215225)), 17),
 (((33.962279499999994, -84.141344), (33.969297, -84.13143325)), 31),
 (((33.962279499999994, -84.13143325), (33.969297, -84.1215225)), 4),
 (((33.955262, -84.1215225), (33.969297, -84.10170099999999)), 7),
 (((33.941227, -84.101701), (33.969297, -84.06205800000001)), 2),
 (((33.969297, -84.22063), (34.025437, -84.141344)), 97),
 (((33.969297, -84.141344), (34.025437, -84.06205800000001)), 61)]

In [22]:
# Concatenate every element of venue_test into one flat list and count the items.
# NOTE(review): venue_test is only assigned in the last cell of the notebook,
# so this cell depends on out-of-order execution.
flat_venues = []
for result in venue_test:
    flat_venues += result
    
len(flat_venues)

374

In [0]:
venue_test[0]

In [46]:
query_list[0]


(((33.913157, -84.22063), (33.969297, -84.141344)), 66)

In [11]:
query_list

[(((33.913157, -84.22063), (33.969297, -84.141344)), 66),
 (((33.913157, -84.141344), (33.941227, -84.101701)), 2),
 (((33.913157, -84.101701), (33.941227, -84.06205800000001)), 3),
 (((33.941227, -84.141344), (33.955262, -84.1215225)), 31),
 (((33.941227, -84.1215225), (33.955262, -84.10170099999999)), 2),
 (((33.955262, -84.141344), (33.962279499999994, -84.13143325)), 51),
 (((33.955262, -84.13143325), (33.962279499999994, -84.1215225)), 17),
 (((33.962279499999994, -84.141344), (33.969297, -84.13143325)), 31),
 (((33.962279499999994, -84.13143325), (33.969297, -84.1215225)), 4),
 (((33.955262, -84.1215225), (33.969297, -84.10170099999999)), 7),
 (((33.941227, -84.101701), (33.969297, -84.06205800000001)), 2),
 (((33.969297, -84.22063), (34.025437, -84.141344)), 97),
 (((33.969297, -84.141344), (34.025437, -84.06205800000001)), 61)]

In [0]:
query_list

In [0]:
split_box([(0,0),(2.5,3.6)], "size")
    

In [0]:
test_results

In [0]:
test_results.values.tolist()

In [37]:
jd_list = [1, 2, 3]
jd_2 = [4, 5]
jd_list + jd_2


[1, 2, 3, 4, 5]

In [0]:
jd_list.append(jd_2)

In [39]:
jd_list

[1, 2, 3, [4, 5]]

In [0]:
def jd_test(a_list):
    """Scratch helper: append 427 to `a_list` in place and return None.

    Demonstrates that a list argument mutated inside a function is changed
    for the caller as well.
    """
    a_list.append(427)
    return None

In [28]:
jd_test(jd_list)
jd_list

[427, 427, 427, 427]

In [35]:
jd_list

NameError: ignored

In [0]:
# Manual run of the same split-then-shrink loop that GetVenuesByBox performs,
# keeping the raw accumulators (venue_test, box_test) for inspection.
# NOTE(review): earlier cells read venue_test, so they only work after this
# cell has been executed.
venue_test, box_test = [], []

for box in split_box((fsw,fne), purpose="size"):
    # List will contain one or more boxes. Obtain venues for each
    shrink_box(box, two_codes, venue_test, box_test)