# Extracting Data from Foursquare
06/5/19

* Refactor code from "Exploring Foursquare"

# Libraries and Functions

In [2]:
import numpy as np # library to handle data in a vectorized manner
# import math   # for ceil function

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# #!conda install -c conda-forge geopy --yes # uncomment this line if needed
# from geopy.geocoders import Nominatim # convert address into latitude and longitude

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
# import matplotlib.cm as cm
# import matplotlib.colors as colors

# import k-means from clustering stage
# from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if needed
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [0]:
# @hidden cell
# Foursquare information

import os
import warnings

# Credentials are read from the environment so that they are never stored in
# (or committed with) the notebook. Keys that were previously hardcoded here
# should be treated as leaked and regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')          # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn("Foursquare credentials not found: set FOURSQUARE_CLIENT_ID and "
                  "FOURSQUARE_CLIENT_SECRET before running any queries.")

# Foursquare query information that will be constant during this project
VERSION = '20190214' # Foursquare API version
LIMIT = 120          # value sent as the `limit` parameter of every query

In [0]:
# define function that extracts the category of the venue
# This function comes from Coursera Applied Data Science Capstone class
# used in Foursquare section

def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must expose the category list under either 'categories' or
        'venue.categories'; 'categories' is tried first.

    Returns
    -------
    The 'name' of the first entry in the category list, or None when the
    list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; any other error
        # is a real problem and must not be hidden.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [0]:
def split_box(box, purpose="number"):
    """Break a map "box" into a grid of equally sized sub-boxes.

    Parameters
    ----------
    box : pair of (lat, lng) pairs
        Two opposite corners of the box. They may be given in either order;
        the grid is always built from the smaller latitude/longitude.
    purpose : str
        "number" - the query on this box "maxed out", so split it into a
                   2 x 2 grid (4 sub-boxes).
        "size"   - the box may exceed Foursquare's max size, so split it into
                   as many pieces as needed for every side to be at most
                   MAX_DELTA_LAT / MAX_DELTA_LNG degrees. At least one box is
                   always returned, even for a zero-extent box.

    Returns
    -------
    list of (sw, ne) tuples, each corner being a (lat, lng) tuple.

    Raises
    ------
    ValueError
        If purpose is neither "size" nor "number".
    """
    MAX_DELTA_LAT = 0.9   # Max allowable difference in latitude between corners
    MAX_DELTA_LNG = 0.9   # Max difference in longitude between corners

    delta_lat = abs(box[0][0] - box[1][0])
    delta_lng = abs(box[0][1] - box[1][1])

    # Anchor the grid at the true south-west corner regardless of corner order
    south = min(box[0][0], box[1][0])
    west = min(box[0][1], box[1][1])

    if purpose == "number":  # Splitting box because too many results returned
        lat_divisions = lng_divisions = 2 # Break the box into 4 sub-boxes

    elif purpose == "size":  # Splitting because box is too large for Foursquare
        # max(1, ...) guards against a zero-extent side producing 0 divisions,
        # which would silently return no boxes at all.
        lat_divisions = max(1, int(np.ceil(delta_lat / MAX_DELTA_LAT)))
        lng_divisions = max(1, int(np.ceil(delta_lng / MAX_DELTA_LNG)))

    else:
        raise ValueError("This function only supports 'size' and 'number' box splitting")

    # Side lengths are the same for every sub-box, so compute them once
    lat_side = delta_lat / lat_divisions
    lng_side = delta_lng / lng_divisions

    # Create sub-boxes based on number of lat & lng divisions
    boxes = []
    for i in range(lat_divisions):
        for j in range(lng_divisions):
            swij = (south + i * lat_side, west + j * lng_side)
            neij = (swij[0] + lat_side, swij[1] + lng_side)
            boxes.append((swij, neij))

    return boxes

In [0]:
def query_box(box, codes):
    """Query Foursquare's venues/explore endpoint for one bounding box.

    Parameters
    ----------
    box : (sw, ne)
        South-west and north-east corners, each a (lat, lng) pair.
    codes : list of str
        Foursquare category IDs; sent as one comma-separated string.

    Returns
    -------
    pandas.DataFrame with columns name, categories, lat, lng, state
    (one row per venue), or None when the box contains no matching venues.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status.
    requests.Timeout
        If no answer arrives within the timeout.
    """
    print(".", end="") # Indicate activity while performing queries

    # Let requests build and URL-encode the query string
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'sw': f'{box[0][0]},{box[0][1]}',
        'ne': f'{box[1][0]},{box[1][1]}',
        'categoryId': ",".join(codes),  # comma-separated text string
        'limit': LIMIT,
    }

    # A timeout keeps one stalled request from hanging the whole crawl, and
    # raise_for_status surfaces API errors instead of a confusing KeyError below.
    response = requests.get('https://api.foursquare.com/v2/venues/explore',
                            params=params, timeout=30)
    response.raise_for_status()
    venues = response.json()["response"]['groups'][0]['items']

    # if any venues are captured, process and return dataframe. Otherwise, return None.
    if not venues:
        return None  # no venues in box meet criteria

    # select desired columns
    filtered_columns = ['venue.name', 'venue.categories', # 'venue.location.postalCode',
                        'venue.location.lat', 'venue.location.lng', 'venue.location.state']

    # flatten JSON; reindex (rather than .loc) so a field that is absent from
    # every venue in this response becomes a NaN column instead of a KeyError
    venues = json_normalize(venues).reindex(columns=filtered_columns)

    # filter the category for each row
    venues['venue.categories'] = venues.apply(get_category_type, axis=1)

    # clean column names
    venues.columns = [col.split(".")[-1] for col in venues.columns]

    return venues

In [0]:
def shrink_box(box, codes, venue_list, query_list):
    """Collect all venues in a box, splitting it whenever a query "maxes out".

    Queries the box once. If fewer than 100 venues come back they are appended
    to venue_list; otherwise the results are discarded, the box is split into
    4 sub-boxes and each sub-box is processed recursively.

    Parameters
    ----------
    box : (sw, ne)
        Corners of the box to query.
    codes : list of str
        Foursquare category IDs passed through to query_box.
    venue_list : list
        Mutated in place: one list of column values is appended per venue.
    query_list : list
        Mutated in place: (box, number_of_venues) is appended for every box
        whose results were accepted, including boxes with 0 venues.

    Returns
    -------
    None. Results are delivered through the two mutable list arguments.

    Notes
    -----
    Errors raised by query_box (or by a recursive call) propagate to the
    caller rather than being recorded as "0 venues".
    There is no depth limit: a box that keeps returning 100 or more venues
    keeps being split.
    """
    MAX_RESULTS = 100  # a query returning this many venues is treated as truncated

    # Get venues for box (if any) in Pandas dataframe
    box_results = query_box(box, codes)

    if box_results is None:  # query returned no results
        query_list.append((box, 0))
        return

    num_venues = box_results.shape[0]

    if num_venues < MAX_RESULTS:
        # Case where results did not "max out" at 100 returned venues
        venue_list.extend(box_results.values.tolist())
        query_list.append((box, num_venues))
    else:
        # Case where num_venues >= 100: discard and re-query smaller boxes
        for sub_box in split_box(box, purpose="number"):
            shrink_box(sub_box, codes, venue_list, query_list)

In [0]:
def GetVenuesByBox(start_box, codes):
    """Gather every venue inside start_box that matches the given categories.

    Parameters
    ----------
    start_box : tuple
        The (sw, ne) corners of the area of interest.
    codes : list of str
        Foursquare category strings.

    Returns
    -------
    (venues, query_list)
        venues     - DataFrame with columns name, categories, lat, lng, state
        query_list - the boxes that were kept and the number of venues in
                     each, for visualization
    """
    collected_rows = []      # venues of interest, one list of values per venue
    completed_queries = []   # (box, venue count) for each accepted query

    # A start box that is too large is first cut into smaller boxes; each
    # of those is then queried (and subdivided further where necessary).
    initial_boxes = split_box(start_box, purpose="size")
    for initial_box in initial_boxes:
        shrink_box(initial_box, codes, collected_rows, completed_queries)

    # Add venue data to a dataframe
    column_names = ['name', 'categories', 'lat', 'lng', 'state']
    venues = pd.DataFrame(collected_rows, columns=column_names)
    return venues, completed_queries

# Requesting Data from Foursquare

Foursquare allows you to request data by defining a bounding box using latitude & longitude coordinates. I want to request all Asian (including Indian & Pakistani) restaurants in a box drawn around the state of Georgia. 

There are two problems. First, Foursquare has a size limit (about 10,000 square kilometers). Georgia is much larger. Also, Foursquare will only return 100 venues per request (at least with my account type). My code handles these issues by dividing the box into smaller sub-boxes until all boxes are small enough for Foursquare. Then every time a query on a sub-box returns 100 results, the sub-box is split into 4 pieces and a new query is run on each one. This process repeats until all queries return <100 results. Any queries with 100 results are thrown away to avoid duplication.

Let's begin by drawing a large box around Georgia and requesting data. We will cover parts of the adjacent states but we will drop any rows that aren't in Georgia later.

In [9]:
# Define the southwest and northeast corners of a box around the state of Georgia.
# Each corner is a (latitude, longitude) tuple.

# fsw = (33.946964, -84.150187)  # Small Box for testing
# fne = (33.971041, -84.121381)
fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner

# The following venue categories come from Foursquare. Note that each 
# category has sub-categories (for example, "Korean Restaurant" in "Asian").
restaurant_codes =  ['4bf58dd8d48988d142941735'   # Asian Restaurant
                    ]   

# Request the data. df_venues receives one row per venue; query_list receives
# one (box, venue count) entry per accepted query and is used for the map below.
# One "." is printed per Foursquare query.
df_venues, query_list = GetVenuesByBox((fsw,fne), restaurant_codes)

........................................................................................................................................................

## Data gathering process

Let's see how the code split the data request. Each box in the picture below represents a Foursquare query that returned 99 or fewer results. The shade of the box shows the number of returned values (you can click on the box to see the number). This is NOT a picture of the data -- we will see that in much more detail later.

In [11]:
# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]

# Find the maximum number of venues in any square (to calculate fill opacity).
# default=0 keeps this cell working when no queries were recorded.
max_clients = max((query[1] for query in query_list), default=0)

# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)   

for box in query_list:
    sw, ne = box[0][0], box[0][1]
    # Clients in this box / max across all boxes. When every box is empty
    # (max is 0) draw the boxes unfilled instead of dividing by zero.
    opacity = box[1]/max_clients if max_clients else 0
    folium.Rectangle([sw, ne], 
                     popup=str(box[1]), color="black", opacity=0.99, fill=True, 
                     fill_color='orange', weight = 1, 
                     fill_opacity=(opacity)).add_to(map_georgia)

map_georgia

In the picture above you can see we started with a 6x6 grid, for 36 queries, to satisfy the size limit. After throwing away all queries returning 100 results (the max limit), how many successful queries did we end up with?

In [12]:
# Number of queries returning <100 results.
# query_list holds one (box, venue count) entry per accepted query,
# including boxes in which no venues were found.
len(query_list)

123

To get to a total of 123, boxes were split (123 - 36)/3 = 29 times (each split replaces one box with four, a net gain of three). Data from those boxes were thrown away, meaning 29 queries were wasted. The total number of queries to get my data from Georgia was 123 + 29 = 152. 

However, the smallest boxes are about 2 square miles in size. Georgia covers almost 60,000 square miles. **Over 25,000 queries would be needed to get this data with equal-sized boxes.** So my method is a bit better than that.




# Data Cleaning 

Now let's look at the data we've obtained, and get rid of any venues we don't need for our analysis

In [13]:
# Report the row count, then preview the first few venues
print("Number of restaurants in data set: ", len(df_venues), "\n\n")
df_venues.head()

Number of restaurants in data set:  3984 




Unnamed: 0,name,categories,lat,lng,state
0,Super Canton Chinese Restaurant,Chinese Restaurant,30.951445,-85.516435,FL
1,Fortune Cookie,Chinese Restaurant,30.783262,-85.24828,FL
2,New Star Chinese Restaurant,Asian Restaurant,30.754914,-85.549341,FL
3,King House,Asian Restaurant,30.442998,-85.05437,FL
4,Panda Buffet,Chinese Restaurant,30.774442,-85.223421,FL


My bounding-box approach obtained restaurants in parts of adjacent states. Let's get rid of those.

In [14]:
# Which states are included in the data set?
# (a set removes repeats; the order of the printed list is arbitrary)

print(list(set(df_venues['state'])))

['TN', 'Florida', 'AL', 'Alabama', 'FL', 'SC', 'Georgia', 'GA', 'South Carolina']


In [17]:
# Keep only the venues whose state is recorded as Georgia (Foursquare uses both
# the abbreviation and the full name), then count what is left.
df_georgia = df_venues.loc[df_venues['state'].isin(['GA', 'Georgia'])]
print("Number of restaurants Georgia: ", len(df_georgia), "\n\n")

Number of restaurants Georgia:  3066 




That's better. In theory, this approach would prevent duplication in the data set, but I always like to be sure. Let's check for duplicate venues.

In [18]:
# Remove rows that are exact repeats of an earlier row (the first occurrence
# is the one that is kept) and report the row count before and after.
print("Before dropping duplicates: ", len(df_georgia))
df_georgia = df_georgia.drop_duplicates()
print("After dropping duplicates: ", len(df_georgia))

Before dropping duplicates:  3066
After dropping duplicates:  3066


Foursquare did a good job of giving us a unique data set. I also looked at the data and confirmed that all rows contain values for restaurant name, categories, lat, and lng. Now let's look at the restaurant categories. Did we only get the ones we wanted?

In [19]:
# Count venues per category, most common categories first
(
    df_georgia
    .groupby('categories')
    .count()
    .sort_values(by='name', ascending=False)
)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1044,1044,1044,1044
Asian Restaurant,655,655,655,655
Japanese Restaurant,449,449,449,449
Sushi Restaurant,243,243,243,243
Korean Restaurant,211,211,211,211
Thai Restaurant,177,177,177,177
Vietnamese Restaurant,92,92,92,92
Noodle House,32,32,32,32
Ramen Restaurant,14,14,14,14
Bakery,9,9,9,9


Convenience store? Italian restaurant? Garden? American restaurant? Clearly we need to get rid of some of these venues. In the cell below, I go through the list of categories and define a list of ones we want to get rid of. I generated the list using this command:

    sorted(list(set(df_georgia['categories'])))

In many cases, I used a command like the following to see which restaurants were in a suspicious category: 

     df_georgia[df_georgia['categories'].isin(['Taco Place'])]

You can see my comments in the lines below.

In [42]:
# Inspect the venues filed under a suspicious category
df_georgia[df_georgia['categories'] == 'Food Truck']

Unnamed: 0,name,categories,lat,lng,state
1453,Genki-To-Go Food Truck,Food Truck,33.806352,-84.413191,GA
1870,Yumbii,Food Truck,33.845331,-84.368455,GA


In [0]:
# Categories to DROP from df_georgia.
# Every category found in the data is listed below in alphabetical order;
# the ones that are commented OUT are the ones we KEEP. Trailing notes record
# why a surprising category was kept.
bad_list = ['American Restaurant',
# 'Asian Restaurant',
# 'BBQ Joint',
 'Bakery',
 'Bar',
 'Breakfast Spot',
# 'Bubble Tea Shop',
# 'Buffet',   # One, Japanese
 'Café',
# 'Cantonese Restaurant',
 'Caribbean Restaurant',
# 'Chinese Restaurant',
 'Cocktail Bar',
 'Coffee Shop',
 'Deli / Bodega',
# 'Dim Sum Restaurant',
# 'Dumpling Restaurant',
 'Fast Food Restaurant',
# 'Filipino Restaurant',
 'Food Court',
# 'Food Truck',
# 'Fried Chicken Joint',
# 'Garden',   # One instance. Chinese restaurant.
 'Gas Station',
 'Grocery Store',
# 'Hot Dog Joint',  # One instance, Korean fried hotdogs. Must. Try. This.
 'Hotel Bar',
# 'Hotpot Restaurant',
# 'Indian Chinese Restaurant',
# 'Indian Restaurant',
# 'Indonesian Restaurant',
# 'Japanese Curry Restaurant',
# 'Japanese Restaurant',
# 'Jiangsu Restaurant',
# 'Karaoke Bar',
# 'Korean Restaurant',
# 'Malay Restaurant',
 'Mexican Restaurant',
# 'Middle Eastern Restaurant',
# 'Mongolian Restaurant',
# 'Noodle House',
# 'Poke Place',  # Hawaiian, but we'll call it Japanese
# 'Ramen Restaurant',
 'Restaurant',
 'Salon / Barbershop',
# 'Sandwich Place',
 'Seafood Restaurant',
# 'Shabu-Shabu Restaurant',
# 'Shanghai Restaurant',
# 'Soup Place',
 'Sports Bar',
# 'Steakhouse',  # Two. Both Japanese
# 'Supermarket',  # Four. All have restaurants inside
# 'Sushi Restaurant',
# 'Szechuan Restaurant',
# 'Taco Place', # One location "Hankook Taqueria" Korean
# 'Taiwanese Restaurant',
# 'Tea Room',  # One Location, Chinese
# 'Thai Restaurant',
# 'Vietnamese Restaurant',
 'Wings Joint']


Let's see how many restaurants are in the categories we don't want, and drop them from the data frame.

In [43]:
# Report how many rows fall in an unwanted category, drop them, and report
# how many rows remain. Re-running this cell reports 0 dropped, because the
# unwanted rows are already gone from df_georgia.
print("Number of restaurants in unwanted categories (dropped): ", len(df_georgia[df_georgia['categories'].isin(bad_list)]))
df_georgia = df_georgia.loc[~df_georgia['categories'].isin(bad_list)]
print("Restaurants remaining in dataset: ", len(df_georgia))

Number of restaurants in unwanted categories (dropped):  0
Restaurants remaining in dataset:  2999


In [0]:
# DELETE PRIOR TO PROJECT COMPLETION
# Now let's save this as a csv file.
# Let pandas open the file itself: it writes UTF-8 and manages line endings,
# whereas writing the to_csv() string through open(..., 'w') depends on the
# platform's default encoding (category names such as 'Café' are not ASCII)
# and can double the line endings on Windows.
df_georgia.to_csv('georgia.csv', index=False)

In [0]:
# Scratch mapping from a few category names to single-letter placeholders.
# NOTE(review): within the visible notebook it is only read by the lookup in
# the next cell - looks like leftover experimentation; confirm before keeping.
a_dictionary = {'Indonesian Restaurant': 'a',
 'Poke Place': 'b',
 'Noodle House': 'c',
 'Ramen Restaurant': 'd',
 'Bubble Tea Shop': 'e',
 'Indian Restaurant': 'f',
 'Steakhouse': 'g'}

In [0]:
# Look up one category in the scratch mapping
a_dictionary['Noodle House']

'c'

In [44]:
# Number of venues per category after cleaning, largest first
(
    df_georgia[['categories', 'name']]
    .groupby('categories')
    .count()
    .sort_values(by='name', ascending=False)
)

Unnamed: 0_level_0,name
categories,Unnamed: 1_level_1
Chinese Restaurant,1044
Asian Restaurant,655
Japanese Restaurant,449
Sushi Restaurant,243
Korean Restaurant,211
Thai Restaurant,177
Vietnamese Restaurant,92
Noodle House,32
Ramen Restaurant,14
Szechuan Restaurant,9


# South Asian - Data Gathering & Cleaning
The restaurant supply company also serves Indian & Pakistani restaurants. Let's repeat the above for those venues.

In [82]:
# Define the southwest and northeast corners of a box around the state of Georgia
# (the same box that was used for the Asian restaurant request above).

fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner

# The following venue categories come from Foursquare. Note some 
# categories have sub-categories (for example, "Korean Restaurant" in "Asian").
restaurant_codes =  ['4bf58dd8d48988d10f941735',   # Indian Restaurant
                     '52e81612bcbc57f1066b79f8']    # Pakistani Restaurant   

# Request the data.
# NOTE: this overwrites df_venues and query_list from the Asian restaurant
# request; df_georgia is not affected.
df_venues, query_list = GetVenuesByBox((fsw,fne), restaurant_codes)
                     
# Drop restaurants not in Georgia.
df_south_asia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]

............................................

In [0]:
# Keep only the Georgia venues. This repeats the last statement of the
# request cell above, so the filter can be redone without querying Foursquare again.
df_south_asia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]

In [0]:
# Create a list of categories we DON'T want. 
# List generated using this command: sorted(list(set(df_south_asia['categories'])))
# Categories to KEEP are commented OUT, so the listing below shows every
# category found in the data in alphabetical order.

bad_se = ['Arcade',
 'Bar',
 'Big Box Store',
 'Breakfast Spot',
 'Brewery',
# 'Chaat Place',
 'Chinese Restaurant',
 'Fast Food Restaurant',
# 'Food & Drink Shop',
 'Food Truck',
# 'Gourmet Shop',
 'Grocery Store',
 'Hotel',
# 'Indian Chinese Restaurant',
# 'Indian Restaurant',
# 'Indian Sweet Shop',
# 'Italian Restaurant',
 'Mediterranean Restaurant',
# 'North Indian Restaurant',
# 'Pakistani Restaurant',
# 'Pizza Place',
 'Sandwich Place',
# 'South Indian Restaurant',
 'Sports Bar',
# 'Tea Room',
 'Vegetarian / Vegan Restaurant']

# Drop all rows with a category we don't want
# (~ inverts the mask, keeping the rows whose category is NOT in bad_se)
df_south_asia = df_south_asia[~df_south_asia['categories'].isin(bad_se)]

In [103]:
# Let's look at the number of restaurants by category, largest first
(
    df_south_asia
    .groupby('categories')
    .count()
    .sort_values(by='name', ascending=False)
)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indian Restaurant,265,265,265,265
North Indian Restaurant,7,7,7,7
South Indian Restaurant,6,6,6,6
Indian Chinese Restaurant,5,5,5,5
Pakistani Restaurant,4,4,4,4
Chaat Place,3,3,3,3
Food & Drink Shop,1,1,1,1
Gourmet Shop,1,1,1,1
Indian Sweet Shop,1,1,1,1
Italian Restaurant,1,1,1,1


In [0]:
# Let's add a South Asian column. This will be used for k means 
# clustering, later in this project.
# df_south_asia was produced by boolean filtering, so assigning a column to it
# directly triggers pandas' SettingWithCopyWarning. assign() builds a new frame
# instead, which avoids the warning and is safe to re-run.

df_south_asia = df_south_asia.assign(s_asia=1)

In [119]:
# (rows, columns) of the cleaned South Asian data, including the new s_asia column
df_south_asia.shape

(296, 6)

In [0]:
# DELETE PRIOR TO PROJECT COMPLETION
# Now let's save this as a csv file.
# Let pandas open the file itself: it writes UTF-8 and manages line endings,
# whereas writing the to_csv() string through open(..., 'w') depends on the
# platform's default encoding and can double the line endings on Windows.
df_south_asia.to_csv('south_asian.csv', index=False)

# READ CSV FILES HERE

In [121]:
# Reload the South Asian data from the copy of south_asian.csv hosted on GitHub.
# This replaces the in-memory frame built above; the shape is displayed as a check.
df_south_asia = pd.read_csv('https://raw.githubusercontent.com/JamesDCage/Final-Week-0/master/south_asian.csv')
df_south_asia.shape

(296, 6)

In [0]:
# Reload the cleaned Georgia data from the csv file written earlier in this
# notebook. (The original `pd.read.csv()` is not a pandas function and had no
# path, so this cell could not run.)
# NOTE(review): if georgia.csv is meant to be read from GitHub like
# south_asian.csv, replace the local path with that URL.
df_georgia = pd.read_csv('georgia.csv')

# BLANK LINES FOR TESTING

In [118]:
# NOTE(review): df_jimmy is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel. The recorded output (296, 6)
# matches df_south_asia.shape - presumably a scratch copy of that frame; confirm or delete.
df_jimmy.shape

(296, 6)

In [100]:
# Inspect the venues filed under 'Vegetarian / Vegan Restaurant'.
# This category is in bad_se, so the result is empty once the bad_se filter
# has been applied to df_south_asia.
df_south_asia[df_south_asia['categories'].isin(['Vegetarian / Vegan Restaurant'])].sort_values(by='categories')

Unnamed: 0,name,categories,lat,lng,state
324,The Grit,Vegetarian / Vegan Restaurant,33.960165,-83.382409,GA


In [0]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones.
df_georgia.describe(include='all')

Unnamed: 0,name,categories,lat,lng,state
count,322,322,322.0,322.0,322
unique,296,26,,,2
top,Chinese Dhaba,Indian Restaurant,,,GA
freq,4,265,,,313
mean,,,33.749211,-84.111439,
std,,,0.601438,0.640293,
min,,,30.83518,-85.258985,
25%,,,33.796777,-84.3493,
50%,,,33.903164,-84.271429,
75%,,,34.035847,-84.164537,


In [0]:
# Distinct category names present in df_georgia
set(df_georgia['categories'])

{'Arcade',
 'Bar',
 'Big Box Store',
 'Breakfast Spot',
 'Brewery',
 'Chaat Place',
 'Chinese Restaurant',
 'Fast Food Restaurant',
 'Food & Drink Shop',
 'Food Truck',
 'Gourmet Shop',
 'Grocery Store',
 'Hotel',
 'Indian Chinese Restaurant',
 'Indian Restaurant',
 'Indian Sweet Shop',
 'Italian Restaurant',
 'Mediterranean Restaurant',
 'North Indian Restaurant',
 'Pakistani Restaurant',
 'Pizza Place',
 'Sandwich Place',
 'South Indian Restaurant',
 'Sports Bar',
 'Tea Room',
 'Vegetarian / Vegan Restaurant'}

In [0]:
# Inspect the venues filed under 'Grocery Store' to decide whether to keep the category
df_georgia[df_georgia['categories'].isin(['Grocery Store'])]

Unnamed: 0,name,categories,lat,lng,state
56,India Stop Shop,Grocery Store,33.489505,-84.583528,GA
74,Patel Brothers Grocery,Grocery Store,33.797515,-84.281206,GA
122,Whole Foods Market,Grocery Store,33.775056,-84.366082,GA
126,EZ Mart,Grocery Store,33.797039,-84.40769,GA
169,Trader Joe's,Grocery Store,33.926248,-84.378479,GA
170,Publix,Grocery Store,33.851035,-84.359765,GA
171,Whole Foods Market,Grocery Store,33.840813,-84.381535,GA
295,Spices Hut,Grocery Store,34.060564,-84.237753,GA
302,Cherians,Grocery Store,34.105463,-84.176128,GA
304,Suvidha Indo-Pak Grocery,Grocery Store,34.077406,-84.162503,GA


In [0]:
# Venue counts per category, sorted largest first; tail(19) shows the 19 least common
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False).tail(19)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Big Box Store,2,2,2,2
Food Truck,2,2,2,2
Italian Restaurant,1,1,1,1
Tea Room,1,1,1,1
Sports Bar,1,1,1,1
Sandwich Place,1,1,1,1
Pizza Place,1,1,1,1
Mediterranean Restaurant,1,1,1,1
Arcade,1,1,1,1
Indian Sweet Shop,1,1,1,1
