# Extracting Data from Foursquare
06/5/19

* Refactor code from "Exploring Foursquare"

# Libraries and Functions

In [2]:
print("Importing ...")
import numpy as np # library to handle data in a vectorized manner
# import math   # for ceil function

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# #!conda install -c conda-forge geopy --yes # uncomment this line if needed
# from geopy.geocoders import Nominatim # convert address into latitude and longitude

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
# import matplotlib.cm as cm
# import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if needed
import folium # map rendering library
from folium import plugins

print("Folium version: ", folium.__version__)

Importing ...
Folium version:  0.8.3


In [0]:
# @hidden cell
# Foursquare information

CLIENT_ID = 'CWCPNAVXDH3TI1BGS4VED4ANSUKEFGHBA4511GRPYPKPNJRD' # your Foursquare ID
CLIENT_SECRET = 'ALMFEYIENSPH3RV3TQB1NGWTKTANVJ5QTHAVZ5B1GRJWIP21' # your Foursquare Secret

# Foursquare query information that will be constant during this project
VERSION = '20190214' # Foursquare API version
LIMIT = 120 

In [0]:
# define function that extracts the category of the venue
# This function comes from Coursera Applied Data Science Capstone class
# used in Foursquare section

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
          
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [0]:
def split_box(box, purpose="number"):
    # Breaks a map "box" into smaller pieces
    # purpose = "size" - if the size of the box exceeds Foursquare's max size
    # purpose = "number" - if the number of returned results is 100 or more,
    #                      indicating that the query "maxed out"
    
    MAX_DELTA_LAT = 0.9   # Max allowable difference in latitude between corners
    MAX_DELTA_LNG = 0.9   # Max difference in longitude between corners
    
    delta_lat = abs(box[0][0] - box[1][0])
    delta_lng = abs(box[0][1] - box[1][1])
    
    boxes = [] # Will hold 1 or more map boxes
    
    if purpose == "number":  # Splitting box because too many results returned
        lat_divisions = lng_divisions = 2 # Break the box into 4 sub-boxesx
        
    elif purpose == "size":  # Splitting because box is too large for Foursquare
        lat_divisions = int(np.ceil(delta_lat / MAX_DELTA_LAT))
        lng_divisions = int(np.ceil(delta_lng / MAX_DELTA_LNG))
        
    else:  
        raise ValueError("This function only supports 'size' and 'number' box splitting")
    
    # Create sub-boxes based on number of lat & lng divisions
    for i in range(lat_divisions):
        for j in range(lng_divisions):
            lat_side = delta_lat / lat_divisions
            lng_side = delta_lng / lng_divisions
            swij = (box[0][0] + i * lat_side, box[0][1] + j * lng_side)
            neij = (swij[0] + lat_side, swij[1] + lng_side)
            boxes.append((swij, neij))            
    
    return boxes   

In [0]:
def query_box(box, codes):
    print(".", end="") # Indicate activity while performing queries

  # define the Foursquare URL

  # Foursquare category ID requires comma-separated text string
    code_string = ",".join(codes)    
    
    url = f'https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}' + \
        f'&client_secret={CLIENT_SECRET}&v={VERSION}&sw={box[0][0]},{box[0][1]}' + \
        f'&ne={box[1][0]},{box[1][1]}' + \
        f'&categoryId={code_string}&limit={LIMIT}'
  
  # submit the url and capture the returned text
    venues = requests.get(url).json()["response"]['groups'][0]['items']
  
  # if any venues are captured, process and return dataframe. Otherwise, return None. 
    if venues:
        venues = json_normalize(venues) # flatten JSON

        # select desired columns
        filtered_columns = ['venue.name', 'venue.categories', # 'venue.location.postalCode', 
                        'venue.location.lat', 'venue.location.lng', 'venue.location.state']
        venues = venues.loc[:, filtered_columns]

        # filter the category for each row
        venues['venue.categories'] = venues.apply(get_category_type, axis=1)

        # clean column names
        venues.columns = [col.split(".")[-1] for col in venues.columns]
  
        return venues
  
    else:
        return None  # no venues in box meet criteria    

In [0]:
def shrink_box(box, codes, venue_list, query_list):
    
    # Get venues for box (if any) in Pandas dataframe
    box_results = query_box(box, codes)

    try: # This will succeed if 1 or more venues are returned in box_results
        num_clients = box_results.shape[0]

        if num_clients < 100:  
            # Case where results did not "max out" at 100 returned venues
            venue_list += box_results.values.tolist()
            query_list.append((box, num_clients))

        else:  # This is the case where num_clients > 100
            for sub_box in split_box(box, purpose="number"):
                shrink_box(sub_box, codes, venue_list, query_list)

    except: # happens if query returns no results
        query_list.append((box, 0))

    return # venue_list and query_list are mutable, no need to return

In [0]:
def GetVenuesByBox(start_box,  # a tuple containing sw & ne corners of box
                   codes):     # a list of Foursquare category strings

    venue_list = []      # list of venus of interest
    query_list = []      # boxes and number of venues in each, for visualization
    
    # if the start box is too large, split it into smaller boxes
    for box in split_box(start_box, purpose="size"):
        # List will contain one or more boxes. Obtain venues for each
        shrink_box(box, codes, venue_list, query_list)
        
    # Add venue data to a dataframe    
    venues = pd.DataFrame(venue_list, columns=['name', 'categories', # 'postalCode', \
                                                  'lat', 'lng', 'state'])
    return venues, query_list

# Requesting Data from Foursquare

Foursquare allows you to request data by defining a bounding box using latitude & longitude coordinates. I want to request all Asian (including Indian & Pakistani) restaurants in a box drawn around the state of Georgia. 

There are two problems. First, Foursquare has a size limit (about 10,000 square kilometers). Georgia is much larger. Also, Foursquare will only return 100 venues per request (at least with my account type). My code handles these issues by dividing the box into smaller sub-boxes until all boxes are small enough for Foursquare. Then every time a query on a sub-box returns 100 results, the sub-box is split into 4 pieces and a new query is run on each one. This process repeats until all queries return <100 results. Any queries with 100 results are thrown away to avoid duplication.

Let's begin by drawing a large box around Georgia and requesting data. We will cover parts of the adjacent states but we will drop any rows that aren't in Georgia later.

In [0]:
# Define the southwest and northeast corners of a box around the state of Georgia

fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner

In [0]:
# The following venue categories come from Foursquare. Note that each 
# category has sub-categories (for example, "Korean Restaurant" in "Asian").
restaurant_codes =  ['4bf58dd8d48988d142941735'   # Asian Restaurant
                    ]   

# Request the data
df_venues, query_list = GetVenuesByBox((fsw,fne), restaurant_codes)

........................................................................................................................................................

## Data gathering process

Let's see how the code split the data request. Each box in the picture below represents a Foursquare query that returned 99 or fewer results. The shade of the box shows the number of returned values (you can click on the box to see the number). This is NOT a picture of the data -- we will see that in much more detail later.

In [0]:
# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]

# Find the maximum number of venues in any square (to calculate fill opacity)
max_clients = max([query[1] for query in query_list])

# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)

plugins.ScrollZoomToggler().add_to(map_georgia)
    
for box in query_list:
    sw, ne = box[0][0], box[0][1]
    opacity = box[1]/max_clients   # Clients in this box / max across all boxes          
    folium.Rectangle([sw, ne], 
                     popup=str(box[1]), color="black", opacity=0.99, fill=True, 
                     fill_color='orange', weight = 1, 
                     fill_opacity=(opacity)).add_to(map_georgia)
    
map_georgia

In the picture above you can see we started with a 6x6 grid, for 36 queries, to satisfy the size limit. After throwing away all queries returning 100 results (the max limit), how many successful queries did we end up with?

In [0]:
# Number of queries returning <100 results
len(query_list)

123

To get to a total of 141, boxes were split (123 - 36)/3 = 29 times. Data from those boxes were  thrown away, meaning 29 queries were wasted. The total number of queries to get my data from Georgia was 123 +  29 = 152. 

However, the smallest boxes are about 2 square miles in size. Georgia covers almost 60,000 square miles. **Over 25,000 queries would be needed to get this data with equal-sized boxes.** So my method is a bit better than that.




# Data Cleaning 

Now let's look at the data we've obtained, and get rid of any venues we don't need for our analysis

In [10]:
print("Number of restaurants in data set: ",df_venues.shape[0],"\n\n")
df_venues.head()

NameError: ignored

My bounding-box approach obtained restaurants in parts of adjacent states. Let's get rid of those.

In [0]:
# Which states are included in the data set?

print(list(set(df_venues.state)))

['TN', 'Florida', 'AL', 'Alabama', 'FL', 'SC', 'Georgia', 'GA', 'South Carolina']


In [0]:
# Get rid of restaurants in other states and count the results.
df_georgia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]
print("Number of restaurants Georgia: ",df_georgia.shape[0],"\n\n")

Number of restaurants Georgia:  3066 




That's better. In theory, this approach would prevent duplication in the data set, but I always like to be sure. Let's check for duplicate venues.

In [0]:
print("Before dropping duplicates: ",df_georgia.shape[0])
df_georgia = df_georgia.drop_duplicates(keep='first')
print("After dropping duplicates: ",df_georgia.shape[0])

Before dropping duplicates:  3066
After dropping duplicates:  3066


Foursquare did a good job of giving us a unique data set. I also looked at the data and confirmed that all rows contain values for restaurant name, categories, lat, and lng. Now let's look at the restaurant categories. Did we only get the ones we wanted?

In [0]:
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1044,1044,1044,1044
Asian Restaurant,655,655,655,655
Japanese Restaurant,449,449,449,449
Sushi Restaurant,243,243,243,243
Korean Restaurant,211,211,211,211
Thai Restaurant,177,177,177,177
Vietnamese Restaurant,92,92,92,92
Noodle House,32,32,32,32
Ramen Restaurant,14,14,14,14
Szechuan Restaurant,9,9,9,9


Convenience store? Italian restaurant? Garden? American restaurant? Clearly we need to get rid of some of these venues. In the cell below, I go through the list of categories and define a list of ones we want to get rid of. I generated the list using this command:

    sorted(list(set(df_georgia['categories'])))

In many cases, I used a command like the following to see which restaurants were in a suspicious category: 

     df_georgia[df_georgia['categories'].isin(['Taco Place'])]

You can see my comments in the lines below.

In [0]:
bad_list = ['American Restaurant',
# 'Asian Restaurant',
# 'BBQ Joint',  # Three venues, mixed Asian
 'Bakery',
 'Bar',
 'Breakfast Spot',
# 'Bubble Tea Shop',
# 'Buffet',   # One, Japanese
 'Café',
# 'Cantonese Restaurant',
 'Caribbean Restaurant',
# 'Chinese Restaurant',
 'Cocktail Bar',
 'Coffee Shop',
 'Deli / Bodega',
# 'Dim Sum Restaurant',
 'Dumpling Restaurant',  # Two restaurants, both Korean, both permanently closed
 'Fast Food Restaurant',
# 'Filipino Restaurant',
 'Food Court',
# 'Food Truck',
# 'Fried Chicken Joint',  # One, Korean
# 'Garden',   # One instance. Chinese restaurant.
 'Gas Station',
 'Grocery Store',
# 'Hot Dog Joint',  # One instance, Korean fried hotdogs. Must. Try. This.
 'Hotel Bar',
# 'Hotpot Restaurant',
# 'Indian Chinese Restaurant',
# 'Indian Restaurant',
# 'Indonesian Restaurant',  # Four instances. Roll into "Asian"
# 'Japanese Curry Restaurant',
# 'Japanese Restaurant',
# 'Jiangsu Restaurant',
# 'Karaoke Bar',
# 'Korean Restaurant',
# 'Malay Restaurant',  # Five instances. Roll into "Asian"
 'Mexican Restaurant',
 'Middle Eastern Restaurant',
# 'Mongolian Restaurant',  Roll into "Asian"
# 'Noodle House',
# 'Poke Place',  # Hawaiian, but we'll call it Japanese
# 'Ramen Restaurant',
 'Restaurant',
 'Salon / Barbershop',
# 'Sandwich Place',  # Two. One is Korean, one is not Asian
 'Seafood Restaurant',
# 'Shabu-Shabu Restaurant',
# 'Shanghai Restaurant',
 'Soup Place',   # One instance, permanently closed
 'Sports Bar',
# 'Steakhouse',  # Two. Both Japanese
# 'Supermarket',  # Four. All have restaurants inside
# 'Sushi Restaurant',
# 'Szechuan Restaurant',
# 'Taco Place', # One location "Hankook Taqueria" Korean
# 'Taiwanese Restaurant',
# 'Tea Room',  # One Location, Chinese
# 'Thai Restaurant',
# 'Vietnamese Restaurant',
 'Wings Joint']


Let's see how many restaurants are in the categories we don't want, and drop them from the data frame.

In [0]:
print("Number of restaurants in unwanted categories (dropped): ", df_georgia[df_georgia['categories'].isin(bad_list)].shape[0])
df_georgia = df_georgia[~df_georgia['categories'].isin(bad_list)]
print("Restaurants remaining in dataset: ", df_georgia.shape[0])

Number of restaurants in unwanted categories (dropped):  1
Restaurants remaining in dataset:  2995


Now let's consolidate some categories. Some of the categories remaining in our database (such as Szechuan Restaurant) are sub-categories. For our purposes, we want to roll these up to the top-level category (Szechuan --> Chinese, for example). Other categories roll up because I looked at the restaurants in the category and found that they all should have been put in another category (for example, all Steakhouses in our database are Japanese). Now I will define lists that will help consolidate categories in these cases.

In [0]:
# Define sub-categories that will be consolidated into major categories.
roll_up = [[['Bubble Tea Shop','Cantonese Restaurant','Dim Sum Restaurant','Garden', \
            'Shanghai Restaurant','Szechuan Restaurant', 'Taiwanese Restaurant', \
            'Tea Room'], 'Chinese Restaurant'],
           [['Buffet','Japanese Curry Restaurant', 'Japanese Restaurant', \
            'Ramen Restaurant', 'Shabu-Shabu Restaurant','Steakhouse', \
            'Sushi Restaurant'], 'Japanese Restaurant'],
           [['Fried Chicken Joint','Hot Dog Joint', 'Taco Place'], 'Korean Restaurant'],
           [['Indian Chinese Restaurant','Indian Restaurant','Jiangsu Restaurant'], \
            'Indo-Pak Restaurant'],
           [['BBQ Joint', 'Filipino Restaurant','Food Truck','Hotpot Restaurant',\
            'Indonesian Restaurant','Karaoke Bar','Malay Restaurant', \
            'Mongolian Restaurant', 'Noodle House','Poke Place', \
            'Sandwich Place', 'Supermarket'], 'Asian Restaurant']]

In [0]:
# Consolidate categories

for cat in roll_up:
    df_georgia['categories'][df_georgia['categories'].isin(cat[0])] = cat[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# Let's see how the restaurants are distributed now
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1071,1071,1071,1071
Asian Restaurant,720,720,720,720
Japanese Restaurant,715,715,715,715
Korean Restaurant,214,214,214,214
Thai Restaurant,177,177,177,177
Vietnamese Restaurant,92,92,92,92
Indo-Pak Restaurant,6,6,6,6


##South Asian - Data Gathering & Cleaning
So far, we've only uncovered a handful of South Asian (Indian & Pakistani) restaurants. That's because Foursquare has separate categories for them. The restaurant supply company also serves Indian & Pakistani restaurants. Let's repeat the above analysis and merge the data. 

In [0]:
# Define the southwest and northeast corners of a box around the state of Georgia

fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner

# The following venue categories come from Foursquare. Note some 
# categories have sub-categories (for example, "Korean Restaurant" in "Asian").
restaurant_codes =  ['4bf58dd8d48988d10f941735',   # Indian Restaurant
                     '52e81612bcbc57f1066b79f8']    # Pakistani Restaurant   

# Request the data
df_venues, query_list = GetVenuesByBox((fsw,fne), restaurant_codes)
                     
# Drop restaurants not in Georgia.
df_south_asia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]

............................................

In [0]:
df_south_asia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]

In [0]:
# Create a list of categories we DON'T want. 
# List generated using this command: sorted(list(set(df_south_asia['categories'])))
# Categories to KEEP are commented OUT

bad_se = ['Arcade',
 'Bar',
 'Big Box Store',
 'Breakfast Spot',
 'Brewery',
# 'Chaat Place',
 'Chinese Restaurant',
 'Fast Food Restaurant',
# 'Food & Drink Shop',
 'Food Truck',
# 'Gourmet Shop',
 'Grocery Store',
 'Hotel',
# 'Indian Chinese Restaurant',
# 'Indian Restaurant',
# 'Indian Sweet Shop',
# 'Italian Restaurant',
 'Mediterranean Restaurant',
# 'North Indian Restaurant',
# 'Pakistani Restaurant',
# 'Pizza Place',
 'Sandwich Place',
# 'South Indian Restaurant',
 'Sports Bar',
# 'Tea Room',
 'Vegetarian / Vegan Restaurant']

# Drop all rows with a category we don't want
df_south_asia = df_south_asia[~df_south_asia['categories'].isin(bad_se)]

In [0]:
# Let's look at the number of restaurants by category
df_south_asia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indian Restaurant,265,265,265,265
North Indian Restaurant,7,7,7,7
South Indian Restaurant,6,6,6,6
Indian Chinese Restaurant,5,5,5,5
Pakistani Restaurant,4,4,4,4
Chaat Place,3,3,3,3
Food & Drink Shop,1,1,1,1
Gourmet Shop,1,1,1,1
Indian Sweet Shop,1,1,1,1
Italian Restaurant,1,1,1,1


In [0]:
# Consolidate the categories into one - 'Indo-Pak Restaurant'

df_south_asia['categories'] = 'Indo-Pak Restaurant'
df_south_asia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indo-Pak Restaurant,296,296,296,296


Looks good. There may be some mis-labeled restaurants in this dataframe, but we'll look at that after we merge the dataframes together.

In [0]:
# Merge the dataframes
df_georgia = df_georgia.append(df_south_asia)
df_georgia.reset_index(drop=True, inplace=True)
print(df_georgia.shape)


(3291, 5)


Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Indo-Pak Restaurant,296,296,296,296


In [0]:
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1071,1071,1071,1071
Asian Restaurant,720,720,720,720
Japanese Restaurant,715,715,715,715
Indo-Pak Restaurant,302,302,302,302
Korean Restaurant,214,214,214,214
Thai Restaurant,177,177,177,177
Vietnamese Restaurant,92,92,92,92


Did performing two queries and merging them introduce duplicates? Let's find out and remove if so.

In [0]:
print("Before dropping duplicates: ",df_georgia.shape[0])
df_georgia = df_georgia.drop_duplicates(keep='first')
print("After dropping duplicates: ",df_georgia.shape[0])

Before dropping duplicates:  3291
After dropping duplicates:  3286


Now that we have all the restaurants in one place, let's take a closer look at the data and fix some inaccuracies.

##Fixing Inaccurate Categories

After looking through the data, it's clear that the categorization of the restaurants can be improved. For example, there are plenty of restaurants in the general "Asian" category that have "Japanese" or "Chinese" in their names. There are also some restaurants that are just in the wrong category - again, Chinese restaurants that are categorized as Japanese, etc. Let's see if we can fix this programatically, and with a minimum of manual effort.

I extracted the names of the restaurants in each category, and did a word frequency analysis to find terms that are common and specific to each restaurant type. For example, Vietnamese restaurant names often have these terms:

    vietnamese_terms = ['Pho', 'Saigon', 'Viet', 'Banh Mi']
    
Let's see if we can use those terms to correct some assignment problems in this database.

In [0]:
# Define terms commonly found in restaurant names for each country of origin.

china_terms = ['China', 'Chinese', 'Wok', 'Hong Kong', 'Panda', 'Peking', 
               'Beijing', 'Great Wall']
japan_terms = ['Japanese', 'Tokyo', 'Japan', 'Osaka', 
               'Shogun', 'Fuji', 'Sumo', 'Ichiban', 'Kobe', 'Sakura', 'Ramen', 
               'Teriyaki', 'Ninja', 'Shabu']
korea_terms = ['Korea', 'Gogi']
thailand_terms = ['Thai', 'Bangkok']
vietnam_terms = ['Pho', 'Saigon', 'Viet', 'Banh Mi']
indopak_terms = ['India', 'Bombay', 'Biryani', 'Naan', 'Masala']

term_list = [['Chinese Restaurant',china_terms],
             ['Japanese Restaurant',japan_terms],
             ['Korean Restaurant', korea_terms],
             ['Thai Restaurant', thailand_terms],
             ['Vietnamese Restaurant', vietnam_terms],
             ['Indo-Pak Restaurant', indopak_terms]]


In [0]:
# Use keywords to reassign restaurants into more accurate categories

# Iterate through the dataframe
for i in range(df_georgia.shape[0]):
    # For each row, see if any keyword appears in the restaurant name
    # If so, change the category
    for a_term in term_list:
        if any(x in df_georgia.loc[i,'name'] for x in a_term[1]):
            df_georgia.loc[i, 'categories'] = a_term[0]

Our data cleaning is complete. What does the cleaned database look like?

In [0]:
print(df_georgia.shape)
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

(3286, 5)


Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1195,1195,1195,1195
Japanese Restaurant,789,789,789,789
Asian Restaurant,470,470,470,470
Indo-Pak Restaurant,293,293,293,293
Korean Restaurant,215,215,215,215
Thai Restaurant,203,203,203,203
Vietnamese Restaurant,121,121,121,121


In the next section, we will convert the categories to columns with integer values, in preparation to perform the k means analysis of the data.

##One Hot encoding

In [0]:
df_georgia.head()

Unnamed: 0,name,categories,lat,lng,state
0,Yuki Express,Asian Restaurant,30.903404,-84.537571,GA
1,Makan,Asian Restaurant,30.9038,-84.57547,GA
2,Star China Buffet,Chinese Restaurant,30.902765,-84.554162,GA
3,Yuki Express Japanese Restaurant,Japanese Restaurant,30.903799,-84.57547,GA
4,Jin's Chinese Buffet,Chinese Restaurant,30.891642,-84.207543,GA


In [12]:
df_georgia_dummies = pd.get_dummies(df_georgia['categories'])
print(df_georgia.shape)
print(df_georgia_dummies.shape)
df_georgia_dummies.head(10)

(3286, 5)
(3286, 7)


Unnamed: 0,Asian Restaurant,Chinese Restaurant,Indo-Pak Restaurant,Japanese Restaurant,Korean Restaurant,Thai Restaurant,Vietnamese Restaurant
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0
5,0,1,0,0,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,1,0,0,0
8,0,1,0,0,0,0,0
9,0,1,0,0,0,0,0


In [13]:
df_georgia_dummies.columns = ['Asian', 'Chinese', 'Indo-Pak',
       'Japanese', 'Korean', 'Thai',
       'Vietnamese']
df_georgia_dummies.head(10)

Unnamed: 0,Asian,Chinese,Indo-Pak,Japanese,Korean,Thai,Vietnamese
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0
5,0,1,0,0,0,0,0
6,0,0,0,1,0,0,0
7,0,0,0,1,0,0,0
8,0,1,0,0,0,0,0
9,0,1,0,0,0,0,0


In [0]:
df_georgia[list(df_georgia_dummies.columns)] = df_georgia_dummies

In [48]:
df_georgia.head()

Unnamed: 0,name,categories,lat,lng,state,Asian,Chinese,Indo-Pak,Japanese,Korean,Thai,Vietnamese
0,Yuki Express,Asian Restaurant,30.903404,-84.537571,GA,1,0,0,0,0,0,0
1,Makan,Asian Restaurant,30.9038,-84.57547,GA,1,0,0,0,0,0,0
2,Star China Buffet,Chinese Restaurant,30.902765,-84.554162,GA,0,1,0,0,0,0,0
3,Yuki Express Japanese Restaurant,Japanese Restaurant,30.903799,-84.57547,GA,0,0,0,1,0,0,0
4,Jin's Chinese Buffet,Chinese Restaurant,30.891642,-84.207543,GA,0,1,0,0,0,0,0


In [0]:
# DELETE PRIOR TO PROJECT COMPLETION
# Now let's save this as a csv file.
file_contents = df_georgia.to_csv(index=False)
with(open('georgia.csv', 'w')) as file1:
    file1.write(file_contents)

# Visualization
Before we create sales territories from the data, let's see where our restaurants are located.


In [39]:
fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner, Georgia USA

# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)   


# heat_data = [[row['lat'],row['lng']] for index, row in df_georgia
#              #[df_georgia['categories'] == 'Chinese Restaurant']
#     .iterrows()]

# plugins.HeatMap(heat_data, radius=15).add_to(map_georgia)

# plugins.HeatMap(heat_data, gradient={.4: '#005500', .65: '#009900', 1: '#00FF00'}).add_to(map_georgia)


# GOOD SETTINGS TO SHOW KOREAN
heat_data = [[row['lat'],row['lng']] for index, row in df_georgia
             [df_georgia['categories'] == 'Korean Restaurant']
    .iterrows()]

plugins.HeatMap(heat_data, radius=15).add_to(map_georgia)

plugins.ScrollZoomToggler().add_to(map_georgia)

# GOOD SETTINGS FOR ALL RESTAURANTS
# heat_data = [[row['lat'],row['lng']] for index, row in df_georgia
#              #[df_georgia['categories'] == 'Chinese Restaurant']
#     .iterrows()]

# plugins.HeatMap(heat_data, radius=15).add_to(map_georgia)
    
map_georgia

In [0]:
# Create a dictionary to control colors on map

color_dict = {'Asian Restaurant':'cyan',
 'Chinese Restaurant':'pink',
 'Indo-Pak Restaurant': 'orange',
 'Japanese Restaurant':'yellow',
 'Korean Restaurant': 'blue',
 'Thai Restaurant': 'green',
 'Vietnamese Restaurant': 'red'}


In [0]:
# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True) 

plugins.ScrollZoomToggler().add_to(map_georgia)

for restaurant in df_georgia[df_georgia['categories'].isin(['Indo-Pak Restaurant','Thai Restaurant', 'Korean Restaurant','Vietnamese Restaurant'])].iterrows():
    folium.Circle(radius = 20, color = color_dict[restaurant[1]['categories']], 
                  location = [restaurant[1]['lat'],restaurant[1]['lng']]).add_to(map_georgia)
    
    
map_georgia

In [0]:
map_georgia

#Analysis

Let's start by seeing how the data would look if we just group the items by geographic location.

In [31]:
# set number of clusters. this represents a lot of trial-and-error.
kclusters = 3

georgia_clustering = df_georgia.drop(['name','categories','state'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(georgia_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20] 

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
      dtype=int32)

In [32]:
georgia_clustering['cluster']=kmeans.labels_
georgia_clustering.head()


Unnamed: 0,lat,lng,cluster
0,30.903404,-84.537571,2
1,30.9038,-84.57547,2
2,30.902765,-84.554162,2
3,30.903799,-84.57547,2
4,30.891642,-84.207543,2


In [33]:
georgia_clustering.groupby(['cluster']).count()

Unnamed: 0_level_0,lat,lng
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2527,2527
1,413,413
2,346,346


In [0]:
set(df_georgia[df_georgia['categories'].isin(['Asian Restaurant'])]['name'])

In [0]:
"hello".contains('hel')

AttributeError: ignored

In [0]:
s = "Hello world how are you doing"
"world|howdy" in s

False

In [0]:
df_georgia[df_georgia['name'].str.contains('|'.join(term_list[0][1]))]['categories'] = term_list[0][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [0]:
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1071,1071,1071,1071
Asian Restaurant,720,720,720,720
Japanese Restaurant,715,715,715,715
Indo-Pak Restaurant,297,297,297,297
Korean Restaurant,214,214,214,214
Thai Restaurant,177,177,177,177
Vietnamese Restaurant,92,92,92,92


In [0]:
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1195,1195,1195,1195
Japanese Restaurant,789,789,789,789
Asian Restaurant,470,470,470,470
Indo-Pak Restaurant,293,293,293,293
Korean Restaurant,215,215,215,215
Thai Restaurant,203,203,203,203
Vietnamese Restaurant,121,121,121,121


In [0]:
list(set(df_georgia['categories']))

['Indo-Pak Restaurant',
 'Japanese Restaurant',
 'Korean Restaurant',
 'Chinese Restaurant',
 'Asian Restaurant',
 'Thai Restaurant',
 'Vietnamese Restaurant']

In [0]:
# THIS ROW DEMONSTRATES HOW TO FIND PARTIAL MATCHES.

df_whatsit = df_georgia[df_georgia['name'].str.contains('|'.join(thai_terms))]
df_whatsit.groupby('categories').count().sort_values(by=['name'], ascending=False)


Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Thai Restaurant,145,145,145,145
Asian Restaurant,21,21,21,21
Japanese Restaurant,6,6,6,6
Chinese Restaurant,2,2,2,2


In [0]:
df_whatsit[df_whatsit['categories'].isin(['Chinese Restaurant'])]

Unnamed: 0,name,categories,lat,lng,state
1066,Lucky Buddha Chinese Thai And Sushi,Chinese Restaurant,33.782012,-84.404881,GA
1839,The Pearl Chinese & Thai,Chinese Restaurant,34.022653,-84.270607,GA


In [0]:
for term in list(df_georgia['name'][df_georgia['categories'].isin(['Indo-Pak Restaurant'])]):
    print(term,end = " ")

Hsu's Gourmet Heart of India Restaurant Jade Dragon Carryout Dynasty Chinese Restaurant Paradise Biryani Pointe Simha Authentic Multi Cuisine Fine Dine Restaurant and Bar 

In [0]:
for term in list(df_georgia['name'][df_georgia['categories'].isin(['Asian Restaurant'])]):
    print(term,end = " ")

Yuki Express Makan Mori Japanese Steakhouse Beijing Cafe Super Hibachi Buffet Hunan House So.Ho. Cup Works Wild Wok Royal Buffet Lin's Hibachi Saigon Tokyo Raidas Asian Cafe Mikata Japanese Steakhouse & Sushi Bar Rice Bowl Cup O Rice N Wings Hong Kong Buffet Kings Deli Hibachi Grill Buffet Great Wall China Shuttle Coconut Asian Bistro Main Garden King Buffet Kobe Tokyo Express Kyoto Express Panda Express China Restaurant China One First Asia Express Asian Taste Lee New Hong Kong Wok N Roll Panda Express China Super Buffet Wok N Roll Chef Lee's Peking Restaurant Mikata Japanese Steakhouse & Sushi Bar Uptown Vietnam Cuisine Ok Sun Korean Restaurant Sarku Japan Hibachi Grill Supreme Buffet chef changs asian grill Main PX Manchu WOK China Express #1 Chinese Restaurant Great Wall Hungry Lee' Golden China Hibachi Express Mongo Grill Saigon Noodle House Hibachi Buffet Grill and Sushi Anna's Asian Cafe Yummy Express Wok N Roll Moon's Wings & Seafood Top China Asian House New China #8 Genji Jap

In [0]:
for term in list(df_south_asia['name']):
    print(term,end = " ")

Jazz Table Passage  2India Tandoor Blakes Kitchen Table Taste of India Bombay Grill Peacock Grill Bombay Bay Taste Of India Double D Fast Foods Curry Mantra Metropolis Grill Shahenshah Cuisine of India Peach County Ga Kali Mirck Taste of India Pakwan Indian Cuisine Naan Appetit Pak Wan Pakwan Naan Shivam Indian Grocery sk8 wings Himalayan Curry Kitchen Khalifa STAR OF INDIA Khalifa Indian Restaurant Indian Oven Masala Cottage Rasoi Indian Kitchen Dida's Orchard Taj Indian Restaurant Bombay City Little India Tasty Curry Chai Pani Zyka Cafe Bombay Masti Fun Indian Street Eats Tabla Niramish Fine Indian Dining Botiwalla NaanStop Chat Patti Cherians Amara Desi Spice Planet Bombay Indian Restaurant Gokul Sweets 2 Himalayan Spice Haveli Indian Cuisine Madras Mantra Aamar Indian Cuisine Copper Cove Indian Bistro Tin Drum Asian Kitchen - Lindbergh Plaza Turmeric Indian Bistro Gokul Sweets Mirch Masala Thali Vegetarian Indian Jai Ho Indian Kitchen & Bar Tava Indian Bistro Luqma Indo-Pak Restaur

In [0]:
set(df_south_asia['categories'])

{'Chaat Place',
 'Food & Drink Shop',
 'Gourmet Shop',
 'Indian Chinese Restaurant',
 'Indian Restaurant',
 'Indian Sweet Shop',
 'Italian Restaurant',
 'North Indian Restaurant',
 'Pakistani Restaurant',
 'Pizza Place',
 'South Indian Restaurant',
 'Tea Room'}

In [0]:

sorted(list(set(df_georgia['categories'])))

['Asian Restaurant',
 'Chinese Restaurant',
 'Indo-Pak Restaurant',
 'Japanese Restaurant',
 'Korean Restaurant',
 'Thai Restaurant',
 'Vietnamese Restaurant']

In [0]:
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Chinese Restaurant,1071,1071,1071,1071
Asian Restaurant,720,720,720,720
Japanese Restaurant,715,715,715,715
Korean Restaurant,214,214,214,214
Thai Restaurant,177,177,177,177
Vietnamese Restaurant,92,92,92,92
Indo-Pak Restaurant,6,6,6,6


In [0]:
str(list(df_georgia['name']))

'[\'Yuki Express\', \'Makan\', \'Star China Buffet\', \'Yuki Express Japanese Restaurant\', "Jin\'s Chinese Buffet", \'No1 China\', \'Yum Yum Express Cairo, Ga\', \'Mori Japanese Steakhouse\', \'Beijing Cafe\', \'Great Wall Chinese Restaurant\', \'Super Hibachi Buffet\', \'Hibachi Express\', \'Hunan House\', \'So.Ho.\', \'Cup Works\', \'Wild Wok\', \'Royal Buffet\', \'ichiban japanese steak house\', \'Masato Japanese Steakhouse & Sushi Bar\', \'Panda Express\', \'China Buffet\', "Keiko\'s", \'Peking Chinese Resturant\', \'Mandarin Express\', \'China Wok II\', \'Hong Yip Chinese Cuisine\', \'Garden Cafe\', \'Hibachi Grill & Buffet\', \'China Garden Chinese Restaurant\', "Ming\'s", \'China 1\', \'Chow Town Grill and Buffet\', \'Thai Cafe\', \'Osaka Hibachi and Sushi\', \'Aligatou\', \'Yummi Express\', \'Ichiban Japanese Steakhouse & Sushi Bar\', \'Masato Express\', "Mosato\'s", \'Osaka Hibachi & Sushi\', \'Thai Chang\', "Lin\'s Hibachi", \'Chens China\', \'Thai Mobile\', \'China King\', 

In [0]:
a_dictionary = {'Indonesian Restaurant': 'a',
 'Poke Place': 'b',
 'Noodle House': 'c',
 'Ramen Restaurant': 'd',
 'Bubble Tea Shop': 'e',
 'Indian Restaurant': 'f',
 'Steakhouse': 'g'}

In [0]:
a_dictionary['Noodle House']

'c'

In [0]:
df_georgia[['categories','name']].groupby('categories').count()\
.sort_values(by='name', ascending=False)

Unnamed: 0_level_0,name
categories,Unnamed: 1_level_1
Chinese Restaurant,1044
Asian Restaurant,655
Japanese Restaurant,449
Sushi Restaurant,243
Korean Restaurant,211
Thai Restaurant,177
Vietnamese Restaurant,92
Noodle House,32
Ramen Restaurant,14
Szechuan Restaurant,9


# READ CSV FILES HERE

In [0]:
df_south_asia = pd.read_csv('https://raw.githubusercontent.com/JamesDCage/Final-Week-0/master/south_asian.csv')
df_south_asia.shape

(296, 6)

In [11]:
df_georgia = pd.read_csv('https://raw.githubusercontent.com/JamesDCage/Final-Week-0/master/georgia.csv')
df_georgia.shape

(3286, 5)

# BLANK LINES FOR TESTING

In [0]:
df_georgia.describe(include='all')

Unnamed: 0,name,categories,lat,lng,state
count,3286,3286,3286.0,3286.0,3286
unique,2528,7,,,2
top,Panda Express,Chinese Restaurant,,,GA
freq,55,1195,,,3197
mean,,,33.540945,-83.917743,
std,,,0.808942,0.954609,
min,,,30.681651,-85.512472,
25%,,,33.471458,-84.385651,
50%,,,33.881895,-84.211962,
75%,,,33.993036,-83.958741,


In [0]:
df_south_asia[df_south_asia['categories'].isin(['Middle Eastern Restaurant'])].sort_values(by='categories')

Unnamed: 0,name,categories,lat,lng,state,s_asia


In [0]:
df_georgia[df_georgia['categories'].isin(['Soup Place'])].sort_values(by='categories')

Unnamed: 0,name,categories,lat,lng,state
2032,Juicy & Tasty,Soup Place,33.950916,-84.140539,GA


In [0]:
df_georgia.describe(include='all')

Unnamed: 0,name,categories,lat,lng,state
count,322,322,322.0,322.0,322
unique,296,26,,,2
top,Chinese Dhaba,Indian Restaurant,,,GA
freq,4,265,,,313
mean,,,33.749211,-84.111439,
std,,,0.601438,0.640293,
min,,,30.83518,-85.258985,
25%,,,33.796777,-84.3493,
50%,,,33.903164,-84.271429,
75%,,,34.035847,-84.164537,


In [0]:
set(df_georgia['categories'])

{'Arcade',
 'Bar',
 'Big Box Store',
 'Breakfast Spot',
 'Brewery',
 'Chaat Place',
 'Chinese Restaurant',
 'Fast Food Restaurant',
 'Food & Drink Shop',
 'Food Truck',
 'Gourmet Shop',
 'Grocery Store',
 'Hotel',
 'Indian Chinese Restaurant',
 'Indian Restaurant',
 'Indian Sweet Shop',
 'Italian Restaurant',
 'Mediterranean Restaurant',
 'North Indian Restaurant',
 'Pakistani Restaurant',
 'Pizza Place',
 'Sandwich Place',
 'South Indian Restaurant',
 'Sports Bar',
 'Tea Room',
 'Vegetarian / Vegan Restaurant'}

In [0]:
df_georgia[df_georgia['categories'].isin(['Ramen Restaurant'])]

Unnamed: 0,name,categories,lat,lng,state
329,Sumo,Ramen Restaurant,31.961258,-83.759125,GA
507,Akedo,Ramen Restaurant,32.05215,-81.10348,GA
1091,Ton Ton,Ramen Restaurant,33.773323,-84.365836,GA
1092,Ramen Station,Ramen Restaurant,33.74638,-84.370605,GA
1226,Taiyo Ramen,Ramen Restaurant,33.776722,-84.296559,GA
1227,Pao Pao Ramen Factory & Bar,Ramen Restaurant,33.815229,-84.311979,GA
1429,Momo San Ramen,Ramen Restaurant,33.954304,-84.55148,GA
1626,Jinya Ramen Bar,Ramen Restaurant,33.856618,-84.382846,GA
1761,Yebisuya,Ramen Restaurant,33.90814,-84.287104,GA
1991,Kumai Ramen,Ramen Restaurant,33.951736,-84.141161,GA


In [0]:
df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False).tail(19)

Unnamed: 0_level_0,name,lat,lng,state
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Big Box Store,2,2,2,2
Food Truck,2,2,2,2
Italian Restaurant,1,1,1,1
Tea Room,1,1,1,1
Sports Bar,1,1,1,1
Sandwich Place,1,1,1,1
Pizza Place,1,1,1,1
Mediterranean Restaurant,1,1,1,1
Arcade,1,1,1,1
Indian Sweet Shop,1,1,1,1


In [0]:
## CLUSTER MARKERS

fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner, Georgia USA

# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)   

japanese_cluster = plugins.MarkerCluster().add_to(map_georgia)

for restaurant in df_georgia.iterrows():
    folium.Marker(location = [restaurant[1]['lat'],restaurant[1]['lng']]).add_to(japanese_cluster)

    
map_georgia