# Coursera Data Science Capstone Project
## Creating Sales Territories for B2B Restaurant Supplier
### James Cage, June 2019


## Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Libraries and Functions</a>

2. <a href="#item2">Explore Neighborhoods in New York City</a>

3. <a href="#item3">Analyze Each Neighborhood</a>

4. <a href="#item4">Cluster Neighborhoods</a>

5. <a href="#item5">Examine Clusters</a>    
</font>
</div>

# 1. Libraries and Functions

In [1]:
print("Importing ...")
import numpy as np # library to handle data in a vectorized manner
# import math   # for ceil function

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import json # library to handle JSON files

# #!conda install -c conda-forge geopy --yes # uncomment this line if needed
# from geopy.geocoders import Nominatim # convert address into latitude and longitude

import requests # library to handle requests

import time

# Matplotlib and associated plotting modules
# import matplotlib.cm as cm
# import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.8.3 --yes # uncomment this line if needed
import folium # map rendering library
from folium import plugins

print("Folium version: ", folium.__version__)

Importing ...
Folium version:  0.8.3


In [0]:
# @hidden cell
# Foursquare information

CLIENT_ID = 'CWCPNAVXDH3TI1BGS4VED4ANSUKEFGHBA4511GRPYPKPNJRD' # your Foursquare ID
CLIENT_SECRET = 'ALMFEYIENSPH3RV3TQB1NGWTKTANVJ5QTHAVZ5B1GRJWIP21' # your Foursquare Secret

In [0]:
# Constants

# Foursquare query information that will be constant during this project
VERSION = '20190214' # Foursquare API version
LIMIT = 120 

# Used to convert latitude and longitude to distances in miles
MILES_PER_LAT = 69    # A degree of latitude is 69 miles. 
MILES_PER_LNG = 50.5  # In Georgia, a degree of longitude averages 50.5 miles

In [0]:
# define function that extracts the category of the venue
# This function comes from Coursera Applied Data Science Capstone class
# used in Foursquare section

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
          
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [0]:
def split_box(box, purpose="number"):
    # Breaks a map "box" into smaller pieces
    # purpose = "size" - if the size of the box exceeds Foursquare's max size
    # purpose = "number" - if the number of returned results is 100 or more,
    #                      indicating that the query "maxed out"
    
    '''Foursquare limits bounding box requests to no more than 10,000 square 
    kilometers, or 3860 square miles. This is a box a bit larger than 
    60 miles on a side.'''
    
    MAX_SIZE_LENGTH = 60

    MAX_DELTA_LAT = MAX_SIZE_LENGTH/MILES_PER_LAT   # Max difference in latitude 
    MAX_DELTA_LNG = MAX_SIZE_LENGTH/MILES_PER_LNG   # Max difference in longitude 
    
    delta_lat = abs(box[0][0] - box[1][0])
    delta_lng = abs(box[0][1] - box[1][1])
    
    boxes = [] # Will hold 1 or more map boxes
    
    if purpose == "number":  # Splitting box because too many results returned
        lat_divisions = lng_divisions = 2 # Break the box into 4 sub-boxesx
        
    elif purpose == "size":  # Splitting because box is too large for Foursquare
        lat_divisions = int(np.ceil(delta_lat / MAX_DELTA_LAT))
        lng_divisions = int(np.ceil(delta_lng / MAX_DELTA_LNG))
        
    else:  
        raise ValueError("This function only supports 'size' and 'number' box splitting")
    
    # Create sub-boxes based on number of lat & lng divisions
    for i in range(lat_divisions):
        for j in range(lng_divisions):
            lat_side = delta_lat / lat_divisions
            lng_side = delta_lng / lng_divisions
            swij = (box[0][0] + i * lat_side, box[0][1] + j * lng_side)
            neij = (swij[0] + lat_side, swij[1] + lng_side)
            boxes.append((swij, neij))            
    
    return boxes   

In [0]:
def query_box(box, codes):
    print(".", end="") # Indicate activity while performing queries

  # define the Foursquare URL

  # Foursquare category ID requires comma-separated text string
    code_string = ",".join(codes)    
    
    url = f'https://api.foursquare.com/v2/venues/explore?&client_id={CLIENT_ID}' + \
        f'&client_secret={CLIENT_SECRET}&v={VERSION}&sw={box[0][0]},{box[0][1]}' + \
        f'&ne={box[1][0]},{box[1][1]}' + \
        f'&categoryId={code_string}&limit={LIMIT}'
  
  # submit the url and capture the returned text
    venues = requests.get(url).json()["response"]['groups'][0]['items']
  
  # if any venues are captured, process and return dataframe. Otherwise, return None. 
    if venues:
        venues = json_normalize(venues) # flatten JSON

        # select desired columns
        filtered_columns = ['venue.name', 'venue.categories', # 'venue.location.postalCode', 
                        'venue.location.lat', 'venue.location.lng', 'venue.location.state']
        venues = venues.loc[:, filtered_columns]

        # filter the category for each row
        venues['venue.categories'] = venues.apply(get_category_type, axis=1)

        # clean column names
        venues.columns = [col.split(".")[-1] for col in venues.columns]
  
        return venues
  
    else:
        return None  # no venues in box meet criteria    

In [0]:
def shrink_box(box, codes, venue_list, query_list):
    
    # Get venues for box (if any) in Pandas dataframe
    box_results = query_box(box, codes)

    try: # This will succeed if 1 or more venues are returned in box_results
        num_clients = box_results.shape[0]

        if num_clients < 100:  
            # Case where results did not "max out" at 100 returned venues
            venue_list += box_results.values.tolist()
            query_list.append((box, num_clients))

        else:  # This is the case where num_clients > 100
            for sub_box in split_box(box, purpose="number"):
                shrink_box(sub_box, codes, venue_list, query_list)

    except: # happens if query returns no results
        query_list.append((box, 0))

    return # venue_list and query_list are mutable, no need to return

In [0]:
def GetVenuesByBox(start_box,  # a tuple containing sw & ne corners of box
                   codes):     # a list of Foursquare category strings
    
    start_time = time.time()  # Start timer to show elapsed time

    venue_list = []      # list of venus of interest
    query_list = []      # boxes and number of venues in each, for visualization
    
    print('Requests in progress', end='')  # query_box() prints dots to indicate progress
    
    # if the start box is too large, split it into smaller boxes
    for box in split_box(start_box, purpose="size"):
        # List will contain one or more boxes. Obtain venues for each
        shrink_box(box, codes, venue_list, query_list)
        
    # Add venue data to a dataframe    
    venues = pd.DataFrame(venue_list, columns=['name', 'categories', # 'postalCode', \
                                                  'lat', 'lng', 'state'])
    
    print(f"\n{venues.shape[0]} venues returned from {len(query_list)} successful requests in {(time.time()-start_time):.2f} seconds.")
    return venues, query_list

# 2. Requesting Data from Foursquare

Foursquare allows you to request data by defining a bounding box using latitude & longitude coordinates. I want to request all Asian (including Indian & Pakistani) restaurants in a box drawn around the state of Georgia. 

There are two problems. First, Foursquare has a size limit (about 10,000 square kilometers). Georgia is much larger. Also, Foursquare will only return 100 venues per request (at least with my account type). My code handles these issues by dividing the box into smaller sub-boxes until all boxes are small enough for Foursquare. Then every time a query on a sub-box returns 100 results, the sub-box is split into 4 pieces and a new query is run on each one. This process repeats until all queries return <100 results. Any queries with 100 results are thrown away to avoid duplication.

Let's begin by drawing a large box around Georgia and requesting data. We will cover parts of the adjacent states but we will drop any rows that aren't in Georgia later.

In [0]:
# Define the southwest and northeast corners of a box around the state of Georgia

fsw = (30.357851, -85.605165)  # Southwest corner, Georgia USA
fne = (35.000659, -80.839729)  # Northeast corner

In [10]:
# The following venue categories come from Foursquare. Note that each 
# category has sub-categories (for example, "Korean Restaurant" in "Asian").
restaurant_codes =  ['4bf58dd8d48988d142941735'   # Asian Restaurant
                    ]  
# Request the data
df_venues, query_list = GetVenuesByBox((fsw,fne), restaurant_codes)

Requests in progress......................................................................................................................................................
4033 venues returned from 120 successful requests in 65.11 seconds.


## Data gathering process

Let's see how the code split the data request. Each box in the picture below represents a Foursquare query that returned 99 or fewer results. The shade of the box shows the number of returned values (you can click on the box to see the number). This is NOT a picture of the data -- we will see that in much more detail later.

In [11]:
# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]

# Find the maximum number of venues in any square (to calculate fill opacity)
max_clients = max([query[1] for query in query_list])

# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)

plugins.ScrollZoomToggler().add_to(map_georgia)
    
for box in query_list:
    sw, ne = box[0][0], box[0][1]
    opacity = box[1]/max_clients   # Clients in this box / max across all boxes          
    folium.Rectangle([sw, ne], 
                     popup=str(box[1]), color="black", opacity=0.99, fill=True, 
                     fill_color='orange', weight = 1, 
                     fill_opacity=(opacity)).add_to(map_georgia)
    
map_georgia

In the picture above you can see we started with a 6x5 grid, for 30 requests, to satisfy the size limit. After throwing away all queries returning 100 results (the max limit), how many successful queries did we end up with?

In [14]:
# Number of queries returning <100 results
len(query_list)

120

To get to a total of 120, boxes were split (120 - 30)/3 = 30 times. Data from those boxes were  thrown away, meaning 30 queries were wasted. The total number of queries to get my data from Georgia was 120 +  30 = 150. 

However, the smallest boxes are about 2 square miles in size. Georgia covers almost 60,000 square miles. **Over 25,000 queries would be needed to get this data with equal-sized boxes.** So this method is a bit better than that.

It generally takes 60-70 seconds to get the data in this way.

# 3. Data Cleaning 

Now let's look at the data we've obtained, and get rid of any venues we don't need for our analysis

In [16]:
print("Number of restaurants in data set: ",df_venues.shape[0],"\n\n")
df_venues.head()

Number of restaurants in data set:  4033 




Unnamed: 0,name,categories,lat,lng,state
0,Super Canton Chinese Restaurant,Chinese Restaurant,30.951445,-85.516435,FL
1,Fortune Cookie,Chinese Restaurant,30.783262,-85.24828,FL
2,New Star Chinese Restaurant,Asian Restaurant,30.754914,-85.549341,FL
3,King House,Asian Restaurant,30.442998,-85.05437,FL
4,Panda Buffet,Chinese Restaurant,30.774442,-85.223421,FL


My bounding-box approach obtained restaurants in parts of adjacent states. Let's get rid of those.

In [17]:
# Which states are included in the data set?
print(list(set(df_venues.state)))

['Alabama', 'SC', 'TN', 'FL', 'AL', 'GA', 'South Carolina', 'Georgia', 'Florida']


In [18]:
# Get rid of restaurants in other states and count the results.
df_georgia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]
print("Number of restaurants Georgia: ",df_georgia.shape[0],"\n\n")

Number of restaurants Georgia:  3115 




That's better. In theory, this approach would prevent duplication in the data set, but I always like to be sure. Let's check for duplicate venues.

In [19]:
print("Before dropping duplicates: ",df_georgia.shape[0])
df_georgia = df_georgia.drop_duplicates(keep='first')
print("After dropping duplicates: ",df_georgia.shape[0])

Before dropping duplicates:  3115
After dropping duplicates:  3115


Foursquare did a good job of giving us a unique data set. I also looked at the data and confirmed that all rows contain values for restaurant name, categories, lat, and lng. Now let's look at the restaurant categories. Did we only get the ones we wanted?

In [20]:
df_table = df_georgia[['categories','name']].groupby('categories').count().sort_values(by=['name'], ascending=False)
df_table.index.names, df_table.columns = ['Category'], ['Number']
df_table

Unnamed: 0_level_0,Number
Category,Unnamed: 1_level_1
Chinese Restaurant,1068
Asian Restaurant,666
Japanese Restaurant,455
Sushi Restaurant,246
Korean Restaurant,213
Thai Restaurant,178
Vietnamese Restaurant,93
Noodle House,32
Ramen Restaurant,14
Bakery,9


Convenience store? Italian restaurant? Garden? American restaurant? Clearly we need to get rid of some of these venues. In the cell below, I go through the list of categories and define a list of ones we want to get rid of. I generated the list using this command:

    sorted(list(set(df_georgia['categories'])))

In many cases, I used a command like the following to see which restaurants were in a suspicious category: 

     df_georgia[df_georgia['categories'].isin(['Taco Place'])]

You can see my comments in the lines below.

In [0]:
bad_list = ['American Restaurant',
# 'Asian Restaurant',
# 'BBQ Joint',  # Three venues, mixed Asian
 'Bakery',
 'Bar',
 'Breakfast Spot',
# 'Bubble Tea Shop',
# 'Buffet',   # One, Japanese
 'Café',
# 'Cantonese Restaurant',
 'Caribbean Restaurant',
# 'Chinese Restaurant',
 'Cocktail Bar',
 'Coffee Shop',
 'Deli / Bodega',
# 'Dim Sum Restaurant',
 'Dumpling Restaurant',  # Two restaurants, both Korean, both permanently closed
 'Fast Food Restaurant',
# 'Filipino Restaurant',
 'Food Court',
# 'Food Truck',
# 'Fried Chicken Joint',  # One, Korean
# 'Garden',   # One instance. Chinese restaurant.
 'Gas Station',
 'Grocery Store',
# 'Hot Dog Joint',  # One instance, Korean fried hotdogs. Must. Try. This.
 'Hotel Bar',
# 'Hotpot Restaurant',
# 'Indian Chinese Restaurant',
# 'Indian Restaurant',
# 'Indonesian Restaurant',  # Four instances. Roll into "Asian"
# 'Japanese Curry Restaurant',
# 'Japanese Restaurant',
# 'Jiangsu Restaurant',
# 'Karaoke Bar',
# 'Korean Restaurant',
# 'Malay Restaurant',  # Five instances. Roll into "Asian"
 'Mexican Restaurant',
 'Middle Eastern Restaurant',
# 'Mongolian Restaurant',  Roll into "Asian"
# 'Noodle House',
# 'Poke Place',  # Hawaiian, but we'll call it Japanese
# 'Ramen Restaurant',
 'Restaurant',
 'Salon / Barbershop',
# 'Sandwich Place',  # Two. One is Korean, one is not Asian
 'Seafood Restaurant',
# 'Shabu-Shabu Restaurant',
# 'Shanghai Restaurant',
 'Soup Place',   # One instance, permanently closed
 'Sports Bar',
# 'Steakhouse',  # Two. Both Japanese
# 'Supermarket',  # Four. All have restaurants inside
# 'Sushi Restaurant',
# 'Szechuan Restaurant',
# 'Taco Place', # One location "Hankook Taqueria" Korean
# 'Taiwanese Restaurant',
# 'Tea Room',  # One Location, Chinese
# 'Thai Restaurant',
# 'Vietnamese Restaurant',
 'Wings Joint']


Let's see how many restaurants are in the categories we don't want, and drop them from the data frame.

In [22]:
print("Number of restaurants in unwanted categories (dropped): ", df_georgia[df_georgia['categories'].isin(bad_list)].shape[0])
df_georgia = df_georgia[~df_georgia['categories'].isin(bad_list)]
print("Restaurants remaining in dataset: ", df_georgia.shape[0])

Number of restaurants in unwanted categories (dropped):  71
Restaurants remaining in dataset:  3044


Now let's consolidate some categories. Some of the categories remaining in our database (such as Szechuan Restaurant) are sub-categories. For our purposes, we want to roll these up to the top-level category (Szechuan --> Chinese, for example). Other categories roll up because I looked at the restaurants in the category and found that they all should have been put in another category (for example, all Steakhouses in our database are Japanese). Now I will define lists that will help consolidate categories in these cases.

In [0]:
# Define sub-categories that will be consolidated into major categories.
roll_up = [[['Bubble Tea Shop','Cantonese Restaurant','Dim Sum Restaurant','Garden', \
            'Shanghai Restaurant','Szechuan Restaurant', 'Taiwanese Restaurant', \
            'Tea Room', 'Hunan Restaurant'], 'Chinese Restaurant'],
           [['Buffet','Japanese Curry Restaurant', 'Japanese Restaurant', \
            'Ramen Restaurant', 'Shabu-Shabu Restaurant','Steakhouse', \
            'Sushi Restaurant'], 'Japanese Restaurant'],
           [['Fried Chicken Joint','Hot Dog Joint', 'Taco Place'], 'Korean Restaurant'],
           [['Indian Chinese Restaurant','Indian Restaurant','Jiangsu Restaurant'], \
            'Indo-Pak Restaurant'],
           [['BBQ Joint', 'Filipino Restaurant','Food Truck','Hotpot Restaurant',\
            'Indonesian Restaurant','Karaoke Bar','Malay Restaurant', \
            'Mongolian Restaurant', 'Noodle House','Poke Place', \
            'Sandwich Place', 'Supermarket'], 'Asian Restaurant']]

In [0]:
# Consolidate categories

for cat_list in roll_up:
    df_georgia.loc[df_georgia.categories.isin(cat_list[0]), 'categories'] = cat_list[1]

In [25]:
# Let's see how the restaurants are distributed now
df_table = df_georgia[['categories','name']].groupby('categories').count().sort_values(by=['name'], ascending=False)
df_table.index.names, df_table.columns = ['Category'], ['Number']
df_table

Unnamed: 0_level_0,Number
Category,Unnamed: 1_level_1
Chinese Restaurant,1096
Asian Restaurant,731
Japanese Restaurant,724
Korean Restaurant,216
Thai Restaurant,178
Vietnamese Restaurant,93
Indo-Pak Restaurant,6


## South Asian - Data Gathering & Cleaning
So far, we've only uncovered a handful of South Asian (Indian & Pakistani) restaurants. That's because Foursquare has separate categories for them. The restaurant supply company also serves Indian & Pakistani restaurants. Let's repeat the above analysis and merge the data. 

In [28]:
# The following venue categories come from Foursquare. Note some 
# categories have sub-categories (for example, "Korean Restaurant" in "Asian").
restaurant_codes =  ['4bf58dd8d48988d10f941735',   # Indian Restaurant
                     '52e81612bcbc57f1066b79f8']    # Pakistani Restaurant   

# Request the data
df_venues, query_list = GetVenuesByBox((fsw,fne), restaurant_codes)
                     
# Drop restaurants not in Georgia.
df_south_asia = df_venues[df_venues['state'].isin(['GA', 'Georgia'])]

Requests in progress......................................
388 venues returned from 36 successful requests in 13.55 seconds.


In [0]:
# Create a list of categories we DON'T want. 
# List generated using this command: sorted(list(set(df_south_asia['categories'])))
# Categories to KEEP are commented OUT

bad_se = ['Arcade',
 'Bar',
 'Big Box Store',
 'Breakfast Spot',
 'Brewery',
# 'Chaat Place',
 'Chinese Restaurant',
 'Fast Food Restaurant',
# 'Food & Drink Shop',
 'Food Truck',
# 'Gourmet Shop',
 'Grocery Store',
 'Hotel',
# 'Indian Chinese Restaurant',
# 'Indian Restaurant',
# 'Indian Sweet Shop',
 'Italian Restaurant',
 'Mediterranean Restaurant',
# 'North Indian Restaurant',
# 'Pakistani Restaurant',
 'Pizza Place',
 'Sandwich Place',
# 'South Indian Restaurant',
 'Sports Bar',
# 'Tea Room',
 'Vegetarian / Vegan Restaurant']

# Drop all rows with a category we don't want
df_south_asia = df_south_asia[~df_south_asia['categories'].isin(bad_se)]

In [30]:
# Let's look at the number of restaurants by category
df_table = df_south_asia[['categories','name']].groupby('categories').count().sort_values(by=['name'], ascending=False)
df_table.index.names, df_table.columns = ['Category'], ['Number']
df_table

Unnamed: 0_level_0,Number
Category,Unnamed: 1_level_1
Indian Restaurant,269
North Indian Restaurant,7
South Indian Restaurant,6
Indian Chinese Restaurant,5
Pakistani Restaurant,4
Chaat Place,3
Food & Drink Shop,1
Gourmet Shop,1
Indian Sweet Shop,1
Tea Room,1


In [31]:
# Consolidate the categories into one - 'Indo-Pak Restaurant'

df_south_asia['categories'] = 'Indo-Pak Restaurant'
df_table = df_south_asia[['categories','name']].groupby('categories').count().sort_values(by=['name'], ascending=False)
df_table.index.names, df_table.columns = ['Category'], ['Number']
df_table

Unnamed: 0_level_0,Number
Category,Unnamed: 1_level_1
Indo-Pak Restaurant,298


Looks good. There may be some mis-labeled restaurants in this dataframe, but we'll look at that after we merge the dataframes together.

In [32]:
# Merge the dataframes
df_georgia = df_georgia.append(df_south_asia)
df_georgia.reset_index(drop=True, inplace=True)
print(df_georgia.shape)


(3342, 5)


In [33]:
# Confirm merger

df_table = df_georgia[['categories','name']].groupby('categories').count().sort_values(by=['name'], ascending=False)
df_table.index.names, df_table.columns = ['Category'], ['Number']
df_table

Unnamed: 0_level_0,Number
Category,Unnamed: 1_level_1
Chinese Restaurant,1096
Asian Restaurant,731
Japanese Restaurant,724
Indo-Pak Restaurant,304
Korean Restaurant,216
Thai Restaurant,178
Vietnamese Restaurant,93


Did performing two queries and merging them introduce duplicates? Let's find out and remove if so.

In [34]:
print("Before dropping duplicates: ",df_georgia.shape[0])
df_georgia = df_georgia.drop_duplicates(keep='first')
df_georgia.reset_index(drop=True, inplace=True)
print("After dropping duplicates: ",df_georgia.shape[0])

Before dropping duplicates:  3342
After dropping duplicates:  3337


Now that we have all the restaurants in one place, let's take a closer look at the data and fix some inaccuracies.

## Fixing Inaccurate Categories

After looking through the data, it's clear that the categorization of the restaurants can be improved. For example, there are plenty of restaurants in the general "Asian" category that have "Japanese" or "Chinese" in their names. There are also some restaurants that are just in the wrong category - again, Chinese restaurants that are categorized as Japanese, etc. Let's see if we can fix this programatically, and with a minimum of manual effort.

I extracted the names of the restaurants in each category, and did a word frequency analysis to find terms that are common and specific to each restaurant type. For example, Vietnamese restaurant names often have these terms:

    vietnamese_terms = ['Pho', 'Saigon', 'Viet', 'Banh Mi']
    
Let's see if we can use those terms to correct some assignment problems in this database.

In [0]:
# Define terms commonly found in restaurant names for each country of origin.

china_terms = ['China', 'Chinese', 'Wok', 'Hong Kong', 'Panda', 'Peking', 
               'Beijing', 'Great Wall']
japan_terms = ['Japanese', 'Tokyo', 'Japan', 'Osaka', 
               'Shogun', 'Fuji', 'Sumo', 'Ichiban', 'Kobe', 'Sakura', 'Ramen', 
               'Teriyaki', 'Ninja', 'Shabu']
korea_terms = ['Korea', 'Gogi']
thailand_terms = ['Thai', 'Bangkok']
vietnam_terms = ['Pho', 'Saigon', 'Viet', 'Banh Mi']
indopak_terms = ['India', 'Bombay', 'Biryani', 'Naan', 'Masala']

term_list = [['Chinese Restaurant',china_terms],
             ['Japanese Restaurant',japan_terms],
             ['Korean Restaurant', korea_terms],
             ['Thai Restaurant', thailand_terms],
             ['Vietnamese Restaurant', vietnam_terms],
             ['Indo-Pak Restaurant', indopak_terms]]

In [0]:
# Use keywords to reassign restaurants into more accurate categories
# Reset index to ensure it is unique and continuous
df_georgia.reset_index(drop=True, inplace = True)
# Iterate through the dataframe
for i in range(df_georgia.shape[0]):
    # For each row, see if any keyword appears in the restaurant name
    # If so, change the category
    for a_term in term_list:
        if any(x in df_georgia.loc[i,'name'] for x in a_term[1]):
            df_georgia.loc[i, 'categories'] = a_term[0]

Our data cleaning is complete. What does the cleaned database look like?

In [37]:
print(df_georgia.shape)
df_table = df_georgia[['categories','name']].groupby('categories').count().sort_values(by=['name'], ascending=False)
df_table.index.names, df_table.columns = ['Category'], ['Number']
df_table

(3337, 5)


Unnamed: 0_level_0,Number
Category,Unnamed: 1_level_1
Chinese Restaurant,1223
Japanese Restaurant,800
Asian Restaurant,476
Indo-Pak Restaurant,295
Korean Restaurant,217
Thai Restaurant,204
Vietnamese Restaurant,122


Notice that there are fewer 'Asian' restaurants and more restaurants in other categories, especially the less common ones. Now that we have our data, let's get an idea of where the restaurants are in the state.

# 4. Visualization
Before we create sales territories from the data, let's see where our restaurants are located.


In [38]:
# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)   

heat_data = [[row['lat'],row['lng']] for index, row in df_georgia.iterrows()]

plugins.HeatMap(heat_data, radius=14).add_to(map_georgia)

# DELETE PRIOR
plugins.ScrollZoomToggler().add_to(map_georgia)
    
map_georgia

The heatmap shows that bulk of restaurants are in the greater Atlanta area, but it also seems to imply that the highest concentrations are outside the area. Further, I see different results depending on when I run this Jupyter notebook -- sometimes Columbus shows up as a major concentration, and other times it's Warner Robins or Savannah. Fortunately, Folium has another way to see thie information.

In [39]:
# Use cluster markers to show distribution of Asian restaurants in the state.
print(f'Number of Asian restaurants in Georgia: {df_georgia.shape[0]}')

# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]

map_georgia = folium.Map(location=center, zoom_start=7,  control_scale=True)

# Create Marker Cluster
all_cluster = plugins.MarkerCluster(control=False)
map_georgia.add_child(all_cluster)

sub_group_dict = {}  # For marker cluster sub-groups

# Create a list of categories, ordered by most-to-least popular
category_list = list(df_georgia.groupby('categories').count().sort_values(by=['name'], ascending=False).index)

# Create a sub-group for each type of restaurant
for category in category_list:
    sub_group_dict[category] = plugins.FeatureGroupSubGroup(all_cluster,category)
    map_georgia.add_child(sub_group_dict[category])

#DELETE SCROLLZOOMTOGGLER PRIOR TO SUBMITTING PROJECT
plugins.ScrollZoomToggler().add_to(map_georgia)

# Assign each restaurant to the appropriate sub-group
for restaurant in df_georgia.iterrows():
    folium.Marker(location = [restaurant[1]['lat'],restaurant[1]['lng']]
                  #, popup=restaurant[1]['name']  # Overloads system to add this
                 ).add_to(sub_group_dict[restaurant[1]['categories']])

# Display layer control
folium.LayerControl(collapsed=False).add_to(map_georgia)
map_georgia

Number of Asian restaurants in Georgia: 3337


This clarifies the picture. Of the roughly 3300 Asian restaurants in the state, over 2400 are in the greater Atlanta area, with a strong concentration in the communities to the Northeast along Interstate 85. The category checkboxes allow us to see how the distribution varies by type of restaurant. While all types are concentrated in the population centers around Atlanta, Chinese and Japanese restaurants are more evenly distributed around the state. They are also the most popular categories.

Now let's divide the potential customer base into sales territories for our restaurant supply company.

# 5. Analysis

We will find our sales territories by using k means clustering. To do that, we must have numeric values for each category we use for the clustering analysis. We use one-hot encoding to convert the "categories" column to a group of columns for each possible value in "categories". 

## One-hot encoding

The restaurant supply company wants to create sales territories based on the physical location and types of restaurants. We have the physical location as latitude and longitude coordinates. (We will look at how to scale this information in the next section.) To use they type of restaurant (such as "Korean Restaurant" or "Chinese Restaurant") in an analysis, we must convert the information into numeric values. We do this by creating a new column for each type of entry in the 'categories' column and assigning a value of 1 when the category matches the column. This is called on-hot encoding.

In [107]:
# Create the one-hot dummy columns as a separate dataframe, 
# then check the sizes of the original and dummies dataframes.

df_georgia_dummies = pd.get_dummies(df_georgia['categories'])
print(df_georgia.shape)
print(df_georgia_dummies.shape)
df_georgia_dummies.head(10)

(3337, 14)
(3337, 7)


Unnamed: 0,Asian Restaurant,Chinese Restaurant,Indo-Pak Restaurant,Japanese Restaurant,Korean Restaurant,Thai Restaurant,Vietnamese Restaurant
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0
5,0,0,0,1,0,0,0
6,0,0,0,1,0,0,0
7,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0
9,0,1,0,0,0,0,0


In [108]:
# Change the column names to make them a bit easier to display

df_georgia_dummies.columns = ['Asian', 'Chinese', 'Indo-Pak',
       'Japanese', 'Korean', 'Thai','Vietnamese']
df_georgia_dummies.head(10)

Unnamed: 0,Asian,Chinese,Indo-Pak,Japanese,Korean,Thai,Vietnamese
0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0
5,0,0,0,1,0,0,0
6,0,0,0,1,0,0,0
7,1,0,0,0,0,0,0
8,1,0,0,0,0,0,0
9,0,1,0,0,0,0,0


In [109]:
# Now add the new columns to the original dataframe

df_georgia[list(df_georgia_dummies.columns)] = df_georgia_dummies
df_georgia.head()

Unnamed: 0,name,categories,lat,lng,state,Asian,Chinese,Indo-Pak,Japanese,Korean,Thai,Vietnamese,cluster,color
0,Yuki Express,Asian Restaurant,30.903404,-84.537571,GA,1,0,0,0,0,0,0,0,green
1,Makan,Asian Restaurant,30.9038,-84.57547,GA,1,0,0,0,0,0,0,0,green
2,Jin's Chinese Buffet,Chinese Restaurant,30.891642,-84.207543,GA,0,1,0,0,0,0,0,0,green
3,Star China Buffet,Chinese Restaurant,30.902765,-84.554162,GA,0,1,0,0,0,0,0,0,green
4,No1 China,Chinese Restaurant,30.886484,-84.205227,GA,0,1,0,0,0,0,0,0,green


In [0]:
# DELETE PRIOR TO PROJECT COMPLETION
# Now let's save this as a csv file.
file_contents = df_georgia.to_csv(index=False)
with(open('georgia.csv', 'w')) as file1:
    file1.write(file_contents)

## Scaling Latitude & Longitude

We will attempt to use both location data (stored as latitude and longitude) and restaurant cuisine type ("Chinese", "Korean", etc.) in our clustering to form sales territories. But how can we do that and ensure that the different types of data are weighted appropriately in our analysis? 

I begin by recognizing that a degree of latitude and a degree of longitude represent different distances. Worldwide, a degree of latitude is about 69 miles. In Georgia, about 50.5 miles separate degrees of longitude. These constants were defined globally in this project (see top of this notebook) and used in my function 'split_box()'. Now we can define x and y coordinates, and we can make each unit represent as many miles as we want. But how many miles should a 1 unit change in x and y represent?

My approach is to ask the sales manager for the restaurant supply company the following question:

"Suppose that a sales person is most familiar with on type of restaurant (for example, Korean). How much further should the sales person go to see a Korean restaurant instead of seeing another type of restaurant that is very close to the sales person's office?"

For this customer, the answer is 60 miles. (This takes into account that a straight-line distance is shorter than the actual driving distance.) Therefore, we will convert latitude and longitude to X and Y, and make each unit equal to 60 miles. In this way, a one-unit change in coordinates will have the same impact on the analysis as a one-unit change in restaurant type. 

Let's try this approach.

In [111]:
# Scale and convert latitude and longitude to consistent X and Y coordinates.

# The following constants are defined at the top of this project 
# MILES_PER_LAT = 69.0   # A degree of latitude is 69 miles. 
# MILES_PER_LNG = 50.5 # A degree of longitude is 50.5 miles on the average in Georgia

miles_per_xy =  60 # Additional distance a sales person will drive to visit customer in preferred cuisine

# Create dataframe for clustering Use "try" statement in case I've done this before
try:
    georgia_clustering = df_georgia.drop(['name','categories','state','lat','lng','cluster', 'color'], 1)
except:
    georgia_clustering = df_georgia.drop(['name','categories','state','lat','lng'], 1)
    
# Scale up/down latitude & longitude to adjust relative importance of driving time.
georgia_clustering['X'] = df_georgia['lat'] * MILES_PER_LAT / miles_per_xy
georgia_clustering['Y'] = df_georgia['lng'] * MILES_PER_LNG / miles_per_xy 

georgia_clustering.head()

Unnamed: 0,Asian,Chinese,Indo-Pak,Japanese,Korean,Thai,Vietnamese,X,Y
0,1,0,0,0,0,0,0,35.538914,-71.152456
1,1,0,0,0,0,0,0,35.53937,-71.184354
2,0,1,0,0,0,0,0,35.525389,-70.874682
3,0,1,0,0,0,0,0,35.53818,-71.166419
4,0,1,0,0,0,0,0,35.519457,-70.872732


Now let's cluster the restaurants and see what we get.

In [113]:
# Set the number of clusters (equal to the number of salespeople). 
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=1).fit(georgia_clustering)

# add cluster labels to dataframe and take a look

df_georgia['cluster']=kmeans.labels_
df_georgia.head()

Unnamed: 0,name,categories,lat,lng,state,Asian,Chinese,Indo-Pak,Japanese,Korean,Thai,Vietnamese,cluster,color
0,Yuki Express,Asian Restaurant,30.903404,-84.537571,GA,1,0,0,0,0,0,0,1,green
1,Makan,Asian Restaurant,30.9038,-84.57547,GA,1,0,0,0,0,0,0,1,green
2,Jin's Chinese Buffet,Chinese Restaurant,30.891642,-84.207543,GA,0,1,0,0,0,0,0,1,green
3,Star China Buffet,Chinese Restaurant,30.902765,-84.554162,GA,0,1,0,0,0,0,0,1,green
4,No1 China,Chinese Restaurant,30.886484,-84.205227,GA,0,1,0,0,0,0,0,1,green


In [0]:
# Create a dictionary to control colors on map
color_dict = {
 0: 'green',
 1: 'red',
 2: 'blue',
 3: 'orange',
 4: 'cyan',
 5: 'pink',
 6: 'yellow',
 7: 'brown'}


In [115]:
# Display sales territories. Use color instead of integer to make it easier to understand map below

df_georgia['color'] = df_georgia['cluster'].apply(lambda x : color_dict[x])

pd.crosstab(df_georgia.categories, df_georgia.color, #colnames = [0, 1, 2, 3, 'Total'],
                       margins=True, margins_name="Total")

color,blue,cyan,green,orange,red,Total
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Asian Restaurant,0,392,0,51,33,476
Chinese Restaurant,901,0,0,174,148,1223
Indo-Pak Restaurant,0,264,0,15,16,295
Japanese Restaurant,0,0,582,129,89,800
Korean Restaurant,0,196,0,11,10,217
Thai Restaurant,0,164,0,21,19,204
Vietnamese Restaurant,0,107,0,8,7,122
Total,901,1123,582,409,322,3337


In [120]:
# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True) 

plugins.ScrollZoomToggler().add_to(map_georgia)

# Plot results. Skip every 8th row due to folium performance limits
for restaurant in df_georgia[df_georgia.index % 8 != 0].iterrows():
    color = color_dict[restaurant[1]['cluster']]
    opacity=0.5
    folium.Circle(radius = 200, color=color, fill_color=color, opacity=opacity, fill_opacity=opacity,
                  location = [restaurant[1]['lat'],restaurant[1]['lng']]).add_to(map_georgia)
map_georgia

In [155]:
# Place markers showing center of each type of restaurant in the North Georgia unequal territories. 

df_ga_north = df_georgia[df_georgia.color.isin(['cyan','green'])]

# Center the map on the markers 
center = (df_ga_north.lat.mean(),df_ga_north.lng.mean())
map_georgia = folium.Map(location=center, zoom_start=13, control_scale=True) 

plugins.ScrollZoomToggler().add_to(map_georgia)

for category in set(df_ga_north.categories):
    location=(df_ga_north[df_ga_north.categories == category].lat.mean(),
              df_ga_north[df_ga_north.categories == category].lng.mean())
    popup = folium.Popup(html=category, show=True, sticky=True, parse_html=True)
    folium.Marker(location = location,
              icon=folium.Icon(color='green'),popup=popup,
             ).add_to(map_georgia)


map_georgia

In [121]:
df_georgia.lat.mean()

33.54460334839832

In [124]:
df_georgia[df_georgia.categories == "Chinese Restaurant"].lat.mean()

33.47028726433757

In [144]:
help(folium.Popup)

Help on class Popup in module folium.map:

class Popup(branca.element.Element)
 |  Create a Popup instance that can be linked to a Layer.
 |  
 |  Parameters
 |  ----------
 |  html: string or Element
 |      Content of the Popup.
 |  parse_html: bool, default False
 |      True if the popup is a template that needs to the rendered first.
 |  max_width: int for pixels or text for percentages, default '100%'
 |      The maximal width of the popup.
 |  show: bool, default False
 |      True renders the popup open on page load.
 |  sticky: bool, default False
 |      True prevents map and other popup clicks from closing.
 |  
 |  Method resolution order:
 |      Popup
 |      branca.element.Element
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, html=None, parse_html=False, max_width='100%', show=False, sticky=False)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  render(self, **kwargs)
 |      Renders the HTML representation 

# READ CSV FILES HERE

In [0]:
df_south_asia = pd.read_csv('https://raw.githubusercontent.com/JamesDCage/Final-Week-0/master/south_asian.csv')
df_south_asia.shape

(296, 6)

In [0]:
df_georgia = pd.read_csv('https://raw.githubusercontent.com/JamesDCage/Final-Week-0/master/georgia.csv')
df_georgia.shape

(3288, 12)

# BLANK LINES FOR TESTING

In [0]:
# Create a dictionary to control colors on map

color_dict = {'Asian Restaurant':'blue',
 'Chinese Restaurant':'red',
 'Indo-Pak Restaurant': 'orange',
 'Japanese Restaurant':'green',
 'Korean Restaurant': 'cyan',
 'Thai Restaurant': 'yellow',
 'Vietnamese Restaurant': 'black'}


In [0]:
# Experiment in tryhing to put markers on map for restaurants. Too many markers, doesn't show up clearly. 

# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True) 

plugins.ScrollZoomToggler().add_to(map_georgia)

# Skip every 8th row due to Folium performance limits
for restaurant in df_georgia[df_georgia.index % 8 != 0].iterrows():
    folium.Circle(radius = 200, color = color_dict[restaurant[1]['categories']], 
                  opacity = 0.5, fill_color = color_dict[restaurant[1]['categories']], fill_opacity = 0.25,
                  location = [restaurant[1]['lat'],restaurant[1]['lng']]).add_to(map_georgia)
    
    
map_georgia

In [0]:
# SHOW DENSITY AS BOX COLOR
# Basically, nothing shows up but my neighborhood.

# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]

# Find the maximum number of venues in any square (to calculate fill opacity)
max_clients = max([query[1] for query in query_list])
min_box = min([(query[0][0][0]-query[0][1][0])*(query[0][0][1]-query[0][1][1]) for query in query_list])

# Visualize all queries on a map, with fill color indicating number of restaurants
map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)

plugins.ScrollZoomToggler().add_to(map_georgia)
    
for box in query_list:
    sw, ne = box[0][0], box[0][1]
    # Calculate area
    area = (ne[0]-sw[0])*(ne[1]-sw[1])
#     opacity = box[1]/max_clients   # Clients in this box / max across all boxes
    opacity = (box[1] * min_box)/(max_clients * area)
    folium.Rectangle([sw, ne], 
                     popup=str(box[1]), color="black", opacity=0.99, fill=True, 
                     fill_color='red', weight = 1, 
                     fill_opacity=(opacity)).add_to(map_georgia)
    
map_georgia

In [0]:
# CLUSTER MARKERS THAT AREN'T SEPARATD BY CATEGORY

# Use cluster markers to show distribution of Asian restaurants in the state.
print(f'Number of Asian restaurants in Georgia: {df_georgia.shape[0]}')

# Center the map on the middle of the bounding box defined above 
center = [(fsw[0]+fne[0])/2, (fsw[1]+fne[1])/2]

map_georgia = folium.Map(location=center, zoom_start=7, control_scale=True)   

#DELETE SCROLLZOOMTOGGLER PRIOR TO SUBMITTING PROJECT
plugins.ScrollZoomToggler().add_to(map_georgia)

all_cluster = plugins.MarkerCluster().add_to(map_georgia)

for restaurant in df_georgia.iterrows():
    folium.Marker(location = [restaurant[1]['lat'],restaurant[1]['lng']]
                  #, popup=restaurant[1]['name']
                 ).add_to(all_cluster)
    
map_georgia

# New Section