In [None]:
#Target City: Seattle, Washington

In [None]:
#Research Question: What district in Seattle is the best place to build a Vietnamese restaurant?
#Seattle, Washington is fairly diverse, and Asians are the second most populous race in the city, making up 15% of the population according to 2019 population estimates.
#The purpose of this project is to investigate and determine a suitable location to build a new Vietnamese restaurant, ideally one that would attract a more diverse customer base.

#The target audience is Seattle residents and tourists who enjoy Vietnamese food, investors who need to determine whether the restaurant is worth funding, and the business owner who needs to decide where to open.

#To provide the Stakeholders with the necessary information, I will use the U.S.'s 2019 population estimates (The 2021 census data is not published yet) found here: https://www.census.gov/quickfacts/fact/dashboard/seattlecitywashington/PST045219
#There are 18 neighborhoods in Seattle: https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Seattle
#Information about the venues in Seattle will be gathered using the FourSquare API

#I shall generate several maps to illustrate the current distribution of Vietnamese restaurants in Seattle as well as show income and racial demographic data to help determine where the best place to put a Vietnamese restaurant would be


In [None]:
#Importing all the necessary libraries
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift the display limits so DataFrames are shown with every column and every row
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import matplotlib.cm as cm
import matplotlib.colors as colors

import json # library to handle JSON files
import requests

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from sklearn.cluster import KMeans

import requests # library to handle requests
try:
    # pandas >= 1.0 exposes json_normalize at the top level; the pandas.io.json
    # location is deprecated and is no longer importable on pandas 2.x
    from pandas import json_normalize # transform JSON file into a pandas dataframe
except ImportError:
    # older pandas releases only provide the original location
    from pandas.io.json import json_normalize

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!wget - q -O 'seattle_zipcodes.json' https://opendata.arcgis.com/datasets/83fc2e72903343aabff6de8cb445b81c_2.geojson

In [None]:
#I could not find a neat table of data that can be easily scraped to gather this info, so the dataframe has to be created manually.
#Some zip codes have been omitted, as the website included zip codes for Burien and Tukwila, WA
data = { #data retrieved from https://www.zipdatamaps.com/zipcodes-seattle-wa
    'Zip Code': [
        98101, 98102, 98103, 98104, 98105, 98106, 98107, 98108,
        98109, 98112, 98115, 98116, 98117, 98118, 98119, 98121,
        98122, 98125, 98126, 98134, 98136, 98144, 98195, 98199,
    ],
    'Latitude': [
        47.61143900, 47.63537000, 47.67304300, 47.60185300, 47.66153900, 47.54178100,
        47.66792400, 47.54102400, 47.63121000, 47.63398200, 47.68475100, 47.57408300,
        47.68835300, 47.54251500, 47.63945200, 47.61513000, 47.61163700, 47.71670300,
        47.54982000, 47.57729200, 47.53670400, 47.58612200, 47.65419500, 47.65170700,
    ],
    'Longitude': [
        -122.33404000, -122.32368500, -122.34276600, -122.32770700, -122.28114100, -122.35297900,
        -122.37812500, -122.31249400, -122.34560200, -122.28824500, -122.28094200, -122.39511800,
        -122.38101200, -122.26900000, -122.36895400, -122.34675000, -122.29199000, -122.29726700,
        -122.37406600, -122.33732400, -122.39013300, -122.29278800, -122.30038400, -122.40206600,
    ],
    # Income and population are kept as the formatted strings from the source
    # website because they are only used as text in the map pop-ups.
    'Median Income': [
        '$40,466', '$61,467', '$68,534', '$20,374', '$53,688', '$53,174',
        '$59,633', '$68,210', '$71,268', '$89,205', '$80,781', '$79,020',
        '$77,411', '$58,733', '$72,039', '$44,991', '$46,681', '$55,618',
        '$63,189', '$54,646', '$79,346', '$54,642', 'N/A', '$81,257',
    ],
    'Population': [
        '10,238', '20,756', '45,911', '13,095', '43,924', '22,873',
        '21,147', '22,374', '20,175', '21,077', '46,206', '22,241',
        '31,365', '42,731', '21,039', '12,628', '31,454', '37,081',
        '20,698', '644', '14,770', '26,881', '0', '19,686',
    ],
}

# Build the frame column-wise so every column keeps its natural dtype (integer
# zip codes, float coordinates). Going through orient='index' + transpose()
# turned every column into object dtype and would silently pad lists of unequal
# length; this constructor raises if the lists ever get out of sync.
df = pd.DataFrame(data)
df


In [None]:
#Mapping the different districts of Seattle and combining it with JSON data to show district boundaries
map_seattle = folium.Map(location=[47.6062,-122.3321],zoom_start=11)
url = 'https://opendata.arcgis.com/datasets/83fc2e72903343aabff6de8cb445b81c_2.geojson'
response = requests.get(url, timeout=30) # do not hang the notebook forever on a stalled download
response.raise_for_status() # surface an HTTP failure here rather than as a confusing JSON/GeoJSON error later
r = response.json()

#Applying the JSON overlay to show the boundaries of each zip code.
# Map.choropleth() is deprecated and missing from newer folium releases; a GeoJson
# layer with an explicit style gives the same uniform blue overlay (black outline of
# weight 1 was the choropleth default) and works on old and new versions alike.
folium.GeoJson(
    r,
    style_function=lambda feature: {
        'fillColor': 'blue',
        'fillOpacity': 0.2,
        'color': 'black',
        'weight': 1,
        'opacity': 1,
    },
).add_to(map_seattle)

#Adding the label showing zip code, population, and median income of the district
for lat,lng,zc,pop,mi in zip(df['Latitude'],df['Longitude'],df['Zip Code'],df['Population'],df['Median Income']):
    # pop and mi are already formatted strings; the <br /> tags are intentional HTML line breaks
    popup_html = "Zip Code: " + str(zc) + "<br />"+ "Population: " + pop + "<br />" + "Median Income: " + mi
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=popup_html,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_seattle)

map_seattle
#Note that while the following map highlights surrounding zip codes, the areas in Seattle proper have been labelled with a green dot

In [None]:
import os

# SECURITY NOTE: API credentials should not live in notebook source. Set the
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables to supply
# your own; the literals below are kept only as a backward-compatible fallback and
# should be rotated and removed, since they have already been committed in clear text.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'ACI3EHCUKXGHYIET5JH2KYE2ZHQEED20WFRIV2EF1IVCFKV3')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '1VD0L3I3YY0UEC4NKIW44OZTSXF5H1FTB1G5FOLZCBACRGBH')
VERSION = '20180605' #Value sent as the "v" parameter of every FourSquare API request

In [None]:
# Search settings shared with the venue lookups further down
radius = 500
LIMIT = 10000

# Coordinates of the first zip code in df, used for a sample request URL
zipcode_latitude = df.loc[0,'Latitude']
zipcode_longitude = df.loc[0,'Longitude']

# Query-string fields in the order they appear in the request
query_fields = [
    ('client_id', CLIENT_ID),
    ('client_secret', CLIENT_SECRET),
    ('v', VERSION),
    ('ll', '{},{}'.format(zipcode_latitude, zipcode_longitude)),
    ('radius', radius),
    ('limit', LIMIT),
]
url = 'https://api.foursquare.com/v2/venues/explore?&' + '&'.join(
    '{}={}'.format(field, value) for field, value in query_fields
)
# NOTE: the displayed URL contains the client id and secret in clear text
url

In [None]:
#Generating the attraction dataframe from https://visitseattle.org/things-to-do/sightseeing/top-25-attractions/
#Some attractions were omitted due it being located outside of Seattle or not having one central location
attraction_data = {
    'Attraction': [
        'Space Needle',
        'Seattle Center Monorail',
        'Museum of Pop Culture',
        'Pacific Science Center',
        'Chihuly Garden and Glass',
        'Pacific Northwest Ballet at McCaw Hall',
        'Pike Place Market',
        'Seattle Art Museum',
        'Olympic Sculpture Park',
        'Seattle Central Library',
        'Smith Tower',
        'Seattle Aquarium',
        'Seattle Great Wheel',
        'The Museum of Flight',
        'T Mobile Park',
        'Lumen Field',
        'Kerry Park',
        'Kenmore Air',
        'Museum of History and Industry',
    ],
    'Zip Code': [
        98109, 98109, 98109, 98109, 98109, 98109,
        98101, 98101, 98101, 98104, 98104,
        98101, 98101, 98108, 98134, 98134,
        98119, 98109, 98109,
    ],
    'Latitude': [
        47.6205, 47.617004, 47.6215, 47.6190, 47.6206, 47.6240,
        47.6097, 47.6075, 47.6166, 47.6067, 47.6019,
        47.6074, 47.6061, 47.5185, 47.5914, 47.5952,
        47.6295, 47.6286, 47.6275,
    ],
    'Longitude': [
        -122.3493, -122.343506, -122.3481, -122.3516, -122.3504, -122.3503,
        -122.3422, -122.3380, -122.3553, -122.3325, -122.3319,
        -122.3430, -122.3425, -122.2969, -122.3325, -122.3316,
        -122.3599, -122.3398, -122.3366,
    ],
}

# One row per attraction, one column per dictionary key
attractions = pd.DataFrame(attraction_data)
attractions

In [None]:
#Adding the attractions to the existing map
attraction_points = attractions[['Latitude', 'Longitude', 'Attraction']]
for lat, lng, name in attraction_points.itertuples(index=False, name=None):
    # white-ringed dot whose pop-up shows the attraction name
    attraction_marker = folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=name,
        color='white',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    attraction_marker.add_to(map_seattle)

map_seattle

In [None]:
#Creating a function to retrieve the venue list from each district using FourSquare API
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the FourSquare "explore" endpoint around each point and tabulate the venues.

    Parameters
    ----------
    names : iterable
        Label for each search centre (the zip code in this notebook).
    latitudes, longitudes : iterable of float
        Coordinates of each search centre, aligned with ``names``.
    radius : int, default 500
        Search radius passed to the API as the ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Zip Code',
        'Zip Code Latitude', 'Zip Code Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame has these columns
        even when no venue is found. 'Venue Category' is None for a venue
        that comes back without any category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Zip Code',
               'Zip Code Latitude',
               'Zip Code Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)  # progress indicator: one line per search centre

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; time out instead of hanging, and report HTTP
        # failures directly instead of as a KeyError on the missing payload
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # a payload without groups/items is treated as "no venues here"
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # venues without a category are kept, with a missing category
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # naming the columns in the constructor also works for an empty result
    return pd.DataFrame(venue_rows, columns=columns)

In [None]:
# Look up the venues around every zip-code centroid listed in df
seattle_venues = getNearbyVenues(
    names=df['Zip Code'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

In [None]:
#Creating a function to get a count of the most common venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``, skipping its first element.

    The first element of ``row`` is ignored (it holds the zip code in this
    notebook); the remaining values are sorted in descending order and the
    index labels of the first ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
seattle_venue_categories = seattle_venues['Venue Category']
# Keep only the venues whose category is exactly 'Vietnamese Restaurant'
seattle_vietnamese_restaurant = seattle_venues[seattle_venue_categories == 'Vietnamese Restaurant']
# Report the size of the filtered table itself so the printed number always agrees
# with the rows displayed below (a regex substring count can differ from the
# exact-match filter and becomes a float when a category is missing)
print("FourSquare found ", len(seattle_vietnamese_restaurant), " Vietnamese Restaurants in Seattle")
seattle_vietnamese_restaurant

In [None]:
# Draw one pink dot per Vietnamese restaurant; the pop-up shows the venue name
restaurant_points = zip(
    seattle_vietnamese_restaurant['Venue Latitude'],
    seattle_vietnamese_restaurant['Venue Longitude'],
    seattle_vietnamese_restaurant['Venue'],
)
for latitude, longitude, ven in restaurant_points:
    # parse_html=True so the venue name is shown as plain text
    label = folium.Popup('{}'.format(ven), parse_html=True)
    restaurant_marker = folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='pink',
        fill=True,
        fill_color='pink',
        fill_opacity=0.8,
    )
    restaurant_marker.add_to(map_seattle)

map_seattle #Displaying all the Vietnamese restaurants in Seattle, represented by the translucent pink markers

In [None]:
#The eligible districts are: 98101, 98108, 98109, 98119, and 98134
#Displaying the column labels of the attractions dataframe for reference
attractions.columns

In [None]:
#Creating a dataframe of the 5 eligible districts. The eligible districts are considered to have at least one item from the attraction list, but do not have any Vietnamese restaurant
eligible_zip_codes = [98101, 98108, 98109, 98119, 98134]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same
# per-district slices in the same order and keeps the original row labels
common = pd.concat(
    [seattle_venues[seattle_venues['Zip Code'] == zip_code] for zip_code in eligible_zip_codes]
)
common.head()

In [None]:
# `common` holds one row per venue, so every eligible zip code appears many times.
# Query each zip-code centroid exactly once: passing the raw columns repeated the
# same API request for every venue row and duplicated every venue in the result,
# which inflated the per-category counts computed afterwards.
eligible_centroids = common[['Zip Code', 'Zip Code Latitude', 'Zip Code Longitude']].drop_duplicates()
locexplore = getNearbyVenues(names=eligible_centroids['Zip Code'],
                             latitudes=eligible_centroids['Zip Code Latitude'],
                             longitudes=eligible_centroids['Zip Code Longitude']
                            )

In [None]:
#getting a count of the venues within each eligible district
# rows whose category is literally "Zip Code" are dropped (presumably so the
# indicator columns cannot clash with the 'Zip Code' column added below)
locexplore = locexplore.loc[locexplore['Venue Category'] != "Zip Code"]

# one indicator column per venue category, with the zip code placed in front
common_onehot = pd.get_dummies(locexplore[['Venue Category']], prefix="", prefix_sep="")
common_onehot.insert(0, 'Zip Code', locexplore['Zip Code'])

# number of venues of each category per zip code
eligible_grouped = (
    common_onehot
    .groupby('Zip Code')
    .sum()
    .astype(int)
    .reset_index()
)
eligible_grouped

In [None]:
#retrieving a dataframe of the top 10 most common venues within each district
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: 1st, 2nd, 3rd, then 4th, 5th, ...
columns = ['Zip Code']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of a bare except, which would hide any other error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# collect one row per district: the zip code followed by its most common venue categories
top_venue_rows = []
for ind in np.arange(eligible_grouped.shape[0]):
    district = eligible_grouped.iloc[ind, :]
    top_venues = list(return_most_common_venues(district, num_top_venues))
    # pad when fewer than num_top_venues categories exist so every row fits the columns
    top_venues += [None] * (num_top_venues - len(top_venues))
    top_venue_rows.append([district['Zip Code']] + top_venues)

# create a new dataframe
loc_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns)

loc_venues_sorted

In [None]:
#While K-means clustering did not prove useful in the analysis, it can be used to create labels for the eligible districts on a map
kclusters = 5

# keyword form: the positional axis argument of drop() was removed in pandas 2.0
eligible_cluster = eligible_grouped.drop(columns='Zip Code')

# K-means cannot form more clusters than there are districts to cluster
kclusters = min(kclusters, len(eligible_cluster))

# n_init is pinned to the long-standing default of 10 so results do not change
# (and no FutureWarning is raised) across scikit-learn versions
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(eligible_cluster)
kmeans.labels_[0:8]

In [None]:
# Make the cell safe to re-run: insert() raises if the column already exists
if 'Cluster Labels' in loc_venues_sorted.columns:
    loc_venues_sorted = loc_venues_sorted.drop(columns='Cluster Labels')
loc_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and top venues to each eligible district.
# `common` has one row per venue, so after the venue columns are dropped the
# remaining rows are identical within a zip code; keep a single row per district
# so that the map gets one marker per district instead of a stack of duplicates.
seattle_merged = (
    common
    .join(loc_venues_sorted.set_index('Zip Code'), on = 'Zip Code')
    .drop(columns = ['Venue','Venue Latitude','Venue Longitude','Venue Category'])
    .drop_duplicates()
)

In [None]:
# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per district to the map, coloured by its cluster label
for lat, lon, poi, cluster in zip(seattle_merged['Zip Code Latitude'], seattle_merged['Zip Code Longitude'], seattle_merged['Zip Code'], seattle_merged['Cluster Labels']):
    # labels are 0-based, so they index the colour list directly
    # (cluster-1 only worked through negative-index wrap-around for label 0)
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius= 10,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_seattle)

map_seattle

In [None]:
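#Unfortunately, the K-means clustering analysis does not reveal anything useful, most likely because 5 clusters were requested for only 5 eligible districts, which leaves each district in a cluster of its own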

In [None]:
#Adding final marker to indicate the best location to place a Vietnamese Restaurant
best_location = [47.541, -122.312]  # the 98108 zip-code coordinates from the table above, rounded to three decimals
best_location_marker = folium.Marker(best_location)
best_location_marker.add_to(map_seattle)
map_seattle