# 1. Introduction

## 1.1. Problem Statement


We are small investors planning to open a restaurant in Ho Chi Minh City. To choose the best location and scale for our restaurant, we consider the following criteria:
- Competitiveness: A neighborhood is considered less competitive if (i) it does not have too many restaurants or (ii) its restaurants are not highly rated on Foursquare.

- Neighborhood's population: The bigger the population, the better, as we have a larger pool of potential customers. We do not want to operate our restaurant in suburban areas, where there are few competitors but also only a small pool of potential customers.

- The restaurant tier (cheap, medium or expensive restaurant): This depends on the neighborhood's preference. We cannot open a highly expensive restaurant in a low-to-middle-income neighborhood, and vice versa.

## 1.2. Data

Based on the criteria pre-defined above, we need the following data to perform the task:

- List of suggested venues from Foursquare API containing (i) venue's coordinates, (ii) venue's rating, and (iii) venue's tier.

- Population at district level.


Data source:

- Foursquare: Use regular calls to get suggested venues and their locations. Use premium calls to get each venue's rating and tier.

- Citypopulation.de: Used to get the population at district level.

## 1.3. Methodology

We will use a combination of clustering models and map visualization to do the analyses.

In [1]:
import folium
import json
# import matplotlib.cm as cm
# import matplotlib.colors as colors
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# import seaborn as sns
import requests
from sklearn.cluster import KMeans

# Lift the pandas row limit so displayed DataFrames are not truncated.
pd.set_option('display.max_rows', None)

The section below is copied from https://cocl.us/coursera_capstone_notebook project. Credit to the unknown author.

In [2]:
# (latitude, longitude) of the point used as the city centre for the grid and as the
# initial location of every map in this notebook.
hcm_center = (10.762622, 106.660172)

In [3]:
import pyproj

import math

def lonlat_to_xy(lon, lat):
    """Project WGS84 longitude/latitude onto UTM zone 48 and return ``(x, y)``.

    ``lon`` and ``lat`` are handed to ``pyproj.transform`` unchanged.
    """
    wgs84 = pyproj.Proj(proj='latlong', datum='WGS84')
    utm_48 = pyproj.Proj(proj="utm", zone=48, datum='WGS84')
    projected = pyproj.transform(wgs84, utm_48, lon, lat)
    x_coord, y_coord = projected[0], projected[1]
    return x_coord, y_coord

def xy_to_lonlat(x, y):
    """Convert UTM zone 48 coordinates back to WGS84 and return ``(lon, lat)``.

    ``x`` and ``y`` are handed to ``pyproj.transform`` unchanged.
    """
    wgs84 = pyproj.Proj(proj='latlong', datum='WGS84')
    utm_48 = pyproj.Proj(proj="utm", zone=48, datum='WGS84')
    geographic = pyproj.transform(utm_48, wgs84, x, y)
    longitude, latitude = geographic[0], geographic[1]
    return longitude, latitude

def calc_xy_distance(x1, y1, x2, y2):
    """Return the straight-line distance between ``(x1, y1)`` and ``(x2, y2)``."""
    delta_x, delta_y = x2 - x1, y2 - y1
    squared_length = delta_x*delta_x + delta_y*delta_y
    return math.sqrt(squared_length)

In [4]:
# City centre in projected (Cartesian) coordinates; hcm_center is (lat, lon),
# while lonlat_to_xy expects (lon, lat).
hcm_center_x, hcm_center_y = lonlat_to_xy(hcm_center[1], hcm_center[0])

k = math.sqrt(3) / 2  # ratio of row spacing to column spacing in the hexagonal grid
x_step = 600
y_step = 600 * k
x_min = hcm_center_x - 6000
# Shift the first row so the int(21/k) rows are centred vertically on the city centre.
y_min = hcm_center_y - 6000 - (int(21/k)*k*600 - 12000)/2

# Parallel lists: one entry per grid point that is kept.
latitudes, longitudes, distances_from_center, xs, ys = [], [], [], [], []

for i in range(int(21/k)):
    y = y_min + i * y_step
    # Even rows are shifted by half a column step to obtain the hexagonal layout.
    x_offset = 0 if i % 2 else 300
    for j in range(21):
        x = x_min + j * x_step + x_offset
        distance_from_center = calc_xy_distance(hcm_center_x, hcm_center_y, x, y)
        # Discard grid points farther than 6001 projected units from the centre.
        if not distance_from_center <= 6001:
            continue
        lon, lat = xy_to_lonlat(x, y)
        latitudes.append(lat)
        longitudes.append(lon)
        distances_from_center.append(distance_from_center)
        xs.append(x)
        ys.append(y)

In [82]:
# Visualise the sampling grid: a marker on the city centre and one circle of
# radius 300 around every grid point.
map_hcm = folium.Map(location=hcm_center, tiles='CartoDB dark_matter', zoom_start=13)

center_marker = folium.Marker(hcm_center, popup='HCM')
center_marker.add_to(map_hcm)

for lat, lon in zip(latitudes, longitudes):
    grid_circle = folium.Circle([lat, lon], radius=300, color='blue', fill=False)
    grid_circle.add_to(map_hcm)

# Leave the map as the last expression so the notebook renders it.
map_hcm

In [6]:
def get_exploring_urls(client_id, client_secret, version, latitidues, longitudes, category, radius=600, limit=100, time='any', day='any'):
    """Build one Foursquare ``venues/explore`` URL per (latitude, longitude) pair.

    Args:
        client_id: Foursquare client id placed in the query string.
        client_secret: Foursquare client secret placed in the query string.
        version: Value of the ``v`` query parameter.
        latitidues: Iterable of latitudes. The misspelt parameter name is kept
            so existing keyword callers keep working.
        longitudes: Iterable of longitudes, paired with ``latitidues`` by position.
        category: Value of the ``categoryId`` query parameter.
        radius: Value of the ``radius`` query parameter.
        limit: Value of the ``limit`` query parameter.
        time: Value of the ``time`` query parameter.
        day: Value of the ``day`` query parameter.

    Returns:
        A list of URL strings, one per coordinate pair (``zip`` stops at the
        shorter of the two inputs).
    """
    # Pair the coordinates passed in by the caller. The previous body zipped a
    # global named ``latitudes`` and silently ignored the argument.
    return [
        f'https://api.foursquare.com/v2/venues/explore?&client_id={client_id}&client_secret={client_secret}&v={version}&ll={lat},{lng}&categoryId={category}&radius={radius}&limit={limit}&time={time}&day={day}'
        for lat, lng in zip(latitidues, longitudes)
    ]


def get_venue_details_urls(client_id, client_secret, version, venue_ids):
    """Build one Foursquare ``venues/<id>`` URL per venue id.

    Args:
        client_id: Foursquare client id placed in the query string.
        client_secret: Foursquare client secret placed in the query string.
        version: Value of the ``v`` query parameter.
        venue_ids: Iterable of venue ids.

    Returns:
        A list of URL strings in the same order as ``venue_ids``.
    """
    # Use the caller's ``version``; it was previously ignored in favour of a
    # hard-coded 20200101.
    return [
        f'https://api.foursquare.com/v2/venues/{venue_id}?client_id={client_id}&client_secret={client_secret}&v={version}'
        for venue_id in venue_ids
    ]


def get_venue_tips_urls(client_id, client_secret, version, venue_ids, limit=500):
    """Build one Foursquare ``venues/<id>/tips`` URL per venue id.

    Args:
        client_id: Foursquare client id placed in the query string.
        client_secret: Foursquare client secret placed in the query string.
        version: Value of the ``v`` query parameter.
        venue_ids: Iterable of venue ids.
        limit: Value of the ``limit`` query parameter.

    Returns:
        A list of URL strings in the same order as ``venue_ids``.
    """
    # The ``&`` between client_id and client_secret was missing, which fused the
    # two parameters into a single malformed value.
    return [
        f'https://api.foursquare.com/v2/venues/{venue_id}/tips?client_id={client_id}&client_secret={client_secret}&v={version}&limit={limit}'
        for venue_id in venue_ids
    ]


def get_venue_menu_urls(client_id, client_secret, version, venue_ids):
    """Build one Foursquare ``venues/<id>/menu`` URL per venue id.

    Args:
        client_id: Foursquare client id placed in the query string.
        client_secret: Foursquare client secret placed in the query string.
        version: Value of the ``v`` query parameter.
        venue_ids: Iterable of venue ids.

    Returns:
        A list of URL strings in the same order as ``venue_ids``.
    """
    # Interpolate the id. The path used to contain the literal text "venue_id",
    # so every generated URL was identical.
    return [
        f'https://api.foursquare.com/v2/venues/{venue_id}/menu?client_id={client_id}&client_secret={client_secret}&v={version}'
        for venue_id in venue_ids
    ]


def request_api(urls):
    """Send a GET request to each URL in order and return the decoded JSON bodies."""
    payloads = []
    for target in urls:
        response = requests.get(target)
        payloads.append(response.json())
    return payloads


def check_errors(api_results):
    """Collect the API results whose ``meta.code`` is not 200.

    Args:
        api_results: Sequence of decoded responses, each exposing
            ``result['meta']['code']``.

    Returns:
        A list of ``(index, result)`` tuples, one per failed result, in input order.
    """
    # ``errors`` is a flat list. The old ``errors[version].append`` referenced an
    # undefined name and would have raised on the first failed result.
    return [
        (idx, result)
        for idx, result in enumerate(api_results)
        if result['meta']['code'] != 200
    ]


def remedy_errors(api_results, urls, errors, tries=5):
    """Re-request failed URLs until none fail or the retry budget is spent.

    ``errors`` holds entries whose first element is the index of the failed
    result. Each retried response overwrites ``api_results`` at that index, so
    the list is modified in place and also returned.
    """
    pending = errors
    attempts_left = tries
    while len(pending) > 0 and attempts_left > 0:
        for failed in pending:
            position = failed[0]
            retry_url = urls[position]
            api_results[position] = requests.get(retry_url).json()

        # Re-scan everything to find what is still failing after this round.
        pending = check_errors(api_results)
        attempts_left -= 1

    return api_results


def get_venues(api_results):
    """Flatten successful explore responses into a single list of venue dicts.

    Results whose ``meta.code`` is not 200 are skipped. Only the first entry of
    ``response.groups`` is read.
    """
    return [
        entry['venue']
        for payload in api_results
        if payload['meta']['code'] == 200
        for entry in payload['response']['groups'][0]['items']
    ]


def get_distinct_venues(venues):
    """Drop venues whose ``id`` has already been seen, keeping first occurrences.

    Args:
        venues: Iterable of venue dicts, each with an ``'id'`` key.

    Returns:
        A list with the first venue for every distinct id, in input order.
    """
    # A set gives constant-time membership tests; the previous list lookup made
    # the whole pass quadratic in the number of venues.
    seen_ids = set()
    distinct_venues = []
    for venue in venues:
        venue_id = venue['id']
        if venue_id in seen_ids:
            continue
        seen_ids.add(venue_id)
        distinct_venues.append(venue)

    return distinct_venues


def get_venue_coordinates(venues):
    """Return a ``(lat, lng)`` tuple for every venue, read from ``venue['location']``."""
    coordinates = []
    for venue in venues:
        place = venue['location']
        coordinates.append((place['lat'], place['lng']))

    return coordinates


def get_venues_tier(venues_details):
    """Extract the price tier from each venue-details response.

    Args:
        venues_details: Iterable of decoded venue-details responses.

    Returns:
        A list aligned with the input holding
        ``response.venue.price.tier``, or ``None`` where that path is absent.
    """
    venue_tier = []
    for venue in venues_details:
        try:
            tier = venue['response']['venue']['price']['tier']
        except (KeyError, TypeError):
            # Missing key or a non-dict level (e.g. an error payload). Anything
            # else, such as KeyboardInterrupt, is no longer swallowed.
            tier = None

        venue_tier.append(tier)

    return venue_tier


def get_venues_likes(venues_details):
    """Extract the like count from each venue-details response.

    Args:
        venues_details: Iterable of decoded venue-details responses.

    Returns:
        A list aligned with the input holding
        ``response.venue.likes.count``, or ``None`` where that path is absent.
    """
    venue_likes = []
    for venue in venues_details:
        try:
            likes = venue['response']['venue']['likes']['count']
        except (KeyError, TypeError):
            # Missing key or a non-dict level (e.g. an error payload). Anything
            # else, such as KeyboardInterrupt, is no longer swallowed.
            likes = None

        venue_likes.append(likes)

    return venue_likes


def get_venues_rating(venues_details):
    """Extract the rating from each venue-details response.

    Args:
        venues_details: Iterable of decoded venue-details responses.

    Returns:
        A list aligned with the input holding ``response.venue.rating``,
        or ``None`` where that path is absent.
    """
    venue_rating = []
    for venue in venues_details:
        try:
            rating = venue['response']['venue']['rating']
        except (KeyError, TypeError):
            # Missing key or a non-dict level (e.g. an error payload). Anything
            # else, such as KeyboardInterrupt, is no longer swallowed.
            rating = None

        venue_rating.append(rating)

    return venue_rating

In [7]:
import os

# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before running any cell that calls the API; the cells that only load the cached
# JSON files do not need them.
# NOTE(review): the key pair previously hard-coded in this cell was committed in
# clear text and should be revoked/rotated.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
versions = '20200101'  # Foursquare API version
version = versions  # alias: some cells refer to `version` instead of `versions`

In [66]:
# food_category = '4d4b7105d754a06374d81259'

# urls_food = get_exploring_urls(client_id, client_secret, versions, latitudes, longitudes, food_category)
# results_food = request_api(urls_food)
# errors_food = check_errors(results_food)
# results_food = remedy_errors(results_food, urls_food, errors_food, tries=10)
# print('Errors count:', len(check_errors(results_food)))

In [251]:
# venues_food = get_distinct_venues(get_venues(results_food))

# with open('results_food.json', 'w') as fp:
#     json.dump(results_food, fp)

# with open('venues_food.json', 'w') as fp:
#     json.dump(venues_food, fp)

In [None]:
# nightlife_category = '4d4b7105d754a06376d81259'

# urls_nightlife = get_exploring_urls(client_id, client_secret, versions, latitudes, longitudes, nightlife_category)
# results_nightlife = request_api(urls_nightlife)
# errors_nightlife = check_errors(results_nightlife)
# results_nightlife = remedy_errors(results_nightlife, urls_nightlife, errors_nightlife, tries=10)
# print('Errors count:', len(check_errors(results_nightlife)))

In [252]:
# venues_nightlife = get_distinct_venues(get_venues(results_nightlife))

# with open('results_nightlife.json', 'w') as fp:
#     json.dump(results_nightlife, fp)

# with open('venues_nightlife.json', 'w') as fp:
#     json.dump(venues_nightlife, fp)

In [8]:
# Restore the cached API payloads from the JSON files in the working directory.

with open('results_food.json') as results_file:
    results_food = json.load(results_file)

with open('venues_food.json') as venues_file:
    venues_food = json.load(venues_file)

with open('venues_details_food.json') as details_file:
    venues_details_food = json.load(details_file)

In [648]:
# start_idx = len(venues_details_food)
# end_idx = start_idx + 500

# temp_venues_id_food = [venue['id'] for venue in venues_food[start_idx:end_idx]]
# temp_venues_details_urls_food = get_venue_details_urls(client_id, client_secret, version, temp_venues_id_food)
# temp_venues_details_food = request_api(temp_venues_details_urls_food)
# venues_details_food.extend(temp_venues_details_food)

# with open('venues_details_food.json', 'w') as fp:
#     json.dump(venues_details_food, fp)

In [705]:
# for idx, venue in enumerate(venues_details_food):
#     if venue['meta']['code'] != 200:
#         venue_id = venues_food[idx]['id']
#         temp_url = get_venue_details_urls(client_id, client_secret, version, [venue_id])
#         temp_api_result = request_api(temp_url[0])
#         venue = temp_api_result[0]

In [11]:
from folium.plugins import MarkerCluster

# Plot every venue in `venues_food`, adding the markers through a MarkerCluster layer.
venues_coor_food = get_venue_coordinates(venues_food)
coordinates = list(venues_coor_food)

map_hcm = folium.Map(location=hcm_center,  tiles='CartoDB dark_matter', zoom_start=12)

marker_cluster = MarkerCluster()
marker_cluster.add_to(map_hcm)

for lat_long in coordinates:
    venue_marker = folium.Marker(lat_long)
    venue_marker.add_to(marker_cluster)

# Leave the map as the last expression so the notebook renders it.
map_hcm

In [12]:
# Pull the per-venue attributes out of the cached details responses. Each list is
# aligned with `venues_details_food` and holds None where the attribute is missing.
venues_tier = get_venues_tier(venues_details_food)
venues_likes = get_venues_likes(venues_details_food)
venues_rating = get_venues_rating(venues_details_food)

In [13]:
venues_id_food = [venue['id'] for venue in venues_food]

# Split the (lat, lng) pairs into two parallel arrays.
venue_coords_food = get_venue_coordinates(venues_food)
transposed_venue_coords_food = np.array(venue_coords_food).transpose()
lats = transposed_venue_coords_food[0]
longs = transposed_venue_coords_food[1]

# The tier/likes/rating lists cover only the venues whose details have been
# downloaded so far. Take the row count from them instead of the former
# hard-coded 967, so the frame still builds after the details cache grows.
n_detailed = len(venues_tier)
x, y = lonlat_to_xy(longs[:n_detailed], lats[:n_detailed])

venues_df = pd.DataFrame({
    'venue_id': venues_id_food[:n_detailed],
    'latitude': lats[:n_detailed],
    'longitude': longs[:n_detailed],
    'x': x,
    'y': y,
    'tier': venues_tier,
    'likes': venues_likes,
    'rating': venues_rating
})
venues_df.set_index('venue_id', inplace=True)
venues_df.describe()

Unnamed: 0,latitude,longitude,x,y,tier,likes,rating
count,967.0,967.0,967.0,967.0,917.0,964.0,482.0
mean,10.755178,106.674134,683062.519029,1189409.0,1.635769,9.637967,6.821162
std,0.014433,0.027073,2959.167782,1600.158,0.630681,22.827373,0.735305
min,10.709257,106.604434,675429.144665,1184310.0,1.0,0.0,5.2
25%,10.744547,106.655931,681081.461884,1188216.0,1.0,1.0,6.2
50%,10.758977,106.681192,683841.640981,1189835.0,2.0,4.0,6.8
75%,10.767364,106.694843,685320.795592,1190759.0,2.0,9.0,7.3
max,10.77612,106.715883,687631.440877,1191687.0,4.0,376.0,8.7


In [14]:
# Keep only venues with a complete feature vector. `dropna()` returns a new,
# independent frame, which avoids the SettingWithCopyWarning raised by an
# in-place drop (and by later column assignments) on a selection of `venues_df`.
X = venues_df[['x', 'y', 'tier', 'likes', 'rating']].dropna()
X.describe()

Unnamed: 0,x,y,tier,likes,rating
count,458.0,458.0,458.0,458.0,458.0
mean,684495.510505,1190100.0,1.689956,18.144105,6.809607
std,1760.267409,1302.629,0.637922,30.181453,0.736331
min,676327.034915,1186546.0,1.0,2.0,5.2
25%,683788.077658,1189688.0,1.0,6.0,6.2
50%,684839.22483,1190605.0,2.0,10.0,6.8
75%,685781.353972,1191021.0,2.0,18.75,7.3
max,687285.104009,1191336.0,4.0,376.0,8.7


In [15]:
from sklearn.preprocessing import StandardScaler

# Standardise only the five feature columns. Later cells add cluster-label
# columns to `X`; selecting the features explicitly keeps those labels out of
# the scaled matrix if this cell is re-run afterwards.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X[['x', 'y', 'tier', 'likes', 'rating']])
scaled_X

array([[ 0.33916722, -2.70640178,  0.48655252, -0.53548642, -2.18837353],
       [ 0.5919823 , -2.70827797, -1.08275067, -0.43597894, -0.69284642],
       [-0.91401902, -2.59166323,  0.48655252,  0.29374262,  0.12289564],
       ...,
       [ 1.07954859,  0.87828517,  0.48655252, -0.33647145, -1.50858848],
       [ 1.02405546,  0.93435972,  0.48655252, -0.4691481 , -1.37263147],
       [ 1.06649321,  0.90104961,  0.48655252, -0.43597894, -1.64454549]])

In [60]:
from sklearn.cluster import DBSCAN

# Cluster the standardised features with DBSCAN and store the label of each row on `X`.
db = DBSCAN(eps=0.3, min_samples=4)
db.fit(scaled_X)
X['db_labels'] = db.labels_

# Per-cluster summary statistics of the original (unscaled) features.
db_feature_stats = X.groupby('db_labels')[['x', 'y', 'tier', 'likes', 'rating']].describe()
db_feature_stats.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y,tier,likes,rating
db_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-1,count,392.0,392.0,392.0,392.0,392.0
-1,mean,684392.314506,1190003.0,1.681122,19.910714,6.845918
-1,std,1863.538736,1344.365,0.665443,32.26742,0.747978
-1,min,676327.034915,1186546.0,1.0,2.0,5.2
-1,25%,683486.008848,1189498.0,1.0,7.0,6.3
-1,50%,684742.429024,1190488.0,2.0,11.0,6.8
-1,75%,685849.629752,1190998.0,2.0,21.0,7.3
-1,max,687285.104009,1191336.0,4.0,376.0,8.7
0,count,4.0,4.0,4.0,4.0,4.0
0,mean,686057.911358,1187789.0,2.0,6.75,6.15


In [62]:
from sklearn.cluster import KMeans

# Cluster the standardised features into 6 groups. `n_init` is pinned to 10
# because its default changed in newer scikit-learn releases; leaving it implicit
# emits a FutureWarning or yields different clusters depending on the version.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=0).fit(scaled_X)
X['kmeans_labels'] = kmeans.labels_

# Per-cluster summary statistics of the original (unscaled) features.
X.groupby('kmeans_labels')[['x', 'y', 'tier', 'likes', 'rating']].describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,x,y,tier,likes,rating
kmeans_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,count,92.0,92.0,92.0,92.0,92.0
0,mean,684825.800823,1190669.0,1.0,14.163043,6.468478
0,std,918.622056,523.8263,0.0,13.987653,0.525593
0,min,682706.517526,1189381.0,1.0,2.0,5.2
0,25%,684125.097934,1190436.0,1.0,6.0,6.1
0,50%,684672.014708,1190777.0,1.0,9.0,6.4
0,75%,685392.341016,1191113.0,1.0,16.5,6.925
0,max,686710.785568,1191311.0,1.0,81.0,7.5
1,count,117.0,117.0,117.0,117.0,117.0
1,mean,684871.905555,1190752.0,1.897436,28.247863,7.607692


In [20]:
# Bring latitude/longitude from `venues_df` back onto the clustered rows (both
# frames are indexed by venue_id) so the clusters can be drawn on a map.
joined_X = X.join(venues_df[['latitude', 'longitude']], how='left')

In [48]:
# from folium.plugins import BeautifyIcon

# Map of the DBSCAN result: venues labelled 2 are drawn in orange, all others in grey.
coordinates = joined_X[['latitude', 'longitude']].values.copy()
labels = joined_X['db_labels'].copy()

map_hcm = folium.Map(location=hcm_center,  tiles='CartoDB dark_matter', zoom_start=14)

clustered_venues = folium.map.FeatureGroup()

# `labels` is indexed by venue_id, so rows are paired by position through the
# underlying array. `labels[idx]` with an integer key relied on pandas'
# deprecated positional fallback for non-integer indexes.
for (lat, lng), label in zip(coordinates, labels.values):
    color = 'orange' if label == 2 else 'grey'

    clustered_venues.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=5,  # marker size
            color=None,  # no outline; only the fill is drawn
            fill=True,
            fill_color=color,
            fill_opacity=0.6
        )
    )

# Add the venue layer to the map; as the last expression it is what the cell displays.
map_hcm.add_child(clustered_venues)

In [63]:
# Map of the KMeans result: each venue is coloured by its cluster via `colors_set`.
coordinates = joined_X[['latitude', 'longitude']].values.copy()
labels = joined_X['kmeans_labels'].copy()
# Cluster label -> marker colour. Edit this mapping to highlight another cluster.
colors_set = {
    0: 'grey',
    1: 'blue',
    2: 'grey',
    3: 'grey',
    4: 'grey',
    5: 'grey',
    6: 'grey',
    7: 'grey',
}

map_hcm = folium.Map(location=hcm_center,  tiles='CartoDB dark_matter', zoom_start=14)

clustered_venues = folium.map.FeatureGroup()

# `labels` is indexed by venue_id, so rows are paired by position through the
# underlying array. `labels[idx]` with an integer key relied on pandas'
# deprecated positional fallback for non-integer indexes.
for (lat, lng), label in zip(coordinates, labels.values):
    clustered_venues.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=5,  # marker size
            color=None,  # no outline; only the fill is drawn
            fill=True,
            fill_color=colors_set[int(label)],
            fill_opacity=0.6
        )
    )

# Add the venue layer to the map; as the last expression it is what the cell displays.
map_hcm.add_child(clustered_venues)

In [80]:
# Map of venue ratings: >= 8 dark green, >= 7.5 green, >= 7 light green, otherwise grey.
coordinates = joined_X[['latitude', 'longitude']].values.copy()
rating = joined_X['rating'].copy()

map_hcm = folium.Map(location=hcm_center,  tiles='CartoDB dark_matter', zoom_start=14)

# Outline of the 6,000-radius search area and a marker on its centre.
folium.Circle(hcm_center, tooltip='radius 6,000', radius=6000, color='blue', fill=False).add_to(map_hcm)
folium.Marker(hcm_center, tooltip='City Centroid').add_to(map_hcm)
clustered_venues = folium.map.FeatureGroup()

# `rating` is indexed by venue_id, so rows are paired by position through the
# underlying array. `rating[idx]` with an integer key relied on pandas'
# deprecated positional fallback for non-integer indexes.
for (lat, lng), venue_rating in zip(coordinates, rating.values):
    if venue_rating >= 8:
        color = 'darkgreen'
    elif venue_rating >= 7.5:
        color = 'green'
    elif venue_rating >= 7:
        color = 'lightgreen'
    else:
        color = 'grey'

    clustered_venues.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=5,  # marker size
            color=None,  # no outline; only the fill is drawn
            fill=True,
            fill_color=color,
            fill_opacity=0.6
        )
    )

# Add the venue layer to the map; as the last expression it is what the cell displays.
map_hcm.add_child(clustered_venues)