Import required packages

In [4]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
from geopy import distance
import folium
import requests
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

Import Foursquare credentials from json file created to preserve secrecy and store in global variables CLIENT_ID and CLIENT_SECRET

In [5]:
import json
globals().update(json.loads(open("foursquareCredentials.json", "r") .read()))
VERSION = '20181124' # Foursquare API version

## Input the variables that define the problem
Get longitude and latitude of location where we want to open a new business

In [6]:
geolocator = Nominatim(user_agent="specify_your_app_name_here")
location = geolocator.geocode("Rumbach Sebestyén u., Budapest, 1075 Hungary")
#location = geolocator.geocode("Galley Ave, Toronto")
#location = geolocator.geocode("st dunstans st, canterbury uk")
lat = location.latitude
long = location.longitude

Populate variables containing the other parameters of the search problem:
* categoryId = foursquare categoryId of the business we wish to open. A full list of categories can be found here: https://developer.foursquare.com/docs/resources/categories.
* radius = the maximum distance from the location within which we want to open the business in km.
* tick = the level of accuracy in km which we want to return for the recommended locations. 

In [7]:
categoryId = '4bf58dd8d48988d16d941735' # Café
radius = 2
tick = 0.2

## Build function which pulls venue data from Foursquare
Define function for pulling list of nearby venues, with optional filter on category Id

In [19]:
url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&intent=browse&ll={},{}&radius={}&categoryId={}'
apiCall = url.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, long, radius, categoryId)
venues = requests.get(apiCall).json()['response']['venues']
json.loads(venues)

TypeError: the JSON object must be str, bytes or bytearray, not 'list'

In [8]:
url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&intent=browse&ll={},{}&radius={}&categoryId={}'
def getNearbyVenues(lat, long, radius, categoryId = ''):
    apiCall = url.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, long, radius, categoryId)
    venues = requests.get(apiCall).json()['response']['venues']
    venues_list=[]
    
    for v in venues:
        if len(v['categories']) > 0:
            venues_list.append([v['id'], v['name'], v['location']['distance'], v['categories'][0]['name']])
        else:
            venues_list.append([v['id'], v['name'], v['location']['distance'], 'None'])
    
    return(venues_list)

Check function getNearbyVenues is working

In [9]:
getNearbyVenues(lat, long, tick * 1000, categoryId)

[['510c25ede4b0a3ee4308e94a', 'Központ', 188, 'Café'],
 ['4c9ce18c542b224bbaade49f', 'Drum Cafe', 100, 'Café'],
 ['52054dd511d2778ebd4f1cf3', 'Blue Bird Cafe', 171, 'Café'],
 ['514d9e95e4b01bb2769654a3', 'Solinfo', 67, 'Café'],
 ['4c8e63f5a8de224be0182701', 'Café Vian', 182, 'French Restaurant'],
 ['5b9a6c1d1ffe97002c3349c7', 'Fresh Corner Downtown', 187, 'Café'],
 ['54535798498ef0cd9c592e6e', 'Blue Bird Roastery', 137, 'Café'],
 ['4b804281f964a520d36130e3', 'Spinoza Café', 87, 'Restaurant'],
 ['4b99441cf964a520be6e35e3', 'Printa Café', 111, 'Art Gallery'],
 ['5152e6f0e4b02b6799b2ef78', 'Tom Dixon Cafè & Showroom', 62, 'Café'],
 ['5bda0a2bb3c961002badd1c6', 'Kárpit Café', 128, 'Café'],
 ['57c41471498e814e459823ff', 'Blue Bird Cafe', 130, 'Café'],
 ['4c20cfe93573c9b602ea3515', 'Café Zenit', 148, 'Pub'],
 ['52495a1e11d224c2f94c5759', 'Alexandra Irodalmi Kávéház', 101, 'Café'],
 ['52b198d6498e62e5200cc830', 'Hinta Kávézó', 129, 'Café'],
 ['4e08828bd22d658532ac1d6b', 'Random Point', 187, '

## Build function to calculate density of venues within a specified radius of a location
Define function venueDensity which outputs a measure the density of a list of venues within a specific radius of a location.
The density measure has the property of being larger the more venues there are in the list and the closer they are.
So for each venue we subtract the distance of the venue from the location from the radius.
This measure is then summed up for all the venues

Worked example, radius = 200m, 2 venues, 1 distance 1m from the location, the other distance 199m:
* venue 1 density = 200 - 1 = 199
* venue 2 density = 200 - 199 = 1
* total density = 199 + 1 = 200

In [1]:
def venueDensity(venues, radius):
    distances = []
    [distances.append(radius - v[2]) for v in venues if v[2] < radius]
    return np.sum(np.asarray(distances))

In [2]:
venueDensity(getNearbyVenues(lat, long, tick * 1000, categoryId), tick * 1000)

NameError: name 'getNearbyVenues' is not defined

In [3]:
# Check if density for empty list of venues = 0
venueDensity([], tick * 1000)

NameError: name 'tick' is not defined

## Get venue density for each point on the grid around location

In [None]:
targets = []
ids = set()
for (lt, lg) in g:
    print((lt, lg))
    venues = getNearbyVenues(lt, lg, tick * 1000, categoryId)
    density = venueDensity(venues, tick * 1000)
    targets.append((lt, lg, density))
    for (v_id, name, distance, category) in venues:
        ids.add(v_id)

In [None]:
targetsDf = pd.DataFrame(targets)
targetsDf.columns = ['Latitude', 'Longitude', 'Density']
targetsDf.set_index(['Latitude', 'Longitude'], inplace=True)
targetsDf.sort_index(inplace=True)
targetsDf

In [None]:
targetsDf.describe()

In [None]:
targetsDf.plot(y='Density', kind='box')
plt.ylabel('Density')
plt.show()

In [None]:
targetsDf['Log Density'] = np.log10(targetsDf['Density'].values + 1)

In [None]:
targetsDf.describe()

In [None]:
targetsDf.plot(y='Log Density', kind='box')
plt.ylabel('Log Density')
plt.show()

In [None]:
ids

In [None]:
otherVenues = []
for (lt, lg) in g:
    print((lt, lg))
    venues = getNearbyVenues(lt, lg, tick * 1000)
    for (v_id, name, distance, category) in venues:
        if not (v_id in ids) and category != 'None' and distance < tick * 1000 :
            otherVenues.append((lt, lg, v_id, name, distance, category))

In [None]:
otherVenuesDf = pd.DataFrame(otherVenues)
otherVenuesDf.columns = ['Latitude', 'Longitude', 'VenueID', 'Name', 'Distance', 'Category']
otherVenuesDf.set_index(['Latitude', 'Longitude'], inplace=True)
otherVenuesDf.sort_index(inplace=True)
otherVenuesDf

In [None]:
otherVenuesDf['Density'] = (tick * 1000) - otherVenuesDf['Distance']
otherVenuesDf

In [None]:
features = pd.pivot_table(otherVenuesDf,index=['Latitude', 'Longitude'],values='Density',
                          columns='Category',aggfunc=[np.sum],fill_value=0)['sum']

In [None]:
features.info()

In [None]:
features

In [None]:
features.describe()

In [None]:
features = np.log10(features + 1)

In [None]:
features.describe()

In [None]:
features = features.join(targetsDf).fillna(0).reset_index()

In [None]:
features.describe()

In [None]:
X = features.drop(['Latitude', 'Longitude', 'Density', 'Log Density'], axis=1)
y = features['Log Density']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=22)

In [None]:
reg_all = linear_model.LinearRegression()
reg_all.fit(X_train, y_train)
y_pred_test = reg_all.predict(X_test)
y_pred_train = reg_all.predict(X_train)
reg_all.score(X_test, y_test)

In [None]:
testDf = y_test.to_frame()
testDf['Predicted'] = y_pred_test
testDf.plot(kind='box')
plt.show()

In [None]:
testDf.describe()

In [None]:
plt.scatter(y_test, y_pred_test, color='blue', label='Test')
plt.scatter(y_train, y_pred_train, color='red', label='Training')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend(loc='upper left')
plt.show()

In [None]:
lasso = linear_model.Lasso(alpha=0.005, normalize=True)
lasso.fit(X_train, y_train)
lasso_pred_test = lasso.predict(X_test)
lasso_pred_train = lasso.predict(X_train)
lasso.score(X_test, y_test)

In [None]:
testDf = y_test.to_frame()
testDf['Predicted'] = lasso_pred_test
testDf.plot(kind='box')
plt.show()

In [None]:
plt.scatter(y_test, lasso_pred_test, color='blue', label='Test')
plt.scatter(y_train, lasso_pred_train, color='red', label='Training')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.legend(loc='upper left')
plt.show()

In [None]:
lasso_pred_all = lasso.predict(X)

In [None]:
plt.scatter(y, lasso_pred_all, color='green')
plt.show()

In [None]:
len(lasso_pred_all)

In [None]:
results = features[['Latitude', 'Longitude', 'Log Density']]

In [None]:
results['Predicted'] = lasso_pred_all

In [None]:
plt.scatter(results['Log Density'], results['Predicted'], color='green')
plt.show()

In [None]:
results['Rank'] = results['Predicted'] / (results['Log Density'] + 1)

In [None]:
top10 = results.sort_values(by='Rank', ascending=False).head(10)
top10

In [None]:
l = list(zip(top10['Latitude'], top10['Longitude']))
l

In [None]:
# create map
map_grid = folium.Map(location=[lat, long], zoom_start=13)

for (lt, lg) in l:
    folium.CircleMarker([lt, lg],
                        radius=4,
                        fill=True,
                        fill_opacity=0.5,
                        color='Red').add_to(map_grid)
       
map_grid