<h1>Using location data to find profitable locations to set up Healthy Food Centers in New York City</h1>

### Import necessary Libraries

In [1]:
import requests # library to handle requests
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# function for transforming a JSON document into a pandas dataframe
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          90 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##

### Define Foursquare credentials and API version

In [2]:
import json

# read the Foursquare API credentials from a local JSON file
with open('foursquare_credentials.json') as f:
    data = json.load(f)

CLIENT_ID = data['CLIENT_ID']          # Foursquare ID
CLIENT_SECRET = data['CLIENT_SECRET']  # Foursquare Secret

VERSION = '20180604'  # sent as the `v` query parameter of each API call
LIMIT = 30            # sent as the `limit` query parameter of each API call

### We take the city of New York as our city for analysis

In [3]:
address = 'New York'

# resolve the address to coordinates that anchor every venue search below
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; fail here with a clear message
# instead of an AttributeError on `.latitude`
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

LIMIT = 10000 # limit of number of venues returned by Foursquare API (replaces the earlier LIMIT = 30)
radius = 32187 # define radius - taking radius of New York as 20 miles (32187 meters)

40.7127281 -74.0060152


In [4]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must expose the venue's category list under 'categories' or,
        failing that, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### Get the venues which contain 'Gym' in their name

In [5]:
search_query = 'Gym'

# Foursquare venue-search endpoint; the placeholders are filled positionally
url_template = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'
url_values = (CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

# create URL
url = url_template.format(*url_values)

In [6]:
# Fetch the gym search results. The timeout (seconds) keeps a stalled connection
# from hanging the notebook, and raise_for_status() surfaces HTTP errors here
# rather than as a confusing KeyError when the JSON is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

#### Format the data

In [7]:
# assign relevant part of JSON to venues
gym_venues = results['response']['venues']

# transform venues into a dataframe (nested fields become dotted column names)
gym_dataframe = json_normalize(gym_venues)

# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in gym_dataframe.columns if col.startswith('location.')] + ['id']
# take an explicit copy so the column assignment below works on an independent
# frame and cannot trigger pandas' SettingWithCopyWarning
gym_dataframe_filtered = gym_dataframe.loc[:, filtered_columns].copy()

# reduce each venue's category list to the name of its first category
gym_dataframe_filtered['categories'] = gym_dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
gym_dataframe_filtered.columns = [column.split('.')[-1] for column in gym_dataframe_filtered.columns]

gym_dataframe_filtered.head(5)

Unnamed: 0,name,categories,address,crossStreet,lat,lng,labeledLatLngs,distance,postalCode,cc,city,state,country,formattedAddress,neighborhood,isFuzzed,isServiceAreaBusiness,id
0,Gym @ Barclay Tower,Gym,10 Barclay St,btwn Church & Broadway,40.71236,-74.009429,"[{'label': 'display', 'lat': 40.71236, 'lng': ...",290,10007,US,New York,NY,United States,"[10 Barclay St (btwn Church & Broadway), New Y...",,,,4c60755a54ac0f47b19db521
1,New York by Gehry Gym,Gym,8 Spruce St.,,40.710655,-74.005709,"[{'label': 'display', 'lat': 40.71065538319971...",232,10038,US,New York,NY,United States,"[8 Spruce St., New York, NY 10038, United States]",,,,4e9b03e59a52edbd658ca490
2,Gym @ Tribeca Tower,Gym,105 Duane St,btwn Broadway & Church,40.715839,-74.006289,"[{'label': 'display', 'lat': 40.71583931369093...",347,10007,US,New York,NY,United States,"[105 Duane St (btwn Broadway & Church), New Yo...",,,,4bd6d61d4e32d13a7b25c380
3,200 Water St Gym,Gym,200 Water St,John street,40.707358,-74.004768,"[{'label': 'display', 'lat': 40.70735832018749...",606,10038,US,New York,NY,United States,"[200 Water St (John street), New York, NY 1003...",,,,4bb9d3e798c7ef3bb2573202
4,The Little Gym of Tribeca,Athletics & Sports,124 Hudson St,Ericsson Pl.,40.720673,-74.008253,"[{'label': 'display', 'lat': 40.72067269870378...",904,10013,US,New York,NY,United States,"[124 Hudson St (Ericsson Pl.), New York, NY 10...",,,,52f27a2b11d2ce2cd8c5d6ca


### Get the venues which contain 'Yoga' in their name

In [8]:
search_query = 'Yoga'

# Foursquare venue-search endpoint; the placeholders are filled positionally
url_template = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'
url_values = (CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)

# create URL
url = url_template.format(*url_values)

In [9]:
# Fetch the yoga search results. The timeout (seconds) keeps a stalled connection
# from hanging the notebook, and raise_for_status() surfaces HTTP errors here
# rather than as a confusing KeyError when the JSON is unpacked later.
yoga_response = requests.get(url, timeout=30)
yoga_response.raise_for_status()
yoga_results = yoga_response.json()

#### Format the data

In [10]:
# assign relevant part of JSON to venues
yoga_venues = yoga_results['response']['venues']

# transform venues into a dataframe (nested fields become dotted column names)
yoga_dataframe = json_normalize(yoga_venues)

# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in yoga_dataframe.columns if col.startswith('location.')] + ['id']
# take an explicit copy so the column assignment below works on an independent
# frame and cannot trigger pandas' SettingWithCopyWarning
yoga_dataframe_filtered = yoga_dataframe.loc[:, filtered_columns].copy()

# reduce each venue's category list to the name of its first category
yoga_dataframe_filtered['categories'] = yoga_dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
yoga_dataframe_filtered.columns = [column.split('.')[-1] for column in yoga_dataframe_filtered.columns]

yoga_dataframe_filtered.head(1)

Unnamed: 0,name,categories,address,crossStreet,lat,lng,labeledLatLngs,distance,postalCode,cc,city,state,country,formattedAddress,neighborhood,id
0,Kula Yoga Project,Yoga Studio,28 Warren St,Church St,40.714342,-74.008094,"[{'label': 'display', 'lat': 40.71434229015808...",251,10007,US,New York,NY,United States,"[28 Warren St (Church St), New York, NY 10007,...",,4ad79413f964a520610c21e3


### Collate Gym and Yoga venues together and prepare required dataframe

In [12]:
# Stack the gym and yoga venues into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which is deprecated and removed in pandas 2.0;
# sort=False keeps the columns in their existing order.
df = pd.concat([gym_dataframe_filtered, yoga_dataframe_filtered], ignore_index=True, sort=False)

# Keep only the fields used for clustering and map labels. reset_index returns a
# new frame, so later in-place edits (e.g. inserting cluster labels) do not
# operate on a slice of df.
df_filtered = df[['name', 'address', 'lat', 'lng']].reset_index(drop=True)
df_filtered.head(2)

Unnamed: 0,name,address,lat,lng
0,Gym @ Barclay Tower,10 Barclay St,40.71236,-74.009429
1,New York by Gehry Gym,8 Spruce St.,40.710655,-74.005709


### Apply k-means clustering to the venues

In [13]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 7

# Cluster on the coordinates only. Selecting the two columns explicitly (rather
# than dropping 'name' and 'address') keeps this cell re-runnable after the
# 'Cluster Labels' column has been added to df_filtered.
newyork_grouped_clustering = df_filtered[['lat', 'lng']]

# run k-means clustering (random_state fixed so repeated runs on the same data agree)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(newyork_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([6, 6, 6, 6, 6, 6, 6, 6, 6, 3], dtype=int32)

#### Add the clustering labels

In [14]:
# add clustering labels as the first column; DataFrame.insert raises ValueError
# if the column already exists, so overwrite it instead when this cell is re-run
if 'Cluster Labels' in df_filtered.columns:
    df_filtered['Cluster Labels'] = kmeans.labels_
else:
    df_filtered.insert(0, 'Cluster Labels', kmeans.labels_)
df_filtered.head(5)

Unnamed: 0,Cluster Labels,name,address,lat,lng
0,6,Gym @ Barclay Tower,10 Barclay St,40.71236,-74.009429
1,6,New York by Gehry Gym,8 Spruce St.,40.710655,-74.005709
2,6,Gym @ Tribeca Tower,105 Duane St,40.715839,-74.006289
3,6,200 Water St Gym,200 Water St,40.707358,-74.004768
4,6,The Little Gym of Tribeca,124 Hudson St,40.720673,-74.008253


### Putting clusters on map of New York

In [16]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# base map centred on the geocoded city coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one hex color per cluster, evenly spaced
# along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map: one circle per venue, colored by its cluster
for lat, lon, poi, cluster in zip(df_filtered['lat'], df_filtered['lng'], df_filtered['name'], df_filtered['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster labels are 0-based, so they index the color list directly
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### NOTE: Based upon successive runs of the clustering and map generation, the colors of the clusters may vary every time

In [17]:
# tabulate the fitted cluster centres, one row per cluster
centroid_columns = ['lat', 'lng']
centroids = pd.DataFrame(data=kmeans.cluster_centers_, columns=centroid_columns)
centroids

Unnamed: 0,lat,lng
0,40.726148,-73.993372
1,40.708805,-73.957376
2,40.725546,-74.036948
3,40.746637,-73.99456
4,40.766159,-73.962166
5,40.694688,-73.984747
6,40.712741,-74.008928


In [19]:
# overlay each cluster centroid on the existing map as a dark circle
centroid_rows = zip(centroids['lat'], centroids['lng'], centroids.index)
for lat, lon, cluster in centroid_rows:
    coords_text = ' (' + str(lat) + ',' + str(lon) + ')'
    label = folium.Popup('Centroid of Cluster ' + str(cluster) + coords_text, parse_html=True)
    centroid_marker = folium.CircleMarker(
        [lat, lon],
        radius=6,
        popup=label,
        color='#000000',
        fill=True,
        fill_color='#191919',
        fill_opacity=0.7)
    centroid_marker.add_to(map_clusters)

map_clusters