# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# NOTE(review): the project id and access token are hard-coded in this cell, so
# anyone who can read this file can use them. Consider rotating the token and
# supplying it from outside the notebook before sharing.
from project_lib import Project
project = Project(project_id='7ee2869b-ba45-4d72-ad78-ee37edd4f505', project_access_token='p-70a5843214c3e9f24e5b944614fb7338f72e6186')
# Keep the project's context object under a short name.
pc = project.project_context


## Clustering Toronto boroughs

First, Folium is installed and the required libraries are imported.

In [2]:
!conda install -c conda-forge folium --yes

Fetching package metadata .............
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /opt/conda/envs/DSX-Python35:
#
folium                    0.7.0                      py_0    conda-forge


In [3]:
import pandas as pd
import numpy as np

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


Then, the CSV file containing the postal codes, boroughs, neighborhoods and their coordinates is loaded into a dataframe.

In [4]:
# Fetch the CSV from the project's storage through the `project` handle.
file = project.get_file("TorontoPostalWithLatlong.csv")
# Parse the fetched file into a dataframe.
df_postal = pd.read_csv(file)
# Show the first rows as a quick sanity check of the loaded data.
df_postal.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Now I will use Foursquare to analyze the boroughs and their venues.

In [5]:
# The code was removed by Watson Studio for sharing.

Now I will use the function from the labs to get the nearby venues for every neighborhood.

In [6]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the Foursquare venues found around each given location.

    For every (name, latitude, longitude) triple the Foursquare
    ``venues/explore`` endpoint is queried, and one row is recorded for each
    venue in the response.

    The function relies on the module-level names ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` being defined before it is
    called.

    Args:
        names: Iterable of location labels, stored in the ``Neighborhood``
            column of the result.
        latitudes: Iterable of latitudes, aligned with ``names``.
        longitudes: Iterable of longitudes, aligned with ``names``.
        radius: Value sent as the ``radius`` query parameter.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        When no venue is found at all, an empty frame with these same
        columns is returned.
    """
    columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue with an empty category list gets None instead of
                # raising IndexError and aborting the whole collection.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema intact even
    # when `rows` is empty; assigning `.columns` afterwards would raise
    # ValueError on a zero-column frame.
    return pd.DataFrame(rows, columns=columns)

In [7]:
# Look up the venues around every row of the postal-code table, passing 500
# as the search radius, and preview the first 40 results.
venues = getNearbyVenues(
    names=df_postal['Neighborhood'],
    latitudes=df_postal['Latitude'],
    longitudes=df_postal['Longitude'],
    radius=500,
)
venues.head(40)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
5,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Enterprise Rent-A-Car,43.764076,-79.193406,Rental Car Location
6,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center
7,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Eggsmart,43.7678,-79.190466,Breakfast Spot
8,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop
9,Woburn,43.770992,-79.216917,Tim Hortons,43.770827,-79.223078,Coffee Shop


Well, I love coffee, so now I will keep only the coffee shops from my venues dataframe.

In [8]:
# Keep only the venues whose category is exactly 'Coffee Shop'.
venuesCoffee = venues.loc[venues['Venue Category'] == 'Coffee Shop']

# Count the coffee shops per neighborhood (identified by name and coordinates).
# The count is named with to_frame() rather than the dict-renaming form of
# agg(), which pandas deprecated for Series group-bys and rejects in later
# releases. The result is the same one-column frame.
coffeeFreq = (
    venuesCoffee
    .groupby(['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude'])['Venue Category']
    .count()
    .to_frame(name='Coffee shops')
)
# Display with the grouping keys as ordinary columns; coffeeFreq itself keeps
# its grouped index.
coffeeFreq.reset_index()

is deprecated and will be removed in a future version
  from ipykernel import kernelapp as app


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Coffee shops
0,"Adelaide, King, Richmond",43.650571,-79.384568,1
1,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,1
2,"Alderwood, Long Branch",43.602414,-79.543484,1
3,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259,2
4,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,2
5,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,2
6,"Cabbagetown, St. James Town",43.667967,-79.367675,2
7,Canada Post Gateway Processing Centre,43.636966,-79.615819,2
8,Central Bay Street,43.657952,-79.387383,7
9,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,1


The table above counts how many coffee shops there are in each neighborhood.

## Now I want to cluster the neighborhoods by the number of coffee shops

I want to see on the map which neighborhoods have coffee shops and how many there are in each neighborhood.

Now the k-means algorithm is used to cluster the dataframe entries. I opted to use every entry from the table and not just the ones that have 'Toronto' in the borough field.
Three clusters are used in the algorithm.

In [9]:
# set number of clusters
# Number of groups k-means should produce.
kclusters = 3

# Cluster on the coffee-shop count alone (a single feature column); the fixed
# random_state keeps the assignment reproducible between runs.
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(coffeeFreq[['Coffee shops']])

# Attach each row's cluster label, then turn the grouped index back into
# ordinary columns.
coffeeFreq['Cluster ID'] = kmeans.labels_
coffeeFreq = coffeeFreq.reset_index()
coffeeFreq.head(30)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Coffee shops,Cluster ID
0,"Adelaide, King, Richmond",43.650571,-79.384568,1,0
1,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,1,0
2,"Alderwood, Long Branch",43.602414,-79.543484,1,0
3,"Bathurst Manor, Downsview North, Wilson Heights",43.754328,-79.442259,2,2
4,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,2,2
5,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,2,2
6,"Cabbagetown, St. James Town",43.667967,-79.367675,2,2
7,Canada Post Gateway Processing Centre,43.636966,-79.615819,2,2
8,Central Bay Street,43.657952,-79.387383,7,1
9,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,1,0


Now the map is created, with every neighborhood marked and coloured according to the cluster it was assigned to, based on its number of coffee shops.

In [10]:
# Centre the map on the first neighborhood in the table. `.iloc[0]` selects by
# position, so it does not depend on the index containing the label 0.
latitude = coffeeFreq['Neighborhood Latitude'].iloc[0]
longitude = coffeeFreq['Neighborhood Longitude'].iloc[0]

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster, qty in zip(
        coffeeFreq['Neighborhood Latitude'],
        coffeeFreq['Neighborhood Longitude'],
        coffeeFreq['Neighborhood'],
        coffeeFreq['Cluster ID'],
        coffeeFreq['Coffee shops']):
    label = folium.Popup(str(poi) + ' - ' + str(qty) + " coffee shops", parse_html=True)
    # Cluster labels start at 0, so `cluster - 1` maps label 0 to the last
    # colour through negative indexing. The offset is kept so that marker
    # colours stay the same as before.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

The neighborhoods are clustered according to their number of coffee shops. Clicking on a neighborhood marker shows how many coffee shops there are in that neighborhood.
The red markers have 1 coffee shop in the neighborhood, the green markers have 2 or 3 coffee shops and the purple markers have 4 to 7 coffee shops.