# Segmenting and Clustering Neighborhoods in Toronto 3

First, map all the neighborhoods in Toronto.

In [3]:
import os

import folium
import pandas
import requests
from sklearn.cluster import KMeans

# Load the postal-code table; the columns used below are
# 'Latitude', 'Longitude', 'Borough' and 'Neighborhood'.
postCodes_new = pandas.read_csv("postCodes_new.csv")

# Centre the map on the mean coordinate of all rows.
map_toronto = folium.Map(
    location=[postCodes_new["Latitude"].mean(), postCodes_new["Longitude"].mean()],
    zoom_start=11,
)

# Draw one circle marker per row, with a "<neighborhood>, <borough>" popup.
marker_columns = postCodes_new[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    # parse_html=True makes the popup treat the label as text rather than raw HTML.
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

Now fetch up to 100 venues within a 1000-metre radius of each neighborhood centre from the Foursquare API.

In [4]:
# Foursquare credentials are read from the environment so that real keys are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version

# Maximum number of venues requested per neighborhood.
LIMIT = 100

if not (CLIENT_ID and CLIENT_SECRET):
    # Non-fatal so the rest of the notebook can still be inspected offline;
    # API requests will not succeed until both variables are set.
    print('Foursquare credentials missing: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET')

from pandas.io.json import json_normalize

def _venue_row(name, lat, lng, item):
    """Flatten one item of an explore response into a result row, or None if a field is missing."""
    try:
        venue = item['venue']
        return (
            name,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            venue['categories'][0]['name'],
        )
    except (KeyError, IndexError, TypeError):
        # e.g. a venue with an empty 'categories' list: skip this venue only
        return None


def getNearbyVenues(names, latitudes, longitudes, radius=1000, limit=None):
    """Query the Foursquare ``venues/explore`` endpoint around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Neighborhood names and the coordinates of their centres. The three
        iterables are walked in parallel with ``zip``.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter.
    limit : int, optional
        Value sent as the ``limit`` query parameter. When omitted, the
        module-level ``LIMIT`` is used.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        If no venue was collected the frame is empty but still has these columns.

    Notes
    -----
    Uses the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    A neighborhood whose response lacks ``response.groups[0].items`` is
    reported on stdout and skipped. A venue with a missing field is skipped on
    its own, without discarding the rest of its neighborhood. Network and
    JSON-decoding errors from ``requests`` are not caught.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT if limit is None else limit,
        }

        # make the GET request; the timeout keeps one stalled call from hanging the notebook
        payload = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params=params,
            timeout=30,
        ).json()

        try:
            items = payload["response"]['groups'][0]['items']
        except (KeyError, IndexError, TypeError):
            print("groups not found for {}".format(name))
            continue

        # keep only the relevant information for each nearby venue
        for item in items:
            row = _venue_row(name, lat, lng, item)
            if row is not None:
                rows.append(row)

    # Passing columns= here (instead of assigning .columns afterwards) also
    # works when rows is empty.
    return pandas.DataFrame(rows, columns=columns)

# Collect the venues around every neighborhood listed in postCodes_new.
toronto_venues = getNearbyVenues(
    postCodes_new['Neighborhood'],
    postCodes_new['Latitude'],
    postCodes_new['Longitude'],
)

Now turn the venues into a one-hot-encoding of their categories and calculate the mean for each neighborhood

In [5]:
# One indicator column per venue category (column name == category name).
toronto_onehot = pandas.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category may itself be called 'Neighborhood'. Rename that indicator
# so that adding the neighborhood-name column below cannot overwrite it.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood column back to the dataframe, as the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# Mean of each indicator per neighborhood, i.e. the share of that
# neighborhood's venues that fall in each category.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

Cluster the neighborhoods with k-means (k=5), using the mean frequency of each venue category as features

In [12]:
# set number of clusters
kclusters = 5

# Feature matrix: every column of toronto_grouped except the neighborhood name.
# (The keyword form is required; pandas 2.0 no longer accepts a positional axis.)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is pinned so the result does not depend on the installed scikit-learn
# version (10 was the default before the switch to 'auto').
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=3).fit(toronto_grouped_clustering)

# kmeans.labels_ is in the same row order as toronto_grouped
prediction = pandas.DataFrame({
    'Neighborhood': toronto_grouped['Neighborhood'],
    'Cluster Labels': kmeans.labels_,
})

# Left-join the labels onto the postal-code table. Rows whose neighborhood is
# missing from toronto_grouped get NaN from the join and are labelled -1.
postCodesWithLabels = postCodes_new.join(prediction.set_index('Neighborhood'), on='Neighborhood')
postCodesWithLabels['Cluster Labels'] = postCodesWithLabels['Cluster Labels'].fillna(-1).astype("int32")
postCodesWithLabels.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,4
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0


Make a folium plot of the clustered neighborhoods

In [13]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map, centred on the mean coordinate of all postal codes
map_clusters = folium.Map(location=[postCodes_new["Latitude"].mean(), postCodes_new["Longitude"].mean()], zoom_start=11)

# set color scheme for the clusters: kclusters colours spread evenly over the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]
# Rows labelled -1 (no cluster assigned) get a neutral grey, so they cannot be
# mistaken for one of the real clusters.
unclustered_color = '#808080'

# add markers to the map
for lat, lon, poi, cluster in zip(postCodesWithLabels['Latitude'], postCodesWithLabels['Longitude'], postCodesWithLabels['Neighborhood'], postCodesWithLabels['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index by the label itself. Negative labels must not reach the list,
    # because Python would wrap them around to a real cluster's colour.
    marker_color = rainbow[cluster] if cluster >= 0 else unclustered_color
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

One can see that there is a distinct cluster of neighborhoods around the city centre