# Segmenting and Clustering Neighborhoods in Toronto
## An assignment for the Data Science Capstone (Assignment for Week3)

In [464]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import folium
from geopy.geocoders import Nominatim
import requests
import json
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from IPython.core.display import HTML

In [465]:
# Load data from wiki page into dataframe with 3 columns
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### -------- Cleaning --------

In [466]:
# Drop the rows where there isn't an assigned borough.
num_rows_before = df.shape[0]
df.dropna(axis = 0, inplace = True) # drop values that are not available
indeces = df[df['Borough'] == 'Not assigned'].index
df.drop(indeces, inplace=True) # drop rows that have non assigned borough
df.reset_index(inplace = True, drop=True) # 
num_rows_after = df.shape[0]
print(str(num_rows_before - num_rows_after) + ' rows were dropped')
print('Now there are ' + str(num_rows_after) + ' rows.')

77 rows were dropped
Now there are 103 rows.


In [467]:
# Append rows when there is a list of neighbourhoods to be splitted
col_neigh = df['Neighborhood']
for count, rr in enumerate(col_neigh):
    splitted = rr.split(',')
    #print(len(splitted))
    appendthis = pd.DataFrame()
    if(len(splitted) > 1):
        for app in np.arange(1, len(splitted)):
            appendthis['Postal Code'] = [df['Postal Code'][count]]
            appendthis['Borough'] = [df['Borough'][count]]
            appendthis['Neighborhood'] = [splitted[app]]
            df = df.append(appendthis, ignore_index = True)

        df['Neighborhood'][count] = str.strip(splitted[0])
        
print('Now there are ' + str(df.shape[0]) + ' rows. A row has been added for each comma separated neighbour.')

Now there are 217 rows. A row has been added for each comma separated neighbour.


### -------- Adding Lat and Lon Information --------

In [468]:
# Load the lat and lon values for each postal code
geodata = pd.read_csv('http://cocl.us/Geospatial_data')
# Merge the lat and lon onto df's postal code
df = df.merge(geodata, how = 'left', left_on='Postal Code', right_on = 'Postal Code')
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
3,M6A,North York,Lawrence Manor,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


### -------- Get Venues for each Neighbourhood  --------
The below is inspired from the ungraded assignment with NYC as an example

In [469]:
address = 'Toronto'
geolocator = Nominatim(user_agent='foursquare_agent')
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [470]:
# Foursquare Credentials
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180928' # Foursquare API version

In [471]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    df['Latitude'][0], 
    df['Longitude'][0], 
    500, 
    100)

results = requests.get(url).json()#["response"]['groups'][0]#['items']
#results

In [472]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        temp = requests.get(url).json()["response"]
        if (temp['totalResults'] > 0):
            results = temp['groups'][0]['items']
        
            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng,
                v['venue']['id'],
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        else:
            venues_list.append([(
                name, 
                lat, 
                lng,
                'None',
                'None', 
                'None', 
                'None',  
                'None') for v in results])
            
                

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude',
                  'Venue id',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [473]:
# Get venues using Foursquare
radius = 100
LIMIT = 50
toronto_venues = getNearbyVenues(names=df['Neighborhood'], latitudes=df['Latitude'], longitudes=df['Longitude'])

Parkwoods
Victoria Village
Regent Park
Lawrence Manor
Queen's Park
Islington Avenue
Malvern
Don Mills
Parkview Hill
Garden District
Glencairn
West Deane Park
Rouge Hill
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate
Guildwood
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
Thorncliffe Park
Richmond
Dufferin
Scarborough Village
Fairview
Northwood Park
East Toronto
Harbourfront East
Little Portugal
Kennedy Park
Bayview Village
Downsview
The Danforth West
Toronto Dominion Centre
Brockton
Golden Mile
York Mills
Downsview
India Bazaar
Commerce Court
North Park
Humber Summit
Cliffside
Willowdale
Downsview
Studio District
Bedford Park
Del Ray
Humberlea
Birch Cliff
Willowdale
Downsview
Lawrence Park
Roselawn
Runnymede
Weston
Dorset Park
York Mills West
Davisville North
Forest Hill North & West
High Park
Westmount
Wexford
Willowdale
North Toronto West
The Annex
Parkdale
Canada Post Gateway P

In [474]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))
# One-hot encode the venue categories
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
toronto_onehot['Venue id'] = toronto_venues['Venue id'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

There are 257 unique categories.


In [475]:
# Group the one-hot encoded feature matrix.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bathurst Quay,0.0,0.0625,0.0625,0.0625,0.125,0.1875,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Beaumond Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### -------- Kmeans  --------

Clustering neighborhoods into 5 classes depending on the venues as in the NYC example. 

In [476]:
kclusters = 5
features = toronto_grouped.drop('Neighborhood', 1)
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(features)

In [477]:
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
df = toronto_grouped[['Neighborhood', 'Cluster Labels']].merge(df, how = 'left', left_on='Neighborhood', right_on = 'Neighborhood')

### To make sense of the clusters

1. Extracted main categories from the Foursquare base such as Food, Shop&Service, Travel & Transport etc.
2. Assigned the venues one of each of the main categories
3. Assigned a different shade of blue to each category
4. Plotted it onto the map along side clustered neighbourhoods

In [478]:
# Get primary categories defined in Foursquare to assign each venue to it.
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION)

results = requests.get(url).json()["response"]#['groups'][0]#['items']
#print(results["response"]['totalResults'])
out = pd.json_normalize(results)
out2 = pd.DataFrame.from_dict(out.iloc[0][0])
out3 = pd.DataFrame.from_dict(out[['categories']].iloc[0][0])
parent_categories = out3[['categories']]#.iloc[0]

expanded_categories_list = []
for count, cat in enumerate(np.arange(0, len(parent_categories))): # There are 10 main categories!
    category = pd.DataFrame.from_dict(parent_categories.iloc[cat][0])#.iloc[0]
    #print(cat, category, out2['name'][cat])
#     single = pd.DataFrame.from_dict(category[0])
#     single

    for single in category['name']:
        expanded_categories_list.append((single, out2['name'][cat]))

expanded_categories = pd.DataFrame(expanded_categories_list)
expanded_categories.rename(columns={0:'Category', 1:'Main Category'}, inplace=True)
expanded_categories.head()


Unnamed: 0,Category,Main Category
0,Amphitheater,Arts & Entertainment
1,Aquarium,Arts & Entertainment
2,Arcade,Arts & Entertainment
3,Art Gallery,Arts & Entertainment
4,Bowling Alley,Arts & Entertainment


In [479]:
# Let's assign each venue in toronto_venues its main category
toronto_venues = toronto_venues.merge(expanded_categories, how = 'left', left_on='Venue Category', right_on = 'Category')
toronto_venues.dropna(axis = 0, inplace = True) # drop values that are not available

In [480]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue id,Venue,Venue Latitude,Venue Longitude,Venue Category,Category,Main Category
0,Parkwoods,43.753259,-79.329656,4e8d9dcdd5fbbbb6b3003c7b,Brookbanks Park,43.752,-79.3321,Park,Park,Outdoors & Recreation
1,Parkwoods,43.753259,-79.329656,4cb11e2075ebb60cd1c4caad,Variety Store,43.752,-79.3331,Food & Drink Shop,Food & Drink Shop,Shop & Service
3,Victoria Village,43.725882,-79.315572,4f3ecce6e4b0587016b6f30d,Portugril,43.7258,-79.3128,Portuguese Restaurant,Portuguese Restaurant,Food
4,Victoria Village,43.725882,-79.315572,4bbe904a85fbb713420d7167,Tim Hortons,43.7255,-79.3131,Coffee Shop,Coffee Shop,Food
5,Victoria Village,43.725882,-79.315572,4d689350b6f46dcb77ee15b2,The Frig,43.7271,-79.3174,French Restaurant,French Restaurant,Food


In [481]:
# encode the categories as numbers so that it can be used to select a color in map below
toronto_venues["Main_Cat"] = toronto_venues["Main Category"].astype('category')
toronto_venues["Main_Cat_Numerical"] = toronto_venues["Main_Cat"].cat.codes
toronto_venues.drop(["Main_Cat"], axis = 1, inplace = True)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue id,Venue,Venue Latitude,Venue Longitude,Venue Category,Category,Main Category,Main_Cat_Numerical
0,Parkwoods,43.753259,-79.329656,4e8d9dcdd5fbbbb6b3003c7b,Brookbanks Park,43.752,-79.3321,Park,Park,Outdoors & Recreation,4
1,Parkwoods,43.753259,-79.329656,4cb11e2075ebb60cd1c4caad,Variety Store,43.752,-79.3331,Food & Drink Shop,Food & Drink Shop,Shop & Service,6
3,Victoria Village,43.725882,-79.315572,4f3ecce6e4b0587016b6f30d,Portugril,43.7258,-79.3128,Portuguese Restaurant,Portuguese Restaurant,Food,2
4,Victoria Village,43.725882,-79.315572,4bbe904a85fbb713420d7167,Tim Hortons,43.7255,-79.3131,Coffee Shop,Coffee Shop,Food,2
5,Victoria Village,43.725882,-79.315572,4d689350b6f46dcb77ee15b2,The Frig,43.7271,-79.3174,French Restaurant,French Restaurant,Food,2


In [482]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.gray(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers for neighborhoods to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=1).add_to(map_clusters)
    
# set color scheme for the 10 different main venue types
num_categories = len(pd.unique(toronto_venues['Main Category']))
x = np.arange(num_categories)
ys = [i + x + (i*x)**2 for i in range(num_categories)]
colors_array = cm.Blues(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers for neighborhoods to the map
markers_colors = []
for lat, lon, poi, cat in zip(toronto_venues['Venue Latitude'], toronto_venues['Venue Longitude'], toronto_venues['Main Category'], toronto_venues['Main_Cat_Numerical']):
    label = folium.Popup(str(poi) + ' Category ' + str(cat), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cat-1],
        fill=True,
        fill_color=rainbow[cat-1],
        fill_opacity=0.5).add_to(map_clusters)    

       
HTML(map_clusters._repr_html_())

### Observations
1. There are 2 main neighbourhood clusters: Cluster0 which is indicated by white, Cluster1 which is indicated by black
2. Cluster0 seems to be attracted to MORE venues
3. Cluster0 seems to be attrcated to places where there are more shopping&food places.
4. Cluster1 seems to be attracted to the venue class "Outdoors & Recreation".

It may be the case that Kmeans is separating shopping&food&entertainment districts with less-venue-dense districts where there could be more apartments and houses.