# Assignment Week 3 - Clustering Toronto Neighborhoods

# Part I: Scraping Postcode Information from Wikipedia

Importing Libraries

In [344]:
import pandas as pd
import numpy as np
import requests

#!pip install bs4
from bs4 import BeautifulSoup

I use BeautifulSoup to get the html code of the needed page

In [347]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')

I actually wrote some code on this by myself, but as JupyterLab crashed and my Notebook was deleted, I chose to use the provided code and extended it a bit:

In [348]:
table_contents=[]
table = soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text == 'Not assigned': ##Skipping all Cells that have no Borough assigned
        pass
    else: 
        cell['PostalCode'] = row.p.text[:3] #Selecting the PostCode String
        cell['Borough'] = (row.span.text).split('(')[0] #Selecting the Borough String
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ') #Cleaning the Neighborhood String
        table_contents.append(cell)

Forming this to a Dataframe:

In [349]:
toronto_hoods = pd.DataFrame(table_contents, columns = ['PostalCode', 'Borough', 'Neighborhood'])

In [350]:
toronto_hoods.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Printing the number of rows of the Dataframe:

In [357]:
print('Number of rows: '+ str(toronto_hoods.shape[0]))

Number of rows: 103


<br>
<br>
<br>

# Part II: Retrieving Borough Coordinates


Importing libraries

In [17]:
#!pip install geocoder
import geocoder

I tried to geocode with the following code, however it ran forever without finishing. I therefore chose to read in the csv file provided.

In [None]:
lat_lng_coords = None
latitude = []
longitude = []

for postcode in toronto_hoods['PostalCode']:
    
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng

    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

Here I read in the csv file with the help of wget:

In [12]:
!wget -q -O "toronto_postcodes.csv" https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
postcodes = pd.read_csv("toronto_postcodes.csv")

Merging the Hoods Dataframe with the postcodes dataframe:

In [13]:
toronto_merged = toronto_hoods.merge(postcodes, how = "left", left_on = "PostalCode", right_on = "Postal Code").drop(['Postal Code'], axis = 1)

In [358]:
toronto_merged.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


<br>
<br>
<br>

# Part III: Clustering in Toronto

Setting credentials for Foursquare

In [15]:
CLIENT_ID = 'RUKMAZKNGQ1HS5HKFP5LO4GS5T2GLW54P3ZKQNRGXVHZGI22' # your Foursquare ID
CLIENT_SECRET = '3Q0DM5NMNNGL14GJQAETVEYO0XLKXVB3VFAVSHYXKPZ3MCBQ' # your Foursquare Secret
ACCESS_TOKEN = 'LII3I5BDGRNT5OAZFNMP1KEJDPY2W31TFE3N32ZCQ432OELV' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 100

Transforming the dataframe, so that each Neighborhood gets its own column. For this, I iterate over each row and count the number of Commas in the Neighborhood column. Per Comma, I add one row with the  Neighborhood after the respective Comma. 

In [30]:
toronto_clean = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighborhood'])

for index in toronto_merged.index:
    postcode = toronto_merged.loc[index,'PostalCode']
    borough = toronto_merged.loc[index,'Borough']
    neighborhood = toronto_merged.loc[index,'Neighborhood']
    if ',' in neighborhood:
        for i in range(neighborhood.count(',')+1):
            nh = neighborhood.split(', ')[i].strip(" ")
            toronto_clean = toronto_clean.append({'PostalCode': postcode, 'Borough': borough, 'Neighborhood': nh}, ignore_index = True)
    else:
        toronto_clean = toronto_clean.append({'PostalCode': postcode, 'Borough': borough, 'Neighborhood': neighborhood}, ignore_index = True)


Merging the transformed dataframe with the Latitudes and Longitudes from before

In [23]:
toronto = toronto_clean.merge(toronto_merged[['PostalCode','Latitude','Longitude']], how ='left', on = 'PostalCode', copy = False)

In [363]:
print('Our Dataframe has now {} rows instead of {}.'.format(toronto.shape[0], toronto_hoods.shape[0]))

Our Dataframe has now 216 rows instead of 103.


We now extract all gastronomical keywords from the categories obtained in the NYC Lab, to determine which categories are actually gastronomical.

In [25]:
keywords = ['Restaurant','BBQ','Bagel','Bakery','Bar','Beer','Bistro','Tea','Beakfast','Burger','Joint','Burrito','Café','Coffee','Cocktail','Deli','Bodega','Diner','Fish','Food','Drink','Chicken','Yoghurt','Pub','Gastropub','Club','Pizza','Place','Poke','Salad','Steak', 'Sushi','Wine']

With this code, I retrieve the restaurants from Foursquare for each individual Neighborhood. 

In [31]:
toronto_venues = pd.DataFrame(columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Name', 'Venue Category'])


for name, lat, lng in zip(toronto['Neighborhood'], toronto['Latitude'], toronto['Longitude']):
    print(name)
        
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, 500, LIMIT)
        
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for v in results: 
        toronto_venues = toronto_venues.append({'Neighborhood': name, 'Neighborhood Latitude': lat, 'Neighborhood Longitude': lng, 'Venue Name': v['venue']['name'], 'Venue Category': v['venue']['categories'][0]['name']}, ignore_index = True)

Parkwoods
Victoria Village
Regent Park
 Harbourfront
Lawrence Manor
 Lawrence Heights
Ontario Provincial Government
Islington Avenue
Malvern
 Rouge
Don Mills North
Parkview Hill
 Woodbine Gardens
Garden District
 Ryerson
Glencairn
West Deane Park
 Princess Gardens
 Martin Grove
 Islington
 Cloverdale
Rouge Hill
 Port Union
 Highland Creek
Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate
 Bloordale Gardens
 Old Burnhamthorpe
 Markland Wood
Guildwood
 Morningside
 West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor
 Wilson Heights
 Downsview North
Thorncliffe Park
Richmond
 Adelaide
 King
Dufferin
 Dovercourt Village
Scarborough Village
Fairview
 Henry Farm
 Oriole
Northwood Park
 York University
The Danforth  East
Harbourfront East
 Union Station
 Toronto Islands
Little Portugal
 Trinity
Kennedy Park
 Ionview
 East Birchmount Park
Bayview Village
Downsview East
The Danforth

Here I save the dataframe, in order to not having to retrieve the Foursquare API again. 

In [34]:
toronto_venues.to_csv('toronto_venues.csv')

Check if venue category contains a gastro keyword and store that information in a new column.

In [53]:
for i, cat in enumerate(toronto_venues['Venue Category']):
    c = 0
    for word in keywords: 
        if word.lower() in cat.lower() and c == 0:
            toronto_venues.loc[i,'Gastro'] = 1
            c += 1
    

Creating a new Dataframe that only contains gastro categories

In [55]:
toronto_gastro = toronto_venues[toronto_venues['Gastro'] == 1]

In [366]:
toronto_gastro.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Category,Gastro
2,Parkwoods,43.753259,-79.329656,Variety Store,Food & Drink Shop,1.0
4,Victoria Village,43.725882,-79.315572,Portugril,Portuguese Restaurant,1.0
5,Victoria Village,43.725882,-79.315572,Tim Hortons,Coffee Shop,1.0
6,Victoria Village,43.725882,-79.315572,Pizza Nova,Pizza Place,1.0
7,Regent Park,43.65426,-79.360636,Roselle Desserts,Bakery,1.0


Dummy-coding venue categories

In [57]:
toronto_dummy = pd.get_dummies(toronto_gastro[['Venue Category']], prefix="", prefix_sep="")
toronto_dummy['Neighborhood'] = toronto_gastro['Neighborhood']


In [58]:
toronto_dummy.head()

Unnamed: 0,Airport Food Court,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bar,Beer Bar,Beer Store,Belgian Restaurant,...,Tea Room,Thai Restaurant,Theme Restaurant,Tibetan Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wings Joint,Neighborhood
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Parkwoods
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Victoria Village
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Victoria Village
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Victoria Village
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Regent Park


Creating the final dataframe for which I merge the Dataframe with single Neighborhoods with Postcode and Borough information

In [260]:
toronto_final = toronto[['Neighborhood','PostalCode','Borough']].merge(toronto_gastro[['Neighborhood','Neighborhood Latitude','Neighborhood Longitude']].drop_duplicates(), how = 'left', left_on = 'Neighborhood', right_on = 'Neighborhood', copy = False)
toronto_final = toronto_final.merge(toronto_grouped['Neighborhood'], how = 'right', left_on = 'Neighborhood', right_on = 'Neighborhood')

##Exluding the Neighborhoods that appeared more often in the dataframe.The Neighborhoods that are duplicated are extracted, and then every second entry from the resulting list is exlcuded. This way, only one entry remains, when a Neighborhood occurs twice. This process is repeated two times to account for the case, that a Neighborhood occurs more than twice
toronto_final = toronto_final.drop(list(toronto_final[toronto_final.duplicated(subset=['Neighborhood'],keep=False)].Neighborhood.index)[::2], axis = 0)
toronto_final = toronto_final.drop(list(toronto_final[toronto_final.duplicated(subset=['Neighborhood'],keep=False)].Neighborhood.index)[::2], axis = 0)

In [240]:
toronto_final.head(10)

Unnamed: 0,Neighborhood,PostalCode,Borough,Neighborhood Latitude,Neighborhood Longitude
0,Parkwoods,M3A,North York,43.753259,-79.329656
1,Victoria Village,M4A,North York,43.725882,-79.315572
2,Regent Park,M5A,Downtown Toronto,43.65426,-79.360636
3,Harbourfront,M5A,Downtown Toronto,43.65426,-79.360636
4,Lawrence Manor,M6A,North York,43.718518,-79.464763
5,Lawrence Heights,M6A,North York,43.718518,-79.464763
6,Ontario Provincial Government,M7A,Queen's Park,43.662301,-79.389494
7,Malvern,M1B,Scarborough,43.806686,-79.194353
8,Rouge,M1B,Scarborough,43.806686,-79.194353
9,Don Mills North,M3B,North York,43.745906,-79.352188


Creating a Dataframe for Clustering by grouping the dummy-coded dataframe by Neighborhoods

In [241]:
toronto_grouped = toronto_dummy.groupby('Neighborhood').mean().reset_index()



In [242]:
toronto_grouped.head()

Unnamed: 0,Neighborhood,Airport Food Court,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Bar,Beer Bar,Beer Store,...,Taiwanese Restaurant,Tea Room,Thai Restaurant,Theme Restaurant,Tibetan Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Wings Joint
0,Adelaide,0.0,0.033898,0.016949,0.0,0.0,0.016949,0.016949,0.0,0.0,...,0.0,0.0,0.050847,0.0,0.0,0.0,0.016949,0.0,0.0,0.0
1,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bathurst Quay,0.5,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Beaumond Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bloordale Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Clustering the Neighborhoods based on the relative proportion of venue categories appearing in each Neighborhood

In [62]:
from sklearn.cluster import KMeans
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(toronto_grouped_clustering)

#Shwoing the clustering labels
kmeans.labels_[0:10] 

array([0, 3, 1, 3, 0, 0, 0, 2, 4, 0], dtype=int32)

Inserting the Clustering Labels into my final Dataframe

In [262]:

toronto_final.sort_values('Neighborhood', axis = 0, ascending = True, inplace = True)
toronto_final.insert(0, 'Cluster Labels', kmeans.labels_)


In [258]:
toronto_final.head()

Unnamed: 0,Cluster Labels,Neighborhood,PostalCode,Borough,Neighborhood Latitude,Neighborhood Longitude
44,0,Adelaide,M5H,Downtown Toronto,43.650571,-79.384568
151,3,Albion Gardens,M9V,Etobicoke,43.739416,-79.588437
138,1,Bathurst Quay,M5V,Downtown Toronto,43.628947,-79.39442
149,3,Beaumond Heights,M9V,Etobicoke,43.739416,-79.588437
25,0,Bloordale Gardens,M9C,Etobicoke,43.643515,-79.577201


Extending my final dataframe by Venue Name and Categories to further examine the Clusters

In [285]:
toronto_gastro_final = toronto_gastro.merge(toronto_final[['Cluster Labels', 'PostalCode', 'Borough','Neighborhood']], how = 'left', left_on = 'Neighborhood', right_on = 'Neighborhood', copy = True)

In [368]:
toronto_gastro_final.head(5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Name,Venue Category,Gastro,Cluster Labels,PostalCode,Borough
0,Parkwoods,43.753259,-79.329656,Variety Store,Food & Drink Shop,1.0,0,M3A,North York
1,Victoria Village,43.725882,-79.315572,Portugril,Portuguese Restaurant,1.0,0,M4A,North York
2,Victoria Village,43.725882,-79.315572,Tim Hortons,Coffee Shop,1.0,0,M4A,North York
3,Victoria Village,43.725882,-79.315572,Pizza Nova,Pizza Place,1.0,0,M4A,North York
4,Regent Park,43.65426,-79.360636,Roselle Desserts,Bakery,1.0,0,M5A,Downtown Toronto


### Examining all 5 Clusters:

I am doing this by counting the appearance of venue categories for each cluster (Limiting the output to 20 categories)

Cluster 1:

In [316]:
toronto_gastro_final.loc[toronto_gastro_final['Cluster Labels'] == 0, 'Venue Category'].value_counts().to_frame().iloc[:20,:].style.bar()



Unnamed: 0,Venue Category
Coffee Shop,340
Café,181
Restaurant,116
Italian Restaurant,72
Bakery,69
Pizza Place,69
Japanese Restaurant,67
Sushi Restaurant,53
Sandwich Place,53
Bar,51


In [317]:
#Naming Cluster 1: 
Cluster_1_name = 'High density: Various Gastronomy with a focus on Cafés'

Cluster 2:

In [311]:
toronto_gastro_final.loc[toronto_gastro_final['Cluster Labels'] == 1, 'Venue Category'].value_counts().to_frame().iloc[:20,:].style.bar()

Unnamed: 0,Venue Category
Bar,20
Airport Food Court,7


In [318]:
#Naming Cluster 2: 
Cluster_2_name = 'Low density: Bars & Airport'

Cluster 3:

In [312]:
toronto_gastro_final.loc[toronto_gastro_final['Cluster Labels'] == 2, 'Venue Category'].value_counts().to_frame().iloc[:20,:].style.bar()

Unnamed: 0,Venue Category
Bakery,9
Deli / Bodega,3


In [319]:
#Naming Cluster 3: 
Cluster_3_name = 'Low density: Bakery & Deli'

Cluster 4: 

In [314]:
toronto_gastro_final.loc[toronto_gastro_final['Cluster Labels'] == 3, 'Venue Category'].value_counts().to_frame().iloc[:20,:].style.bar()

Unnamed: 0,Venue Category
Pizza Place,34
Fast Food Restaurant,33
Sandwich Place,32
Fried Chicken Joint,11
Beer Store,9
Burrito Place,6
Burger Joint,6
Coffee Shop,6
Chinese Restaurant,6
Italian Restaurant,5


In [320]:
#Naming Cluster 4: 
Cluster_4_name = 'Medium density: Various Gastronomy with a focus on Fast Food and Restaurants'

Cluster 5:

In [315]:
toronto_gastro_final.loc[toronto_gastro_final['Cluster Labels'] == 4, 'Venue Category'].value_counts().to_frame().iloc[:20,:].style.bar()

Unnamed: 0,Venue Category
American Restaurant,3


In [321]:
#Naming Cluster 5: 
Cluster_5_name = 'Very low density: American Restaurants'

I append the generated Cluster Names to my final Dataframe

In [325]:
clusternames = [Cluster_1_name,Cluster_2_name,Cluster_3_name,Cluster_4_name,Cluster_5_name]
toronto_final['Cluster Names'] = pd.NA
for i in range(len(clusternames)):
    toronto_final['Cluster Names'][toronto_final['Cluster Labels'] == i] = clusternames[i]

In [326]:
toronto_final.head()

Unnamed: 0,Cluster Labels,Neighborhood,PostalCode,Borough,Neighborhood Latitude,Neighborhood Longitude,Cluster Names
44,0,Adelaide,M5H,Downtown Toronto,43.650571,-79.384568,High density: Various Gastronomy with a focus ...
151,3,Albion Gardens,M9V,Etobicoke,43.739416,-79.588437,Medium density: Various Gastronomy with a focu...
138,1,Bathurst Quay,M5V,Downtown Toronto,43.628947,-79.39442,Low density: Bars & Airport
149,3,Beaumond Heights,M9V,Etobicoke,43.739416,-79.588437,Medium density: Various Gastronomy with a focu...
25,0,Bloordale Gardens,M9C,Etobicoke,43.643515,-79.577201,High density: Various Gastronomy with a focus ...


Now I create a map of Toronto showing the clusters in different colours. In the popup, the Neighborhood Name and the respective Cluster Name are shown

In [None]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [343]:
# Map with toronto coordinates
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(5)
ys = [i + x + (i*x)**2 for i in range(5)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, hood, cluster, cluster_names in zip(toronto_final['Neighborhood Latitude'], toronto_final['Neighborhood Longitude'], toronto_final['Neighborhood'], toronto_final['Cluster Labels'], toronto_final['Cluster Names']):
    html = '''<b>''' + str(hood) + '''</b> <br> <br>''' + str(cluster_names)
    iframe = folium.IFrame(html, width=200, height=200)
    label = folium.Popup(iframe, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

The End.