## Import libraries:

In [13]:
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## 1. Get the data into a usable form (dataframe):

In [2]:
#Search the target website for tables and read it into a df with pandas
URL01 = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

df_Toronto = pd.read_html(URL01)[0] #The [0] element restricts the the search result to the first table
df_Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
#Clean the df:
#Remove 'Not assigned' boroughs
df_Toronto = df_Toronto[df_Toronto.Borough != 'Not assigned']
df_Toronto = df_Toronto.reset_index(drop=True)

#Sort by postal code
df_Toronto.sort_values(by=['Postal Code'])
df_Toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [4]:
df_Toronto.shape

(103, 3)

## 2. Get geographical information for each postal code by importing data from .csv

In [5]:
#Import df
path01 = 'Raw_Data/Geospatial_Coordinates.csv'
df_Toronto_latlng = pd.read_csv(path01)
df_Toronto_latlng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
#Merge both df's based on the postal code
df_Toronto = pd.merge(df_Toronto, df_Toronto_latlng, on="Postal Code")
df_Toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## 3. Explore & cluster the dataset
-> Only boroughs that include the word 'Toronto' will be used

In [7]:
#Remove any boroughs that do not include the word 'Toronto'
df_Toronto = df_Toronto[df_Toronto.Borough.str.contains('Toronto')]
df_Toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


-> Display neighborhoods with folium

In [8]:
#Since geocode did not work, the coordinates were taken from https://www.latlong.net/place/toronto-on-canada-27230.html
location_Toronto_lat = 43.651070
location_Toronto_lng = -79.347015

#Create the map
coord_Toronto = folium.Map(location=[location_Toronto_lat, location_Toronto_lng],zoom_start=12)

#Go through the Toronto dataframe and superimpose it onto the map
for lat,lng,borough,neighbourhood in zip(df_Toronto['Latitude'],df_Toronto['Longitude'],df_Toronto['Borough'],df_Toronto['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(coord_Toronto)

#Visualize the map
coord_Toronto

-> Cluster Boroughs with k-means

In [9]:
#Number of clusters
k=5

#K-means clustering
cluster_Toronto = df_Toronto.drop(['Postal Code','Borough','Neighbourhood'],1)
kmeans = KMeans(n_clusters = k,random_state=0).fit(cluster_Toronto)
kmeans.labels_
df_Toronto.insert(0, 'Cluster Labels', kmeans.labels_)

-> Display clusters on map

In [17]:
#Create the map
coord_Toronto_cluster = folium.Map(location=[location_Toronto_lat, location_Toronto_lng],zoom_start=12)

#Define cluster color scheme
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Superimpose markers onto the map
markers_colors = []
for lat, lng, neighbourhood, cluster in zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighbourhood'], df_Toronto['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(coord_Toronto_cluster)

#Display map
coord_Toronto_cluster