# Part 1: Scraping and building a Dataframe 

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Grabbing relevant information from URL

In [2]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, "lxml")

# Locate the table whose class attribute is "wikitable sortable" and fail with a
# clear message if the page layout no longer contains it.
table = soup.find("table", class_="wikitable sortable")
if table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the page")

# Flatten the table to plain text; the parsing cell below splits it on newlines.
d = table.text

Converting table information from scraped text

In [3]:
# Turn the flattened table text into per-postcode records.
post_neigh = {}  # postcode -> neighbourhood names joined with ", "
post_bor = {}    # postcode -> borough

unfiltered = d.split("\n")

# Records are read at a fixed stride: the first postcode sits at offset 7 and each
# following record starts 5 entries later, with postcode / borough / neighbourhood
# in the first three slots. (These offsets presumably reflect the header cells and
# blank separators of the scraped page — they depend on the page markup.)
for index in range(7, len(unfiltered) - 2, 5):
    code, bor, n = unfiltered[index:index + 3]

    # Postcodes without a borough are skipped entirely.
    if bor == "Not assigned":
        continue

    # A missing neighbourhood falls back to the borough name.
    if n == "Not assigned":
        n = bor

    if code in post_neigh:
        # Repeated postcode: extend its neighbourhood list.
        post_neigh[code] = post_neigh[code] + f", {n}"
    else:
        post_neigh[code] = n
        post_bor[code] = bor

# Parallel column lists, all in the insertion order of post_bor.
postcode = list(post_bor)
borough = list(post_bor.values())
neigh = [post_neigh[key] for key in post_bor]

Creation of Dataframe

In [4]:
# Combine the three parallel lists into one table, one row per postal code.
column_data = {"Postal Code": postcode, "Borough": borough, "Neighborhood": neigh}
df = pd.DataFrame(column_data)

In [5]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [6]:
# (rows, columns) of the scraped table.
df.shape

(103, 3)

# Part 2: Creating Dataframe for Coordinates and joining it with the previous Dataframe

I will use the provided Geospatial CSV file, since it is easier and is guaranteed to contain the coordinates for each postal code.

In [7]:
# Load the postal-code coordinate lookup. The path is relative, so the CSV must be
# present in the notebook's working directory.
coord_df = pd.read_csv("Geospatial_Coordinates.csv")

Reading the Geospatial CSV file gives us the following DataFrame.

In [8]:
# Preview the first rows of the coordinate lookup.
coord_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Left-join the previous DataFrame with the coordinates DataFrame on matching Postal Code values.

In [9]:
# Left join: keep every scraped postal code and attach its coordinates where the
# lookup has a matching "Postal Code".
new_df = df.merge(coord_df, on="Postal Code", how="left")

In [10]:
# Display the merged frame.
# NOTE(review): this renders the whole frame (truncated by pandas); new_df.head()
# would be enough for a sanity check.
new_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


# Part 3: Exploration and clustering of the neighborhoods in Toronto

Creating a new DataFrame that keeps only the rows whose Borough name contains "Toronto".

In [11]:
import folium
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [12]:
# Keep only the rows whose Borough name contains the literal text "Toronto".
# str.contains(..., na=False) treats a missing Borough as "no match"; the earlier
# `str.find("Toronto") != -1` test let such rows through, because NaN != -1 is True.
# .copy() makes the result an independent frame, so the cluster-label column added
# later is not written onto a slice of new_df.
toronto = new_df[
    new_df["Borough"].str.contains("Toronto", regex=False, na=False)
].copy()

In [13]:
# Preview the filtered rows; the index values are carried over from new_df.
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


Since different parts of Toronto have different coordinates, we will take the mean of all the coordinates to approximate the coordinates of Toronto.

In [14]:
# Centre point for the maps: the mean latitude and mean longitude of the Toronto rows.
latitude, longitude = (toronto[axis].mean() for axis in ("Latitude", "Longitude"))

Visualising the map of Toronto

In [15]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one marker per row of the Toronto frame
# The loop variable is named `borough_name` on purpose: `borough` is the notebook-level
# list used to build `df`, and rebinding it to a string here would make a later re-run
# of the DataFrame cell silently fill the whole Borough column with one value.
for lat, lng, borough_name, neighborhood in zip(
    toronto['Latitude'], toronto['Longitude'], toronto['Borough'], toronto['Neighborhood']
):
    label = '{}, {}'.format(neighborhood, borough_name)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is a Popup option; CircleMarker appears to ignore it —
    # confirm against the installed folium version before removing.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

Run k-means to group the neighborhoods into 5 clusters.

In [16]:
# set number of clusters
kclusters = 5

# run k-means clustering on the coordinates only
# n_init is pinned explicitly: newer scikit-learn releases changed the default from
# 10 to 'auto', so leaving it implicit can yield different labels (and a
# FutureWarning) depending on the installed version. random_state fixes the seed.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(
    toronto[["Latitude", "Longitude"]]
)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 4, 3, 3, 2, 3, 3, 0, 3, 0], dtype=int32)

In [17]:
# Attach each row's cluster label as the first column.
# DataFrame.insert raises ValueError when the column already exists, so any previous
# copy is dropped first; this keeps the cell safe to re-run after re-clustering.
if 'Cluster Label' in toronto.columns:
    toronto = toronto.drop(columns='Cluster Label')
toronto.insert(0, 'Cluster Label', kmeans.labels_)

In [18]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
# (the previous x / ys arrays were only ever used for their length, which is kclusters)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(
    toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood'], toronto['Cluster Label']
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # The `cluster - 1` lookup is kept so the colours match earlier renders: label 0
    # wraps around to the last colour via negative indexing, so every cluster still
    # gets its own distinct colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [19]:
# Display the Toronto rows with their assigned cluster labels.
# NOTE(review): this renders the whole frame; toronto.head() would be enough here.
toronto

Unnamed: 0,Cluster Label,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,3,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,4,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
9,3,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,2,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,3,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,0,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,3,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568
31,0,M6H,West Toronto,"Dovercourt Village, Dufferin",43.669005,-79.442259
