In [1]:
# Standard library
import json

# Third-party: data handling
import numpy as np
import pandas as pd
# `pandas.io.json.json_normalize` was deprecated in pandas 1.0 and is no longer
# importable from that path in pandas 2.x; the top-level name is the supported one.
from pandas import json_normalize

# Third-party: geocoding / HTTP
from geopy.geocoders import Nominatim
import requests

# Third-party: plotting, clustering, mapping
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print('Libraries imported.')

Libraries imported.


Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe.

In [2]:
# The URL carries an `oldid` revision parameter, so the scrape targets a
# specific historical revision of the page rather than whatever is live today.
postal_codes_url = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&direction=prev&oldid=926287641'

# read_html returns a list of DataFrames, one per <table> found on the page;
# header=0 uses the first table row as the column names.
d = pd.read_html(postal_codes_url, header=0)

In [3]:
# The first scraped table is the postal-code table. Give it the three column
# names used throughout the notebook: 'Postal Code', 'Borough', 'Neighborhood'.
# The rename is done in place on d[0], so later cells that read d[0] see it.
postal_table = d[0]
postal_table.columns = ['Postal Code', 'Borough', 'Neighborhood']
postal_table

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


In [4]:
# Bind the first scraped table to `df`. This is a reference to d[0], not a copy.
df = d[0]
# Print the column dtypes and non-null counts as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   288 non-null    object
 1   Borough       288 non-null    object
 2   Neighborhood  288 non-null    object
dtypes: object(3)
memory usage: 6.9+ KB


In [5]:
# Keep only the rows whose borough has actually been assigned; rows with the
# literal placeholder 'Not assigned' in the Borough column are discarded.
# The surviving rows keep their original index labels.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


In [6]:
def _join_unique(values):
    """Join the distinct strings in `values` with commas, keeping first-seen order.

    `dict.fromkeys` de-duplicates while preserving insertion order, so the
    result is the same on every run. Joining a `set` instead would yield an
    arbitrary order that can change between kernel restarts, because string
    hashing is randomised per process.
    """
    return ','.join(dict.fromkeys(values))


# Collapse rows that share a postal code (e.g. the two M5A rows) into a single
# row: the repeated borough collapses to one value and the neighborhoods are
# listed comma-separated. 'Postal Code' becomes the index of the result.
df1 = df.groupby("Postal Code").agg(_join_unique)
df1

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
M1E,Scarborough,"Morningside,West Hill,Guildwood"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"St. Phillips,Martin Grove Gardens,Kingsview Vi..."
M9V,Etobicoke,"Thistletown,Mount Olive,South Steeles,Albion G..."


In [7]:
# Write the grouped table (index = 'Postal Code') to a CSV in the working directory.
# NOTE(review): this runs before the "Not assigned" neighborhood fallback applied
# later in the notebook, so the file holds the table as it is at this point.
df1.to_csv('file_name.csv')

In [8]:
# Where a postal code has a borough but its neighborhood is the literal
# placeholder 'Not assigned', fall back to the borough name.
# The mask is computed once and reused for both sides of the assignment.
neighborhood_missing = df1['Neighborhood'] == "Not assigned"
df1.loc[neighborhood_missing, 'Neighborhood'] = df1.loc[neighborhood_missing, 'Borough']
df1

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
M1E,Scarborough,"Morningside,West Hill,Guildwood"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"St. Phillips,Martin Grove Gardens,Kingsview Vi..."
M9V,Etobicoke,"Thistletown,Mount Olive,South Steeles,Albion G..."


In [9]:
# `.shape` is an attribute (not a method) holding (n_rows, n_columns).
# 'Postal Code' is the index of df1 here, so it is not counted among the columns.
df1.shape

(103, 2)

In [10]:
# Coordinates lookup table, read from a CSV expected in the current working
# directory (relative path).
coordinates_csv = 'Geospatial_Coordinates.csv'
df_postal = pd.read_csv(coordinates_csv)
df_postal

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


# Merge Borough information with Latitude & Longitude information

In [11]:
# Attach coordinates to each postal code. "Postal Code" is the index level of
# df1 and a regular column of df_postal; the inner join keeps only the codes
# present on both sides.
df2 = df1.merge(df_postal, on="Postal Code", how="inner")
df2

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,West Hill,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"St. Phillips,Martin Grove Gardens,Kingsview Vi...",43.688905,-79.554724
101,M9V,Etobicoke,"Thistletown,Mount Olive,South Steeles,Albion G...",43.739416,-79.588437


# Filtering the data frame to the rows whose Borough column contains "Toronto".


In [12]:
# Boolean mask: True where the borough name contains the substring 'Toronto'.
is_toronto_borough = df2['Borough'].str.contains('Toronto')
df3 = df2.loc[is_toronto_borough]
df3

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Summerhill East,Moore Park",43.689574,-79.38316
49,M4V,Central Toronto,"South Hill,Deer Park,Rathnelly,Summerhill West...",43.686412,-79.400049


# Visualizing Toronto's neighborhoods using Folium

In [13]:
# Base map centred on the Toronto coordinates given below.
map_toronto = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# One red circle marker per row of df3; the popup reads "<neighborhood(s)>, <borough>".
marker_rows = zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    # parse_html is an option of the Popup, so it is set here and not on the marker.
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#ff0000',
        fill_opacity=0.5,
    ).add_to(map_toronto)

map_toronto


# KMeans clustering to see the neighboring areas

In [14]:
# Number of clusters to form.
k = 5

# Cluster on the coordinates only. Selecting the two columns by name (instead
# of dropping the others with a positional `axis` argument, which pandas 2.0
# no longer accepts) also keeps a previously added 'Cluster Labels' column out
# of the features when this cell is re-run.
toronto_cluster = df3[['Latitude', 'Longitude']]

# random_state fixes the centroid initialisation. n_init is given explicitly
# because its default changed in scikit-learn 1.4; 10 is the older default.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_cluster)

# Put the labels in the first column. Dropping any existing 'Cluster Labels'
# column first makes the cell safe to re-run (insert raises on a duplicate
# column) and yields a new frame rather than modifying the slice of df2.
df3 = df3.drop(columns='Cluster Labels', errors='ignore')
df3.insert(0, 'Cluster Labels', kmeans.labels_)

df3

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,4,M4K,East Toronto,"Riverdale,The Danforth West",43.679557,-79.352188
42,4,M4L,East Toronto,"India Bazaar,The Beaches West",43.668999,-79.315572
43,4,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,2,M4T,Central Toronto,"Summerhill East,Moore Park",43.689574,-79.38316
49,2,M4V,Central Toronto,"South Hill,Deer Park,Rathnelly,Summerhill West...",43.686412,-79.400049


In [15]:
# Base map centred on the Toronto coordinates given below.
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# One hex colour per cluster, sampled evenly across the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# One marker per row of df3, coloured by its cluster label.
for lat, lon, neighborhood, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Neighborhood'], df3['Cluster Labels']):
    # Show which neighborhood(s) the marker stands for, alongside its cluster.
    label = folium.Popup('{} Cluster {}'.format(neighborhood, cluster), parse_html=True)
    # Labels run 0..k-1, so they index the palette directly. The former
    # `cluster-1` only worked for cluster 0 through negative-index wrap-around.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.5,
    ).add_to(map_clusters)

map_clusters