In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim 
import folium

## Scrape the page

Wikipedia isn't available in my region. I put the scraping codes in the following cell and use the html I've retrieved from other sources.

In [2]:
# request the content
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 8.0; Pixel 2 Build/OPD3.170816.012) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Mobile Safari/537.36'}
rsp = requests.get(url, headers=headers)
print(rsp.status_code)
html = request.text

ConnectionError: HTTPSConnectionPool(host='en.wikipedia.org', port=443): Max retries exceeded with url: /wiki/List_of_postal_codes_of_Canada:_M (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x116390f98>: Failed to establish a new connection: [Errno 60] Operation timed out'))

## Extract data from html


In [3]:
# import html
with open('./wiki_html.txt', 'r') as f:
    html = f.read()

# define soup
soup = BeautifulSoup(html)

# extract the text of all p nodes
rows = soup.tbody.find_all('p')
rows = [i.text for i in rows]

# split the text of all rows
codes = [row[0:3] for row in rows]
others = [row[3:] for row in rows]

# create dataframe
df = pd.DataFrame(np.transpose([codes, others]), columns=['PostalCode', 'Borough'])
df.Borough = df.Borough.str.strip('\n')
df = df[df.Borough!='Not assigned']

# split the borough and neighborhood 
df['Neighborhood'] = df.Borough.str.extract('(\(.*?\))')

# clean up the columns
df.Neighborhood = df.Neighborhood.str.replace('\(|\)| /', '')
df.Borough = df.Borough.str.replace('(\(.*?\))', '')

df.shape

(103, 3)

###  Get the coordinates using csv database

In [4]:
# read the csv database to df
geo_df = pd.read_csv('~/Downloads/Geospatial_Coordinates.csv', index_col='Postal Code')
print(geo_df.describe())
# match postal codes
df['Latitude'] = df.PostalCode.map(lambda x: geo_df.loc[x, 'Latitude'])
df['Longitude'] = df.PostalCode.map(lambda x: geo_df.loc[x, 'Longitude'])


         Latitude   Longitude
count  103.000000  103.000000
mean    43.704608  -79.397153
std      0.052463    0.097146
min     43.602414  -79.615819
25%     43.660567  -79.464763
50%     43.696948  -79.388790
75%     43.745320  -79.340923
max     43.836125  -79.160497


### Map the neighborhoods

In [5]:
# get the coordinates of Toronto
address = 'Toronto, ON'

geolocator = Nominatim(user_agent='myloc', timeout=10)
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [6]:
# create map of trt 
map_trt = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_trt)  
    
map_trt