get HTML

In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

# Page to scrape; per the URL it lists Canadian postal codes starting with "M".
url='https://en.wanweibaike.com/wiki-List%20of%20postal%20codes%20of%20Canada:%20M'
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response=requests.get(url, timeout=30)
response.raise_for_status()
res=response.text
# Parse with the built-in parser so no extra parser package is required.
soup=BeautifulSoup(res, 'html.parser')

create dataframe from HTML

In [2]:
# Column buffers filled from the first <table> of the downloaded page.
postalCodeList=[]
boroughList=[]
neighborhoodList=[]

table=soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise ValueError('No <table> element found in the downloaded page')

for row in table.find_all('tr'):
    cells=row.find_all('td')
    # Rows without <td> cells (e.g. header rows) and rows with fewer than the
    # three expected cells are skipped so that indexing cannot raise IndexError.
    if len(cells)<3:
        continue
    # strip() removes the trailing newline as before and also any other
    # surrounding whitespace, so later equality checks such as
    # == 'Not assigned' are not defeated by stray spaces.
    postalCodeList.append(cells[0].text.strip())
    boroughList.append(cells[1].text.strip())
    neighborhoodList.append(cells[2].text.strip())

# (column name, values) pairs, in the column order of the resulting frame.
Toronto_neighborhood=[('PostalCode', postalCodeList), ('Borough', boroughList), ('Neighborhood', neighborhoodList)]
Toronto_df=pd.DataFrame(dict(Toronto_neighborhood))
Toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Drop the rows where Borough is "Not assigned"

In [3]:
# Keep only the rows that have a real borough, then renumber them from 0.
has_borough=Toronto_df['Borough'].ne('Not assigned')
Toronto_df_drop=Toronto_df.loc[has_borough].reset_index(drop=True)
Toronto_df_drop.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


Group by postal code and borough, joining the neighborhoods of each postal code into one comma-separated string

In [4]:
# One output row per (PostalCode, Borough) pair; the remaining column
# (Neighborhood) is collapsed into a single comma-separated string.
group_keys=['PostalCode', 'Borough']
Toronto_df_group=(
    Toronto_df_drop
    .groupby(group_keys, as_index=False)
    .agg(','.join)
)
Toronto_df_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Where the Neighborhood is "Not assigned", use the Borough name as the Neighborhood

In [5]:
# Boolean mask of the rows whose neighborhood is still the placeholder text.
NA_rows=Toronto_df_group['Neighborhood'].eq('Not assigned')
# Where the mask is set take the borough name; everywhere else keep the value.
Toronto_df_group['Neighborhood']=Toronto_df_group['Neighborhood'].mask(NA_rows, Toronto_df_group['Borough'])
# Show the rows that were just filled in.
Toronto_df_group.loc[NA_rows]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [6]:
# Continue under the name Toronto_df. Take a copy rather than a second name
# for the same object, so a later in-place edit of one frame cannot silently
# change the other.
Toronto_df=Toronto_df_group.copy()
Toronto_df.shape

(103, 3)

Load the geospatial coordinates CSV (latitude/longitude per postal code) from a local file

In [7]:
# The CSV location is machine-specific, so it can be overridden with the
# GEOSPATIAL_CSV environment variable; the default is the original location.
# (The original literal combined an r-prefix with doubled backslashes, which
# yields a path containing doubled separators; single backslashes are used here.)
GEOSPATIAL_CSV=os.environ.get('GEOSPATIAL_CSV', r"C:\Users\JOY\Desktop\Geospatial_Coordinates.csv")
Co_df=pd.read_csv(GEOSPATIAL_CSV)
print(Co_df.shape)
Co_df.head()

(103, 3)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


merge two dataframes

In [8]:
# Index both frames by postal code so they can be aligned on it.
Toronto_df_before=Toronto_df.set_index('PostalCode')
Co_df_before=Co_df.set_index('Postal Code')
# The inner join keeps only postal codes present in both frames; the shared
# index is then named and moved back into a regular column.
Toronto_df_after=(
    pd.concat([Toronto_df_before, Co_df_before], axis=1, join='inner')
    .rename_axis('PostalCode')
    .reset_index()
)
Toronto_df_after.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


get location

In [10]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

address='Toronto, Ontario'
geolocator=Nominatim(user_agent='t1-toronto-neigh')
# geopy's default request timeout is short (1 second); allow more time so a
# slow response from the public Nominatim service does not abort the cell.
location=geolocator.geocode(address, timeout=10)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude=location.latitude
longitude=location.longitude
print('The coordinates of Toronto are {}, {}'.format(latitude, longitude))
# Base map centred on the geocoded coordinates; markers are added later.
map_Toronto=folium.Map(location=[latitude, longitude], zoom_start=11)

The coordinates of Toronto are 43.6534817, -79.3839347


Add a marker for each postal code, labelled with its borough and neighborhoods

In [11]:
# Appearance shared by every marker, passed through to CircleMarker unchanged.
marker_style=dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)
# Columns consumed per marker, in the order they are unpacked below.
marker_columns=['Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood']

for lat, long, post, borough, neigh in Toronto_df_after[marker_columns].itertuples(index=False, name=None):
    label="{} ({}): {}".format(borough, post, neigh)
    popup=folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, long], popup=popup, **marker_style).add_to(map_Toronto)

# Last expression of the cell: render the map.
map_Toronto

Select only the boroughs whose name contains "Toronto"

In [13]:
# Boroughs to keep, matched exactly against the Borough column.
Toronto_boroughs=['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_toronto=Toronto_df_after['Borough'].isin(Toronto_boroughs)
Toronto_central_df=Toronto_df_after.loc[in_toronto].reset_index(drop=True)
print(Toronto_central_df.shape)
Toronto_central_df.head()

(38, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [14]:
# Start from a fresh, slightly more zoomed-in map for the selected boroughs.
map_Toronto=folium.Map(location=[latitude, longitude], zoom_start=12)

# Walk the five columns in parallel, one tuple per postal code.
central_marker_columns=('Latitude', 'Longitude', 'PostalCode', 'Borough', 'Neighborhood')
central_marker_rows=zip(*(Toronto_central_df[name] for name in central_marker_columns))

for lat, long, post, borough, neigh in central_marker_rows:
    label="{} ({}): {}".format(borough, post, neigh)
    popup=folium.Popup(label, parse_html=True)
    marker=folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

# Last expression of the cell: render the map.
map_Toronto