In this notebook we scrape postal-code data from a Wikipedia page, clean it, and attach coordinates so that the neighborhoods can be segmented on folium maps.

In [4]:
# Libraries

import requests
from bs4 import BeautifulSoup

In [8]:
import urllib.request

# Wikipedia page to scrape.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page; the context manager closes the HTTP response once it is read.
with urllib.request.urlopen(url) as req:
    article = req.read().decode()

# Turn the article into soup and collect every sortable <table>.
soup = BeautifulSoup(article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

# Search through the tables for the one with the headings we want.
expected_headings = ['Postcode', 'Borough', 'Neighbourhood']
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == expected_headings:
        break
else:
    # Fail loudly: without this branch `table` would be the last table on the
    # page (or undefined when there are no sortable tables), and the following
    # cells would parse the wrong data.
    raise ValueError(
        'No sortable table with headings {} found at {}'.format(expected_headings, url)
    )

In [9]:
import pandas as pd

# Column names of the dataframe built from the scraped rows.
column_names = ['Postcode', 'Borough', 'Neighborhood']

# Collect one [postcode, borough, neighbourhood] record per table row.
rowsList = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    # Skip rows without <td> cells (e.g. a heading row) and any row that is
    # too short to provide all three values.
    if len(tds) < 3:
        continue
    # Take exactly three cells so a row carrying extra cells still unpacks
    # cleanly into the three names.
    Postcode, Borough, Neighbourhood = [td.text.strip() for td in tds[:3]]
    rowsList.append([Postcode, Borough, Neighbourhood])

In [10]:
# Build the dataframe from the scraped records, one row per record.
neighborhoods = pd.DataFrame(data=rowsList, columns=column_names)

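Here we clean and wrangle the data as per the instructions given: rows whose borough is "Not assigned" are dropped, and a neighborhood that is "Not assigned" takes the name of its borough.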

In [11]:
# Drop rows whose borough is "Not assigned". The explicit copy makes the result
# an independent frame, so the column assignments in later cells do not raise
# SettingWithCopyWarning.
neighborhoods = neighborhoods[neighborhoods.Borough != "Not assigned"].copy()

In [12]:
# Where the neighborhood is "Not assigned", use the borough of the same row instead.
unnamed = neighborhoods['Neighborhood'] == 'Not assigned'
neighborhoods.loc[unnamed, 'Neighborhood'] = neighborhoods['Borough']

In [13]:
# Preview the first 15 rows after cleaning.
neighborhoods.head(15)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [14]:
# Group rows by postcode and borough, then write the comma-joined list of
# neighborhood names of each group back onto every row of that group.
by_postcode = neighborhoods[['Postcode', 'Neighborhood', 'Borough']].groupby(['Postcode', 'Borough'])
neighborhoods['Neighborhood'] = by_postcode['Neighborhood'].transform(','.join)

# Rows of one group are now identical, so dropping duplicates leaves one row per group.
neighborhoods = neighborhoods[['Postcode', 'Neighborhood', 'Borough']]
neighborhoods = neighborhoods.drop_duplicates()

Rows that share a postcode are combined into a single row, with their neighborhoods joined by commas, as per the instructions given.

In [15]:
# Preview the first rows after combining neighborhoods that share a postcode.
neighborhoods.head()

Unnamed: 0,Postcode,Neighborhood,Borough
2,M3A,Parkwoods,North York
3,M4A,Victoria Village,North York
4,M5A,"Harbourfront,Regent Park",Downtown Toronto
6,M6A,"Lawrence Heights,Lawrence Manor",North York
8,M7A,Queen's Park,Queen's Park


In [16]:
# (rows, columns) of the cleaned dataframe.
neighborhoods.shape

(103, 3)

In [20]:
import os
import pandas as pd

# Location of the coordinates CSV. Set the GEOSPATIAL_CSV environment variable
# to run this notebook on another machine; when it is unset the original local
# path is used.
geo_csv_path = os.environ.get(
    'GEOSPATIAL_CSV',
    'C:\\Users\\saich\\Downloads\\Geospatial_Coordinates.csv',
)

# Rename "Postal Code" to "Postcode" so the key column has the same name as in
# `neighborhoods`; chaining the rename avoids mutating the frame in place.
LatLongFile = pd.read_csv(geo_csv_path).rename(columns={"Postal Code": "Postcode"})
LatLongFile.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


The file above provides the latitude and longitude of each postcode in our dataset.

In [27]:
# Attach coordinates to every neighborhood row. A left join keeps exactly the
# cleaned postcodes, in their existing order; an outer join would additionally
# create rows with missing Borough/Neighborhood for any postcode that appears
# only in the coordinates file.
GeoFull = pd.merge(neighborhoods, LatLongFile, on='Postcode', how='left')
GeoFull.head()

Unnamed: 0,Postcode,Neighborhood,Borough,Latitude,Longitude
0,M3A,Parkwoods,North York,43.753259,-79.329656
1,M4A,Victoria Village,North York,43.725882,-79.315572
2,M5A,"Harbourfront,Regent Park",Downtown Toronto,43.65426,-79.360636
3,M6A,"Lawrence Heights,Lawrence Manor",North York,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In the table above we have combined both datasets, so each postcode row now has the required latitude and longitude columns.