In [7]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import re

# Toronto ZIP codes and Neighbourhoods

We will collate the data on the different neighbourhoods and postal codes of Toronto from the [Wikipedia page](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M). We will scrape this webpage using requests and BeautifulSoup.

In [8]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops the notebook here on an
# HTTP error instead of silently parsing an error page further down.
html = requests.get(url, timeout=30)
html.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the parse tree could differ between
# machines. 'html.parser' ships with Python, so no extra dependency is needed.
wp = bs(html.text, 'html.parser')


This Wikipedia entry may contain several tables. Let's check which one is the one we are interested in.

In [9]:
# Collect every <table> element found on the page and count them.
tables = wp.find_all(name='table')
len(tables)

5

In [29]:
# Print the HTML of the first table to check whether it is the postal-code listing.
print(tables[0])

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

The column headings:

In [11]:
# The first table on the page is the one holding the postal codes.
borough_table = tables[0]

# Column names come from the <th> cells; strip() removes the surrounding
# whitespace/newlines left in the cell text.
headings = []
for header_cell in borough_table.find_all('th'):
    headings.append(header_cell.text.strip())
headings

['Postcode', 'Borough', 'Neighbourhood']

The actual rows.

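Here we need to:  
1. Filter out unwanted entries: rows whose borough is `Not assigned`.
2. Combine rows that share a postal code into a single row, with the neighbourhoods joined as a comma-separated list.
3. If a neighbourhood is `Not assigned`, set it to the name of its borough.
4. Show the `.shape` of the resulting dataframe in the last cell.
5. Clean up and comment the notebook (markdown).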

First, inspect the table as seen by BeautifulSoup for unwanted entries at the boundaries.

In [12]:
# Look at the first three <tr> rows of the table.
borough_table.find_all('tr')[:3]

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>]

In [13]:
# Look at the last three <tr> rows of the table.
borough_table.find_all('tr')[-3:]

[<tr>
 <td>M8Z</td>
 <td><a href="/wiki/Etobicoke" title="Etobicoke">Etobicoke</a></td>
 <td>Royal York South West
 </td></tr>, <tr>
 <td>M8Z</td>
 <td><a href="/wiki/Etobicoke" title="Etobicoke">Etobicoke</a></td>
 <td>South of Bloor
 </td></tr>, <tr>
 <td>M9Z</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>]

As the headings are under a parent `tr` tag, we may as well ignore the first match of `tr` in order to list all
table rows.

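In order to clean the rows, we use two helper functions, `nta` and `mtchBor`. The former checks, case-insensitively, whether a string is the `'Not assigned'` placeholder; it is used to filter out the non-assigned boroughs. The latter replaces a `'Not assigned'` neighbourhood with the name of its borough.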

Finally, we will merge together any entries with the same postal code. In this case, different neighbourhoods will be combined together separated by a comma.

In [17]:
# nta : is the string 'Not assigned'?
def nta(s):
    """Return True when ``s`` is the 'Not assigned' placeholder.

    The comparison ignores letter case, leading/trailing whitespace and the
    amount of whitespace between the two words, so table-cell text such as
    ``'Not assigned\\n'`` is recognised.

    The whole string must match: a name that merely starts with "Not"
    (e.g. "Nottingham") is NOT treated as unassigned.

    Parameters
    ----------
    s : str
        Text of a table cell.

    Returns
    -------
    bool
    """
    # split() + join collapses every run of whitespace to a single space
    # and drops leading/trailing whitespace in one step.
    return ' '.join(s.split()).lower() == 'not assigned'

# Fall back to the borough name when the neighbourhood is 'Not assigned'.
def mtchBor(nei,bor):
    """Return ``bor`` if ``nei`` is flagged by ``nta``, otherwise ``nei``.

    Parameters
    ----------
    nei : str
        Neighbourhood text of a row.
    bor : str
        Borough text of the same row.
    """
    return bor if nta(nei) else nei

# Build one [postcode, borough, neighbourhood] list per usable table row.
postalcodes = []
for tr in borough_table.find_all('tr')[1:]:   # [1:] skips the header row
    # Parse the row's cells once, already stripped of surrounding whitespace.
    cells = [td.text.strip() for td in tr.find_all('td')]

    # Skip malformed rows (fewer than three cells) instead of crashing with
    # an IndexError, and drop rows whose borough is 'Not assigned'.
    if len(cells) < 3 or nta(cells[1]):
        continue

    # A 'Not assigned' neighbourhood takes the name of its borough.
    cells[2] = mtchBor(cells[2], cells[1])
    postalcodes.append(cells)

postalcodes[:10]

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M9A', "Queen's Park", "Queen's Park"],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M3B', 'North York', 'Don Mills North']]

Let's now combine multiple neighborhoods w/ the same zip code.

We'll use a dictionary, `rows`, as a hash table keyed by postal code, and then transform it back to a list before
creating the dataframe.

In [19]:
# Merge rows that share a postal code: the first row seen for a code is kept
# and the neighbourhoods of later rows are appended to it, comma-separated.
rows = {}
for pc in postalcodes:
    code = pc[0]
    if code in rows:
        rows[code][2] += ', ' + pc[2]
    else:
        # Store a copy, not the list itself: the += above would otherwise
        # modify the entries of `postalcodes` in place, and re-running this
        # cell would then append the same neighbourhoods a second time.
        rows[code] = list(pc)
rows

{'M3A': ['M3A', 'North York', 'Parkwoods'],
 'M4A': ['M4A', 'North York', 'Victoria Village'],
 'M5A': ['M5A', 'Downtown Toronto', 'Harbourfront'],
 'M6A': ['M6A', 'North York', 'Lawrence Heights, Lawrence Manor'],
 'M7A': ['M7A', 'Downtown Toronto', "Queen's Park"],
 'M9A': ['M9A', "Queen's Park", "Queen's Park"],
 'M1B': ['M1B', 'Scarborough', 'Rouge, Malvern'],
 'M3B': ['M3B', 'North York', 'Don Mills North'],
 'M4B': ['M4B', 'East York', 'Woodbine Gardens, Parkview Hill'],
 'M5B': ['M5B', 'Downtown Toronto', 'Ryerson, Garden District'],
 'M6B': ['M6B', 'North York', 'Glencairn'],
 'M9B': ['M9B',
  'Etobicoke',
  'Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park'],
 'M1C': ['M1C', 'Scarborough', 'Highland Creek, Rouge Hill, Port Union'],
 'M3C': ['M3C', 'North York', 'Flemingdon Park, Don Mills South'],
 'M4C': ['M4C', 'East York', 'Woodbine Heights'],
 'M5C': ['M5C', 'Downtown Toronto', 'St. James Town'],
 'M6C': ['M6C', 'York', 'Humewood-Cedarvale'],
 'M9C': 

In [20]:
# Turn the lookup table back into a plain list of rows; the postal-code keys
# are no longer needed because each row already carries its code.
postalcodes = list(rows.values())
postalcodes[:5]

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights, Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"]]

Finally, the dataframe containing all desired zip codes with their different neighbourhoods.

In [27]:
# Build the dataframe from the merged rows, using the scraped table headings
# as column names, and preview the first rows.
zipcs_df = pd.DataFrame(data=postalcodes, columns=headings)
zipcs_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [28]:
# (number of rows, number of columns) of the final dataframe.
zipcs_df.shape

(103, 3)