In [5]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [6]:
# Download the Wikipedia page listing Toronto ("M") postal codes.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML with the lenient html5lib parser; `soup` is queried for <td> cells below.
soup = BeautifulSoup(source, 'html5lib')

In [9]:
def _format_neighborhoods(lines):
    """Collapse the lines that follow the borough into one ', '-joined string.

    Lines before the first one starting with '(' are kept unchanged. From that
    line on, the lines are joined with spaces, the surrounding parentheses are
    removed, and the text is split on '/' and ',' into individual names, each
    stripped of surrounding whitespace.

    If no line starts with '(', the lines are simply joined with ', '.

    Parameters
    ----------
    lines : list of str
        Non-empty list of the lines that follow the borough line.

    Returns
    -------
    str
        Neighborhood names separated by ', '.
    """
    # Index of the first line opening a parenthesised group, or None if absent.
    marked_ind = next(
        (ind for ind, line in enumerate(lines) if line.startswith('(')),
        None,
    )
    if marked_ind is None:
        return ", ".join(lines)

    # Drop the opening '(' and, only if it is really there, the closing ')'.
    grouped = ' '.join(lines[marked_ind:]).strip()[1:]
    if grouped.endswith(')'):
        grouped = grouped[:-1]

    cleaned = [
        name.strip()
        for chunk in grouped.split('/')
        for name in chunk.split(',')
    ]
    return ", ".join(lines[:marked_ind] + cleaned)


# Create a dictionary mapping postal code -> {'borough': ..., 'neighborhoods': ...}
postal_codes_dict = {}
for table_cell in soup.find_all('td'):
    try:
        postal_code = table_cell.p.b.text  # get the postal code
        neighborhoods_data = table_cell.span.text  # get the rest of the data in the cell
    except AttributeError:
        # The cell lacks the <p><b>...</b></p> / <span> structure, so it is
        # not a postal-code cell: skip it.
        continue

    # If the cell is not assigned then ignore it.
    if neighborhoods_data == 'Not assigned':
        continue

    # The borough is the first line; any remaining lines describe neighborhoods.
    borough, *neighborhood_lines = neighborhoods_data.split('\n')
    # Blank lines carry no information and would otherwise corrupt the joined text.
    neighborhood_lines = [line for line in neighborhood_lines if line.strip()]

    if neighborhood_lines:
        neighborhoods = _format_neighborhoods(neighborhood_lines)
    else:
        # The cell has only a borough: use it as the neighborhood too.
        neighborhoods = borough

    # Store the entry only once it is complete, so no half-filled records remain.
    postal_codes_dict[postal_code] = {
        'borough': borough,
        'neighborhoods': neighborhoods,
    }

# Read the first HTML table on the page; its first row supplies the column names.
canada_html = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", header=0)[0]

# Build the cleaned table as a non-mutating chain so `canada_html` stays untouched:
#   1. normalise the column names,
#   2. where the neighborhood is 'Not assigned', fall back to the borough,
#   3. drop rows that still contain 'Not assigned' in Borough or Neighborhood.
df = (
    canada_html
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
    .assign(
        Neighborhood=lambda table: table['Neighborhood'].mask(
            table['Neighborhood'] == 'Not assigned', table['Borough']
        )
    )
    .loc[lambda table: (table[['Borough', 'Neighborhood']] != 'Not assigned').all(axis=1)]
)

# One row per (PostalCode, Borough): join that group's neighborhoods with ', '.
df1 = df.groupby(by=['PostalCode', 'Borough'])['Neighborhood'].agg(', '.join)

# Turn the grouped Series back into a flat DataFrame and preview it.
df = df1.to_frame().reset_index()
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood]], Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
# Display the (rows, columns) dimensions of `df` as a quick sanity check.
df.shape

(103, 3)

In [11]:
# Save `df` to 'Canada.xls' in the current working directory.
# No `index` argument is passed, so pandas' default applies (presumably the
# row index is written as an extra leading column — confirm this is wanted).
# NOTE(review): recent pandas releases no longer include a writer for the
# legacy .xls format — confirm this runs on the target pandas version, or
# consider switching to .xlsx.
df.to_excel('Canada.xls')