# Import libraries

In [1]:
import pandas as pd
import numpy as np
import bs4
import requests

# Get the html from the website

In [2]:
# Download the Wikipedia "List of postal codes of Canada: M" page.
# A timeout is passed explicitly: without one, requests waits indefinitely on an
# unresponsive server, which would hang the notebook on Restart & Run All.
wiki = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Raise on an HTTP error status instead of silently parsing an error page.
wiki.raise_for_status()
# Parse the response body with Python's built-in HTML parser.
wiki_txt = bs4.BeautifulSoup(wiki.text, 'html.parser')

# Extract the cell text from each table row (`tr` tag)

In [4]:
# Build one list of stripped cell texts per <tr> found in the first <tbody> of
# the parsed page. Rows that contain no <td> cells contribute an empty list.
data = [
    [cell.get_text().strip() for cell in row.find_all('td')]
    for row in wiki_txt.tbody.find_all('tr')
]

# preprocessing & check

## make dataframe

In [5]:
# Turn the scraped rows into a three-column table.
# Note: from here on `wiki` refers to this DataFrame, no longer to the HTTP response.
column_names = ['Postal Code', 'Borough', 'Neighborhood']
wiki = pd.DataFrame(data=data, columns=column_names)

## Drop the data we don't need

In [6]:
# Discard every row that has a missing value in any column
# (presumably the rows scraped without <td> cells, e.g. a header row - TODO confirm).
wiki = wiki.dropna(axis=0, how='any')

In [7]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = wiki['Borough'].ne('Not assigned')
wiki = wiki.loc[has_borough]

## grouping

In [8]:
def neighborhood_list(grouped):
    """Join a group's neighborhood names into one sorted, comma-separated string.

    Parameters
    ----------
    grouped : object supporting ``grouped['Neighborhood'].tolist()``
        Typically the sub-DataFrame that ``groupby(...).apply`` passes in.

    Returns
    -------
    str
        The values of the 'Neighborhood' column in ascending order,
        separated by ', '.
    """
    names = grouped['Neighborhood'].tolist()
    # tolist() returns a fresh list, so sorting it in place leaves the input untouched.
    names.sort()
    return ', '.join(names)
# Collapse the table to one row per (postal code, borough) pair; the
# neighborhoods of each pair are merged into a single string by neighborhood_list.
group_keys = ['Postal Code', 'Borough']
wiki_temp = wiki.groupby(group_keys)
neighborhoods_by_area = wiki_temp.apply(neighborhood_list)
# Move the group keys back into ordinary columns and name the merged column.
wiki_group = neighborhoods_by_area.reset_index(name='Neighborhood')

## A "Not assigned" neighborhood takes the name of its borough

In [9]:
# Where the merged neighborhood string is exactly 'Not assigned', fall back to
# the borough name of the same row.
unnamed = wiki_group['Neighborhood'] == 'Not assigned'
wiki_group.loc[unnamed, 'Neighborhood'] = wiki_group.loc[unnamed, 'Borough']

## check

In [10]:
# Preview the first rows of `wiki`, the filtered but still ungrouped table
# (one row per neighborhood); the grouped result lives in `wiki_group`.
wiki.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [11]:
# Summarize the ungrouped table: index range, column names, non-null counts and dtypes.
wiki.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211 entries, 3 to 287
Data columns (total 3 columns):
Postal Code     211 non-null object
Borough         211 non-null object
Neighborhood    211 non-null object
dtypes: object(3)
memory usage: 6.6+ KB


# shape

In [12]:
# (rows, columns) of the grouped table: one row per (Postal Code, Borough) pair.
wiki_group.shape

(103, 3)

# to csv for next step

In [13]:
# Save the grouped table to the working directory for the next step.
# NOTE: no `index` argument is passed, so pandas' default applies and the row
# index is written as an unnamed first column; readers of this file must expect it.
wiki_group.to_csv('wiki_group.csv')