## 1. Scraping Data

In [None]:
# Importing necessary libraries
from io import StringIO # to hand HTML text to pd.read_html as a file-like object

import pandas as pd # for dataframes
import requests # to make HTTP requests
from bs4 import BeautifulSoup # for scraping

In [80]:
# Download a pinned historical revision (oldid=...) of the Wikipedia page so
# the scraped table does not change when the live article is edited.
req = requests.get(
    "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=942851379",
    timeout=30,  # fail instead of hanging indefinitely if the server never answers
)
# Stop on HTTP errors rather than parsing an error page as if it were data
req.raise_for_status()

# Create a BeautifulSoup object and define the parser (lxml)
soup = BeautifulSoup(req.content, 'lxml')

# Take the first <table> element of the page
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# pd.read_html returns a list of DataFrames, one per parsed table. The HTML is
# wrapped in StringIO because passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(table)))

# Copy the parsed table so later edits to `neighborhood` leave `df[0]` untouched
neighborhood = df[0].copy()

In [81]:
# Preview the first 15 rows of the scraped table
neighborhood.head(15)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [82]:
# Rename the 'Postcode' column to 'PostalCode' (modifies the frame in place),
# then show the first 20 rows to confirm the new header.
column_renames = {'Postcode': 'PostalCode'}
neighborhood.rename(column_renames, axis='columns', inplace=True)
neighborhood.head(20)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [83]:
# (rows, columns) of the table as scraped, before any cleaning
neighborhood.shape

(287, 3)

## 2. Cleaning Dataframe

In [84]:
# Remove every row whose Borough is "Not assigned" (modifies the frame in place)
is_unassigned = neighborhood['Borough'] == "Not assigned"
unassigned_labels = neighborhood.loc[is_unassigned].index
neighborhood.drop(index=unassigned_labels, inplace=True)
neighborhood.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [85]:
# (rows, columns) after dropping rows with an unassigned borough
neighborhood.shape

(210, 3)

More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page (the pinned revision scraped above), you will notice that M6A is listed twice and has two neighborhoods: Lawrence Heights and Lawrence Manor. These two rows will be combined into one row with the neighborhoods separated by a comma.

If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [87]:
# Combine neighbourhoods that share a postal code (and borough) into a single
# comma-separated string per group. sort=False keeps the groups in order of
# first appearance; reset_index turns the group keys back into columns.
neighborhood2 = (
    neighborhood
    .groupby(['PostalCode', 'Borough'], sort=False)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# Where the neighbourhood is 'Not assigned', fall back to the borough name.
# Series.mask is used instead of np.where because numpy is not imported in the
# notebook's import cell, so the np.where call raised NameError on a fresh kernel.
not_assigned = neighborhood2['Neighbourhood'] == 'Not assigned'
neighborhood2['Neighbourhood'] = neighborhood2['Neighbourhood'].mask(
    not_assigned, neighborhood2['Borough']
)

neighborhood2.head(15)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [88]:
# (rows, columns) after merging neighbourhoods per postal code
neighborhood2.shape

(103, 3)