## In this notebook, I scrape the table of Toronto postal codes from Wikipedia and load it into a pandas DataFrame

In [1]:
#install Beautiful Soup and requests for Web Scaping
!pip install BeautifulSoup4
!pip install requests



In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Download the Wikipedia page and create the soup object.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops on an HTTP error response instead of parsing an error page.
source = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
source.raise_for_status()
soup = BeautifulSoup(source.text, 'lxml')

# The first element carrying the "wikitable" class is used as the data table.
table = soup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' was found on the page")

# One list of cell texts per <tr>. Header (<th>) and data (<td>) cells are both
# collected, with trailing whitespace removed from each cell's text.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]
if not rows:
    raise ValueError("The 'wikitable' element contains no rows")

# First row of the table is the header; every following row is data.
columns = rows[0]
data = rows[1:]

# Convert the scraped rows into a pandas DataFrame and preview it.
toronto_df = pd.DataFrame(data=data, columns=columns)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### Drop rows whose Borough is "Not assigned"

In [5]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
# The explicit .copy() makes the result an independent DataFrame, so the later
# column assignment does not write into a slice of the scraped table
# (which is what triggers pandas' SettingWithCopyWarning).
has_borough = toronto_df['Borough'] != 'Not assigned'
toronto_df = toronto_df.loc[has_borough].copy()
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


#### Combine rows that share a Postcode, joining their neighbourhoods with commas

In [9]:
# For every postcode, join all of its neighbourhood names into one
# comma-separated string. transform() returns one value per original row, so
# rows sharing a postcode now carry the same combined string.
combined_neighbourhoods = (
    toronto_df
    .groupby("Postcode")["Neighbourhood"]
    .transform(lambda names: ', '.join(names))
)

# assign() builds a new DataFrame instead of writing a column into a filtered
# slice. Rows that are now identical are then dropped, and the index is
# renumbered from 0.
toronto_df = (
    toronto_df
    .assign(Neighbourhood=combined_neighbourhoods)
    .drop_duplicates()
    .reset_index(drop=True)
)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


#### If the neighbourhood is "Not assigned", use the borough name as the neighbourhood

In [10]:
# Where the neighbourhood is 'Not assigned', fall back to that row's borough.
# Series.replace() is not meant for a per-row replacement Series (recent pandas
# versions reject a Series value combined with a scalar to_replace), and calling
# it with inplace=True on a column selection may only change a temporary object.
# mask() swaps in the borough element by element, aligned on the index.
neighbourhood_missing = toronto_df['Neighbourhood'] == 'Not assigned'
toronto_df = toronto_df.assign(
    Neighbourhood=toronto_df['Neighbourhood'].mask(neighbourhood_missing, toronto_df['Borough'])
)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [11]:
# Show the DataFrame's dimensions as a (rows, columns) tuple.
toronto_df.shape

(103, 3)