In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

## First Step
Scrape the HTML page with BeautifulSoup, as learned in Module 5 (new version of the IBM Data Science Course).

In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M".
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
html_data_model = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Fail loudly on an HTTP error (4xx/5xx) instead of silently parsing an
# error page as if it contained the postal-code table.
html_data_model.raise_for_status()
html_data = html_data_model.text
soup = BeautifulSoup(html_data, 'html.parser')  # soup object used to scrape the page

## Second Step
Look for the `tbody` element (the one we need is the first one on the wiki page), then find all `td` (table data) elements inside it.  
Keep only the text of each `td`, remove the `\n` characters that come with it, and append each text to the list `l`.  
Convert the list into a 180 x 3 dataframe.

In [3]:
# Column labels of the resulting DataFrame; the flat list of table cells is
# folded into rows of this width.
COLUMN_NAMES = ["Postal Code", "Borough", "Neighbourhood"]
N_COLS = len(COLUMN_NAMES)

# The first <tbody> on the page is the one holding the postal-code table.
table_body = soup.find('tbody')
if table_body is None:
    raise ValueError("No <tbody> element found in the downloaded page")

# Text of every <td> cell, in document order, with newline characters removed.
cell_texts = [td.text.replace('\n', '') for td in table_body.find_all('td')]

# Every row must contribute exactly N_COLS cells; otherwise the reshape below
# would either fail or silently misalign the columns.
if len(cell_texts) % N_COLS != 0:
    raise ValueError(
        f"Expected a multiple of {N_COLS} table cells, got {len(cell_texts)}"
    )

rows = len(cell_texts) // N_COLS  # number of rows = number of cells / number of columns
l = np.reshape(cell_texts, (rows, N_COLS))  # flat list -> (rows, N_COLS) matrix

df = pd.DataFrame(l, columns=COLUMN_NAMES)
df.shape

(180, 3)

## Third Step
Clean the dataframe: remove the rows whose 'Borough' is 'Not assigned', and where 'Neighbourhood' is 'Not assigned', copy the value from the 'Borough' column into the 'Neighbourhood' column.  
It was not necessary to combine rows in the 'Neighbourhood' column, because the table on the wiki page already lists the neighbourhoods of each postal code combined in a single row.

In [4]:
# Treat the literal string 'Not assigned' as a missing value in both columns.
# The results are assigned to new Series rather than calling
# df['col'].replace(..., inplace=True): in-place methods on a column selection
# are chained assignment, which pandas 2.x warns about and which does not
# update the parent DataFrame under Copy-on-Write.
borough = df['Borough'].replace('Not assigned', np.nan)

# Where the neighbourhood is missing, fall back to the borough name.
neighbourhood = df['Neighbourhood'].replace('Not assigned', np.nan).fillna(borough)

# Write both cleaned columns back, drop rows that have no borough,
# and renumber the index from 0.
df = (
    df.assign(Borough=borough, Neighbourhood=neighbourhood)
      .dropna(subset=['Borough'])
      .reset_index(drop=True)
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Save the dataframe as a CSV file (without the index column).
# The path is a raw string so the backslashes of the Windows path are taken
# literally; in a normal string '\P', '\w' and '\d' are invalid escape sequences.
# NOTE(review): this is an absolute, machine-specific path - consider a
# configurable data directory so the notebook runs elsewhere.
df.to_csv(r'L:\PythonProjects\week_3\datasets\df.csv', index=False)

In [6]:
# Display the (rows, columns) shape of the dataframe.
df.shape

(103, 3)