## Scraping Toronto neighborhood data from the web

In [1]:
# importing library which is used to open URLs.
import urllib.request

In [2]:
# URL of the Wikipedia page we are going to scrape.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# Download the page and keep its raw HTML bytes in the page variable.
# The with-block closes the HTTP response as soon as the body has been read,
# so the connection is not left open for the rest of the session.
with urllib.request.urlopen(url) as response:
    page = response.read()

In [4]:
# import the BeautifulSoup library so we can parse HTML and XML documents.
from bs4 import BeautifulSoup

In [5]:
# Parse the downloaded HTML held in page into a BeautifulSoup parse tree,
# using the lxml parser.
soup = BeautifulSoup(page,'lxml')

In [6]:
# Show the page's <title> element, including its opening and closing tags.
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [7]:
# Extract only the text of the title, without the surrounding <title> tags.
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [8]:
# Use find_all to collect every <table> element on the page in a list.
# NOTE(review): all_tables is not read by any of the cells visible here; the
# table of interest is looked up directly on soup -- confirm whether this
# variable is still needed.
all_tables = soup.find_all('table')

In [9]:
# Select the first <table> whose class attribute is 'wikitable sortable';
# this is the table the rows are read from below.
table = soup.find('table',class_='wikitable sortable')
# find() returns None when nothing matches. Fail here with a clear message
# instead of an AttributeError later, when the rows of the table are read.
if table is None:
    raise ValueError("no table with class 'wikitable sortable' found on the page")


In [10]:
# There are three columns in the table that we want to scrape, so we set up
# three empty lists (A, B and C) to collect them, one list per column.
A = []
B = []
C = []

# Use the Beautiful Soup find_all function again, this time looking for 'tr'
# tags, and loop through the rows one by one.
for row in table.find_all('tr'):

    # Search each row for its <td> cells.
    cell = row.find_all('td')

    # Only rows with exactly three <td> cells (one per column) are kept; any
    # other row (e.g. a row made of header cells) is skipped.
    if len(cell) == 3:
        # get_text() joins all the text inside the cell, so a cell whose text
        # is split across nested tags is captured in full, and an empty cell
        # yields '' rather than None. Trailing '\n' characters are kept here
        # and are removed later on the DataFrame.
        A.append(cell[0].get_text())
        B.append(cell[1].get_text())
        C.append(cell[2].get_text())

## Creating a DataFrame from the extracted data

In [11]:
# Ignore warnings: the "ignore" filter is installed with no category or module
# restriction, so it applies to every warning raised from here on
# (presumably to keep the cell outputs uncluttered).
import warnings
warnings.filterwarnings("ignore")

In [12]:
# Assemble the three scraped lists into one DataFrame, one column per list,
# and preview the first rows.
import pandas as pd
df = pd.DataFrame({'PostalCode': A, 'Borough': B, 'Neighborhood': C})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,\n
1,M2A\n,Not assigned\n,\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n


In [13]:
# Remove every '\n' character from the cells, one column at a time.
for column in ('Borough', 'PostalCode', 'Neighborhood'):
    df[column] = df[column].replace('\n', '', regex=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [14]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the remaining rows from 0 without keeping the old index.
has_borough = df['Borough'] != "Not assigned"
df_filtered = df.loc[has_borough]
df_toronto = df_filtered.reset_index(drop=True)
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [15]:
# Replace the separator '/' in the Neighborhood column with ','.
# Only the '/' character itself is swapped; the spaces around it are left
# untouched, so 'a / b' becomes 'a , b'.
df_toronto['Neighborhood'] = df_toronto['Neighborhood'].replace(
    to_replace='/',
    value=',',
    regex=True,
)
df_toronto.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [16]:
# Show the size of the cleaned table as (rows, columns).
df_toronto.shape

(103, 3)