Notebook: scraping Toronto postal code data from Wikipedia into a pandas dataframe

Load all needed libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Get the page

In [2]:
# Download the Wikipedia article whose first table is scraped in the cells below.
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
wiki_response = requests.get(wikipedia_link, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as the article.
wiki_response.raise_for_status()
wiki_page = wiki_response.text

In [3]:
# Parse the downloaded HTML text using the lxml parser backend.
soup = BeautifulSoup(markup=wiki_page, features='lxml')

Find table on page and all rows from table

In [4]:
# `soup.table` returns the first <table> element in the document, or None if
# the document contains no table at all.
toronto_pc_tab = soup.table    # postcodes table
if toronto_pc_tab is None:
    # Without this check the next line fails with an opaque AttributeError on None.
    raise ValueError('No <table> element found on the page; the page layout may have changed.')
# Every <tr> inside that table, header row included.
pc_rows = toronto_pc_tab.find_all('tr')

In [5]:
# Three parallel accumulators, filled together by the row-processing loop.
postcodes, boroughs, neighborhoods = [], [], []

Process all table rows, skipping rows whose Borough is 'Not assigned'.
If a row's Neighborhood is 'Not assigned', use the Borough name as the Neighborhood.

In [6]:
# Walk the table rows (the first row is skipped, presumably the header) and
# append postcode / borough / neighborhood to the three parallel lists.
for row in pc_rows[1:]:
    # Look the cells up once per row instead of re-scanning the row per column.
    cells = row.find_all('td')
    if len(cells) < 3:
        # A row without three data cells cannot yield a record; skip it rather
        # than fail with an IndexError.
        continue

    # Strip surrounding whitespace from every cell so that a trailing newline in
    # the markup cannot defeat the 'Not assigned' comparisons below.
    postcode, borough, neighborhood = (str(cell.text).strip() for cell in cells[:3])

    if borough == 'Not assigned':
        continue  # rows without an assigned borough are ignored

    postcodes.append(postcode)
    boroughs.append(borough)
    # An unassigned neighborhood falls back to the borough name.
    neighborhoods.append(borough if neighborhood == 'Not assigned' else neighborhood)

Build the data frame, then group by postal code and borough, joining each group's neighborhoods into one comma-separated string.

In [7]:
# Assemble the three parallel lists into a frame, one row per kept table row.
cols = {
    'PostalCode': postcodes,
    'Borough': boroughs,
    'Neighborhood': neighborhoods,
}
df = pd.DataFrame(cols)

# Collapse rows that share a postal code and borough, joining their
# neighborhood names with ', ', then turn the group keys back into columns.
grouped_neighborhoods = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
dfc = grouped_neighborhoods.apply(', '.join).reset_index()

dfc.head()



Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Display (rows, columns) of the grouped frame as a quick sanity check.
dfc.shape

(103, 3)

# BEGINNING OF SECOND TASK

Load the geospatial CSV data into a dataframe, because the geocoder was not working.

In [9]:
import requests
import csv  # no longer used in this cell; retained in case later cells rely on it
import io

# Location of the CSV that supplies coordinates for each postal code.
csv_url ='http://cocl.us/Geospatial_data'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as CSV data.
download = requests.get(csv_url, timeout=30)
download.raise_for_status()

decoded_content = download.content.decode('utf-8')

# pd.read_csv infers column dtypes, so numeric columns load as numbers;
# building the frame from csv.reader rows left every column as a string.
pcodes = pd.read_csv(io.StringIO(decoded_content))

Align indices

In [10]:
# Index both frames by postal code so they can be aligned row-for-row.
# The guards make the cell safe to re-run: once the column has become the
# index, calling set_index on it again would raise KeyError.
if dfc.index.name != 'PostalCode':
    dfc = dfc.set_index('PostalCode')
if pcodes.index.name != 'Postal Code':
    pcodes = pcodes.set_index('Postal Code')

Concatenate the two dataframes on their postal-code indices to add the latitude and longitude columns.

In [11]:
# Align the two frames column-wise on their postal-code indices (pd.concat
# uses an outer join by default), adding the coordinate columns to each row.
# rename_axis names the combined index explicitly rather than relying on
# reset_index() emitting a column called 'index' to rename afterwards, which
# only works when the combined index happens to be unnamed.
dfcl = pd.concat([dfc,pcodes], axis=1).rename_axis('PostalCode').reset_index()

dfcl.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8066863,-79.1943534
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
