In [1]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from sklearn.cluster import KMeans
import folium


In [2]:
#Scrape website
# Fetch the Wikipedia page. A timeout keeps the cell from hanging forever and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# The postcode data is taken from the first <table> on the page.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the scraped page')
table_rows = table.find_all('tr')

#Output to csv file
# The context manager closes the file even if a row fails to serialise.
# newline='' is what the csv module requires so it controls line endings itself,
# and an explicit encoding keeps the output independent of the OS locale.
with open('scrape12.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])

    for tr in table_rows:
        # Strip surrounding whitespace (cell text can end with '\n') so the
        # CSV does not contain multi-line quoted fields.
        row = [cell.text.strip() for cell in tr.find_all('td')]
        # Rows without any <td> (e.g. a header row built from <th> cells)
        # would otherwise be written as an empty record.
        if row:
            csv_writer.writerow(row)
    


    

In [3]:
# Read the scraped postcode table back from disk into a pandas DataFrame
csv_file = 'scrape12.csv'
df = pd.read_csv(filepath_or_buffer=csv_file)


In [4]:
#Ignore cells for Borough which have value 'Not assigned'
# Rows with a missing Borough or Neighbourhood are dropped explicitly (the
# previous `== False` comparisons discarded them implicitly).
df = df.dropna(subset=['Borough', 'Neighbourhood'])
borough_unassigned = df['Borough'].str.contains('Not assigned', regex=False)
# .copy() yields an independent frame, so the column assignments below do not
# raise pandas' SettingWithCopyWarning on a filtered slice.
df = df.loc[~borough_unassigned].copy()

#Remove \n from rows
df['Neighbourhood'] = df['Neighbourhood'].str.replace('\n', '', regex=False).str.strip()

#Regent Park and Harbourfront into one row
# Exact comparisons are used instead of substring matching so that:
#   * names that merely contain 'Harbourfront' or 'Regent Park' are untouched;
#   * re-running this cell does not drop the already merged
#     'Regent Park, Harbourfront' row (the cell is now idempotent).
# NOTE: other postcodes that have several neighbourhoods remain on separate rows.
df = df.loc[df['Neighbourhood'].ne('Harbourfront')].copy()
df['Neighbourhood'] = df['Neighbourhood'].replace({'Regent Park': 'Regent Park, Harbourfront'})

#Replace Neighbourhood 'Not assigned' cell with the Borough name (Queen's Park)
# Using the row's own Borough instead of a hard-coded string keeps the result
# correct for any borough whose neighbourhood is not assigned.
neighbourhood_unassigned = df['Neighbourhood'].eq('Not assigned')
df.loc[neighbourhood_unassigned, 'Neighbourhood'] = df.loc[neighbourhood_unassigned, 'Borough']

In [5]:
# Display the cleaned postcode table (bare last expression -> rich DataFrame output)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
14,M3B,North York,Don Mills North


In [6]:
# Load the latitude/longitude CSV directly from its URL into a DataFrame
csv_file2 = 'http://cocl.us/Geospatial_data'
df2 = pd.read_csv(filepath_or_buffer=csv_file2)

In [7]:
# Give the key column the same name in both frames so they can be joined on it
df2.rename({'Postal Code': 'Postcode'}, axis='columns', inplace=True)

In [8]:
# Number of rows in the latitude/longitude frame
len(df2.index)

103

In [9]:
# Join the neighbourhood table with the coordinates on the shared Postcode column.
# how='inner' is pandas' default: only postcodes present in both frames are kept.
df3 = df.merge(right=df2, how='inner', on='Postcode')

In [10]:
# Display the merged table: neighbourhood rows with Latitude/Longitude attached
df3

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763
5,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
6,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
7,M1B,Scarborough,Rouge,43.806686,-79.194353
8,M1B,Scarborough,Malvern,43.806686,-79.194353
9,M3B,North York,Don Mills North,43.745906,-79.352188
