### Install and import required libraries and save the Wikipedia page into an object Toronto_postcodes

In [1]:
# Install the BeautifulSoup4 package with pip, run as a shell command from the notebook.
# NOTE(review): consider `%pip install` with a pinned version so the install targets
# the running kernel's environment -- confirm the IPython version in use supports it.
!pip install BeautifulSoup4



In [169]:
import requests
import pandas as pd
import numpy as np

# Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout keeps the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error response into an
# exception here instead of letting an error page be parsed as if it were the data.
response = requests.get(url, timeout=30)
response.raise_for_status()
Toronto_postcodes = response.text   # HTML of the page as a string


#### Use BeautifulSoup to parse the page data and extract the rows of the postal codes table into a new list _postcodes_.
----

In [188]:
from bs4 import BeautifulSoup

# Build the parse tree of the downloaded page using the 'html.parser' backend.
post_soup = BeautifulSoup(markup=Toronto_postcodes, features='html.parser')

In [189]:
# Every <tr> on the page becomes a list holding the stripped text of its <td>
# cells. The first <tr> is skipped because it yields an empty list.
table_rows = post_soup.find_all("tr")[1:]
postcodes = [
    [cell.text.strip() for cell in table_row.find_all("td")]
    for table_row in table_rows
]

postcodes[0:10]   # show the first 10 items

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned']]

____
#### After checking the data, remove the data after the postcode **M9Z**. 
#### Convert the list into a pandas data frame *df_postcodes* with proper column names. Then remove the rows with no Borough assigned and finally replace the missing Neighborhoods with the name of the Borough. 
____

In [190]:
# M9Z is the last relevant record: find its position and discard everything after it.
last = postcodes.index(['M9Z', 'Not assigned', 'Not assigned'])
postcodes = postcodes[:last+1]

# Convert the list of rows into a pandas data frame with named columns.
df_postcodes = pd.DataFrame(postcodes, columns=['PostalCode','Borough','Neighborhood'])

# Drop the rows whose Borough is "Not assigned". The explicit .copy() makes the
# filtered result an independent frame, so the column assignment below does not
# raise SettingWithCopyWarning.
df_postcodes = df_postcodes[df_postcodes.Borough != 'Not assigned'].copy()
print(df_postcodes.shape)

# Where the Neighborhood is "Not assigned", use the name of the Borough instead.
df_postcodes['Neighborhood'] = np.where(
    df_postcodes['Neighborhood'] == 'Not assigned',
    df_postcodes['Borough'],
    df_postcodes['Neighborhood'],
)
df_postcodes[:10]  # show the first 10 rows
    

(212, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


----
#### Merge the rows with the same postal code using the groupby method. 
----

In [191]:
# combine rows with the same postcode 
# Collapse the rows that share a postal code and borough into a single row whose
# Neighborhood is the comma-separated list of the original neighborhoods.
neighborhoods_by_code = df_postcodes.groupby(['PostalCode','Borough'])['Neighborhood']
df_postcodes = neighborhoods_by_code.apply(', '.join).reset_index()
df_postcodes.head(10)   # show the first 10 rows

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [192]:
# Display the size of the merged data frame as (rows, columns).
df_postcodes.shape

(103, 3)

----
## Getting latitude and longitude data for each postcode. 

In [8]:
# Install the geocoder package with pip (run as a shell command), then import it.
# NOTE(review): consider `%pip install` with a pinned version, and moving the import
# to the import cell at the top of the notebook -- confirm against the environment.
!pip install geocoder
import geocoder 




----
### After trying unsuccessfully to use geocoder.google, I switched to geocoder with HERE maps. This worked very well, but two postcodes, M5W (Stn A PO Boxes, 25 The Esplanade) and M7Y (Business Reply Mail Processing Centre, 969 Eastern), which are not real neighborhoods but postal processing centres, were causing errors. I decided to drop these rows, as they would not be relevant anyway for the task of clustering and comparing neighborhoods. 

In [193]:
# Drop the two postal processing centres (M5W and M7Y), which are not real
# neighborhoods. Selecting them by postal code instead of by row position
# (formerly drop([69, 87])) keeps this cell correct if the scraped table changes,
# and makes it safe to re-run: a second run simply finds nothing left to remove.
df_postcodes = df_postcodes[~df_postcodes['PostalCode'].isin(['M5W', 'M7Y'])].reset_index(drop=True)

----
### Then, using the HERE maps API and geocoder, I obtained the coordinates for each postcode.

In [185]:

import os

# HERE credentials. The environment variables take precedence so that keys do
# not have to live in the notebook.
# NOTE(review): the fallback literals are the keys that were previously hardcoded
# in this cell. They are kept only so the notebook still runs unchanged; rotate
# them and delete the fallbacks.
here_app_id = os.environ.get('HERE_APP_ID') or 'cvHwUfEGD6sBef4ybHyD'
here_app_code = os.environ.get('HERE_APP_CODE') or 'cDHPp-nV6hHhzfrcoaGoXA'

# Geocode each postal code and collect the coordinates in plain lists. The data
# frame is only touched after every lookup has succeeded, so a failure part-way
# through does not leave half-filled columns behind.
latitudes = []
longitudes = []
for postal_code in df_postcodes['PostalCode']:
    g = geocoder.here(
        '{}, Toronto, Ontario'.format(postal_code),
        app_id=here_app_id,
        app_code=here_app_code,
    )
    if not g.ok:
        # Name the offending postal code instead of failing on a missing result below.
        raise ValueError('No geocoding result for postal code {}'.format(postal_code))
    position = g.json['raw']['NavigationPosition'][0]
    latitudes.append(position['Latitude'])
    longitudes.append(position['Longitude'])

# Add the new columns in one step; assigning whole lists avoids the
# empty-string placeholder columns that were filled cell by cell.
df_postcodes['Latitude'] = latitudes
df_postcodes['Longitude'] = longitudes


In [187]:
# Preview the first 10 rows, now including the Latitude and Longitude columns.
df_postcodes.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8117,-79.1956
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7856,-79.1587
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7657,-79.1753
3,M1G,Scarborough,Woburn,43.7682,-79.2176
4,M1H,Scarborough,Cedarbrae,43.7696,-79.2394
5,M1J,Scarborough,Scarborough Village,43.7431,-79.2322
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7263,-79.2637
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7132,-79.2849
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.7236,-79.235
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.6967,-79.2601
