# Step 1: Scrape and clean the Wikipedia table of Canadian postal codes (M prefix)

In [2]:
!pip install beautifulsoup4

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [200]:
from bs4 import BeautifulSoup
import urllib2
import pandas as pd
# Download the HTML source of the Wikipedia page listing Canadian postal codes.
page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = urllib2.urlopen(page)
source = response.read()

# Parse the page and take the first table on it as the postal code table.
# The dataframe header is built from the table's 'th' tags: the first child
# of each tag is used, with newline characters removed.
soup = BeautifulSoup(source, 'html.parser')
table = soup.find_all('table')[0]
headers = table.find_all('th')
df_headers = [header.contents[0].replace('\n', '') for header in headers]

In [201]:
# Build the rows of the dataframe from the table's 'td' tags.
# NOTE: every dataframe row corresponds to 3 consecutive 'td' tags.
# NOTE: a cell may contain an 'a' tag; if so, the text of its first 'a' tag
#       is used instead of the cell's own first child.
# NOTE: some of the strings contain '\n'; it is removed.
CELLS_PER_ROW = 3

contents = table.find_all('td')
df_data = []
row = []
for content in contents:
    links = content.find_all('a')
    # Prefer the first link when the cell is hyperlinked, else the cell itself.
    text_holder = links[0] if links else content
    row.append(text_holder.contents[0].replace('\n', ''))
    if len(row) == CELLS_PER_ROW:
        # Flush as soon as a row is complete. Flushing only at the start of
        # the next iteration would silently lose the last row of the table.
        df_data.append(row)
        row = []
# Any trailing cells that do not fill a complete row are intentionally
# left out of df_data.

In [202]:
# Assemble the scraped rows and the header labels into a dataframe.
df = pd.DataFrame(df_data, columns=df_headers)

In [203]:
# Discard the rows whose Borough is "Not assigned".
# NOTE: the drop is done in place because the removed rows are not used again.
is_unassigned_borough = df['Borough'] == "Not assigned"
df.drop(df.loc[is_unassigned_borough].index, inplace=True)

In [204]:
# Preview the first rows only; rendering the whole frame floods the output.
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [210]:
# More than one neighbourhood can exist in one postal code area.
# For example, in the table on the Wikipedia page, M5A is listed twice and
# has two neighbourhoods: Harbourfront and Regent Park. Such rows are
# combined into one row per (Postcode, Borough) pair, with the
# neighbourhoods separated by a comma.
def _join_with_commas(values):
    """Join the values of one group into a single comma-separated string."""
    return ",".join(values.tolist())


cleaned_df = df.groupby(['Postcode', 'Borough'], as_index=False).agg(_join_with_commas)

In [211]:
# Show the rows whose neighbourhood is still "Not assigned".
cleaned_df.loc[cleaned_df['Neighbourhood'] == "Not assigned"]

Unnamed: 0,Postcode,Borough,Neighbourhood
85,M7A,Queen's Park,Not assigned


In [212]:
# If a row has a borough but a "Not assigned" neighbourhood, the
# neighbourhood is set to the borough. So for the 9th cell in the table on
# the Wikipedia page, both the Borough and the Neighbourhood columns will be
# Queen's Park.
#
# A boolean mask with .loc is used rather than
# Series.replace("Not assigned", <Series>, inplace=True): newer pandas
# releases reject a Series as the replacement for a scalar, and an in-place
# call on a column reached through attribute access is not guaranteed to
# write back into cleaned_df.
is_unassigned = cleaned_df['Neighbourhood'] == "Not assigned"
cleaned_df.loc[is_unassigned, 'Neighbourhood'] = cleaned_df.loc[is_unassigned, 'Borough']

In [214]:
# Dimensions (rows, columns) of the cleaned dataframe.
cleaned_df.shape

(103, 3)

# Step 2: Add latitude and longitude to each postal code

In [216]:
!pip install geocoder

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [221]:
# NOTE: this cell runs no geocoding code; it holds two bare string literals.
# The first is a note explaining why a CSV dataset is used. The second
# contains the abandoned geocoder lookup and, being the last expression of
# the cell, is echoed back as the cell output.
'''
Geocoder is not working. Dataset https://cocl.us/Geospatial_data used.
'''
'''
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format("M5G"))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
'''

'\nimport geocoder # import geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\n# loop until you get the coordinates\nwhile(lat_lng_coords is None):\n  g = geocoder.google(\'{}, Toronto, Ontario\'.format("M5G"))\n  lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]\n'

In [225]:
# Load the postal code coordinates from the local CSV file.
df_geocode = pd.read_csv('../datasets/Geospatial_Coordinates.csv')
# Preview the first rows only; rendering the whole frame floods the output.
df_geocode.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [228]:
# Attach the coordinates to the cleaned postal codes. The left join keeps
# every row of cleaned_df, matching its 'Postcode' column against the
# 'Postal Code' column of df_geocode.
conjuct_df = pd.merge(cleaned_df, df_geocode, left_on='Postcode', right_on='Postal Code', how='left')
# Preview the first rows only; rendering the whole frame floods the output.
conjuct_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",M1N,43.692657,-79.264848
