### Download and Install necessary libraries

In [1]:
#!conda install beautifulsoup4
#!conda install lxml
#!conda install html5lib
#!conda install requests

# uncomment the above lines to install it in case you don't have it installed

from bs4 import BeautifulSoup
import requests
import pandas as pd

### Scrape the table of postal code in toronto from wikipedia page

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [3]:
raw_table = pd.read_html(source, header=0, attrs={"class":"wikitable sortable"})[0]
raw_table

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Rename columns

In [4]:
raw_table = raw_table.rename(columns={"Postcode":"PostalCode", "Borough":"Borough", "Neighbourhood":"Neighborhood"})

###  Ignore cells with a borough that is Not assigned

In [5]:
raw_table = raw_table.drop(raw_table.index[raw_table['Borough']=='Not assigned'])
raw_table

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Remove more than one neighborhood in one postal code area and format it

In [6]:
table=raw_table.groupby('PostalCode', sort=False, as_index=False).agg(lambda x: ', '.join(set(x)))
table

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


### Replace cells that have Neighborhood values Not assigned with Borough value

In [7]:
for i in range(table.shape[0]):
    if table.loc[i,'Neighborhood'] =='Not assigned':
        table.loc[i,'Neighborhood'] = table.loc[i,'Borough']
table

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [8]:
table.shape

(103, 3)

## Create the geocoder dataframe

In [9]:
# uncomment to install if you need it
#!conda install -c conda-forge geocoder --yes

\begin code

import geocoder # import geocoder
######################initialize your variable to None

lat_lng_coords = None

postal_code = 'M8Y'
######################loop until you get the coordinates

while(lat_lng_coords is None):

  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]

longitude = lat_lng_coords[1]

\end code

Trying the code above took longer with no results 

Hence we choose to build the dataframe with the  csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data  

In [10]:
geo_coord_table=pd.read_csv("https://cocl.us/Geospatial_data")
geo_coord_table.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
print('The table dataframe has {} dimension and geo_coord_table dataframe also has the same {} .'.format(table.shape, geo_coord_table.shape)
     ) 

The table dataframe has (103, 3) dimension and geo_coord_table dataframe also has the same (103, 3) .


#### Conform heading of both dataframe


In [12]:
geo_coord_table.rename(columns={"Postal Code":"PostalCode"}, inplace=True)
geo_coord_table.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
geo_coord_table.sort_values(by = ['PostalCode'], inplace=True)
table.sort_values(by = ['PostalCode'], inplace=True)

#### Merge both dataframe

In [14]:
Toronto_df = table.merge(geo_coord_table, left_on='PostalCode', right_on='PostalCode')
Toronto_df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"West Hill, Morningside, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Ionview, Kennedy Park, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Oakridge, Clairlea",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [15]:
Toronto_df.shape

(103, 5)

In [16]:
print('The Toronto_df dataframe has {} boroughs and {} neighborhoods.'.format(
        len(Toronto_df['Borough'].unique()),
        Toronto_df.shape[0])
     )

The Toronto_df dataframe has 11 boroughs and 103 neighborhoods.
