### First Let's import required libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
from pandas.io.json import json_normalize  # transform json files to pandas dataframes
from geopy.geocoders import Nominatim # 
import numpy as np
import csv
!pip install folium
import folium

print('All modules imported')

[31mtensorflow 1.3.0 requires tensorflow-tensorboard<0.2.0,>=0.1.0, which is not installed.[0m
All modules imported


## Let's start scraping the wikipedia page

In [2]:
# The wikipedia site link
site_link = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

Get the source code html data from the website

In [3]:
source = requests.get(site_link).text

Lets Use BeautifulSoup to parse it

In [4]:
soup = BeautifulSoup(source, 'lxml')

#print(soup.prettify())

Next let's get the table that contains the data we want to scrape

In [5]:
My_table = soup.find('table',{'class':'wikitable sortable'})

Let's view the table data

In [6]:
# Uncomment below to view table
# My_table

we can see that all the data we want are between the $<td>$ brackets, let's get the data between the td brackets

In [7]:
links = My_table.find_all('td')

In [9]:
# uncomment below to view links
# print(links)

Next let's loop through links and extract only the text elements

In [15]:
text_links = []

for link in links:
    text_links.append(link.text)
    
# uncommnet below to view text_links    
#text_links

### let's clean and process the table elements

Let's clean the links and keep only rows with Borough. Out of which we shall rename rows without Neighborhood as Boroughs

In [16]:
cleaned_links = []

while True:
    
    if len(text_links) < 3:
        break
    
    sub = text_links[:3]
    # If 'Not ' in borough then skip that row of data
    if 'Not ' in sub[1]:
        text_links = text_links[3:]
    else:
        cleaned_links.append(text_links[:3])
        
        # Let's strip off the \n at the end of each neighborhood data
        cleaned_links[-1][-1] = cleaned_links[-1][-1].strip('\n')
        
        # If the Borough is available but the Neighborhood is missing
        # make Neighborhood same as Borough
        if 'Not ' in cleaned_links[-1][-1]:
            cleaned_links[-1][-1] = cleaned_links[-1][-2]
        text_links = text_links[3:]
# Uncomment below to view cleaned_links       
#cleaned_links

lets check the length of the cleaned links

In [17]:
len(cleaned_links)

211

Next let's add the neighborhood data of each duplicate Postal Codes together to the first instance or row that contains the PostalCode

In [18]:
link = []
for i in range(len(cleaned_links)):
    x = cleaned_links[i][0]
    if x in link:
        cleaned_links[link.index(x)][-1] += ', ' + cleaned_links[i][-1]
    link.append(x)
    
# uncomment below
#cleaned_links

Next let's pass the cleaned _links to a data frame and set index to postal code so that we can easily work on it

In [19]:
df = pd.DataFrame(cleaned_links, columns=['PostalCode','Borough','Neighborhood'])
df.index= df.PostalCode

In [20]:
# Let's view the data frame
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A,M3A,North York,Parkwoods
M4A,M4A,North York,Victoria Village
M5A,M5A,Downtown Toronto,"Harbourfront, Regent Park"
M5A,M5A,Downtown Toronto,Regent Park
M6A,M6A,North York,"Lawrence Heights, Lawrence Manor"


Next let's Use the pandas duplicate method to drop duplicate index

In [21]:
df = df.loc[~df.index.duplicated(keep='first')]

In [22]:
# Let's see the shape so far
df.shape

(103, 3)

In [23]:
df.head()

Unnamed: 0_level_0,PostalCode,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M3A,M3A,North York,Parkwoods
M4A,M4A,North York,Victoria Village
M5A,M5A,Downtown Toronto,"Harbourfront, Regent Park"
M6A,M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,M7A,Queen's Park,Queen's Park


Next let's reset the index back and drop the current index

In [25]:
df.reset_index(drop=True, inplace=True)

# Let's see the first few rows
df.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [26]:
df.shape

(103, 3)

### Appending the Latitude and Longitude data 

Let's define a simple method that we can apply to each Borough to get its Latitude and Longitude using the apply() method

In [27]:
def latitude_longitude(Borough):
    """ Method takes a Series object and returns
    a list of Latitude and corresponding Longitude data,
    using the geopy library.
    This method also prints out the coordinate data"""
    
    address = Borough
    
    geolocator = Nominatim(user_agent="CA_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))
    return [latitude, longitude]


Let's append the list containing corresponding lat and lon data to column Latitude

In [29]:
df['Latitude'] = df.Borough.apply(latitude_longitude)

The geograpical coordinate of North York are 43.7708175, -79.4132998.
The geograpical coordinate of North York are 43.7708175, -79.4132998.
The geograpical coordinate of Downtown Toronto are 43.6541737, -79.3808116451341.
The geograpical coordinate of North York are 43.7708175, -79.4132998.
The geograpical coordinate of Queen's Park are 43.6599803, -79.3903686.
The geograpical coordinate of Etobicoke are 43.67145915, -79.5524920661167.
The geograpical coordinate of Scarborough are 54.2847601, -0.4090339.
The geograpical coordinate of North York are 43.7708175, -79.4132998.
The geograpical coordinate of East York are 43.6913391, -79.3278212.
The geograpical coordinate of Downtown Toronto are 43.6541737, -79.3808116451341.
The geograpical coordinate of North York are 43.7708175, -79.4132998.
The geograpical coordinate of Etobicoke are 43.67145915, -79.5524920661167.
The geograpical coordinate of Scarborough are 54.2847601, -0.4090339.
The geograpical coordinate of North York are 43.77081

In [30]:
# Lets see the updated data with Latitude containing lists of lats and lons data

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude
0,M3A,North York,Parkwoods,"[43.7708175, -79.4132998]"
1,M4A,North York,Victoria Village,"[43.7708175, -79.4132998]"
2,M5A,Downtown Toronto,"Harbourfront, Regent Park","[43.6541737, -79.3808116451341]"
3,M6A,North York,"Lawrence Heights, Lawrence Manor","[43.7708175, -79.4132998]"
4,M7A,Queen's Park,Queen's Park,"[43.6599803, -79.3903686]"


Next let's loop through the data frame and separate Latitude from Longitude and make lat and lons just numbers not lists

In [31]:
lon_list = []
for i, j in df.iterrows():
    lon_list.append(j.Latitude[1])
    j.Latitude = j.Latitude[0]
    
# next let's assign the lon_list as the value of the Longitude Column

df['Longitude'] = lon_list

In [33]:
# let's view the changes

df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7708,-79.413300
1,M4A,North York,Victoria Village,43.7708,-79.413300
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.6542,-79.380812
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.7708,-79.413300
4,M7A,Queen's Park,Queen's Park,43.66,-79.390369
5,M9A,Etobicoke,Islington Avenue,43.6715,-79.552492
6,M1B,Scarborough,"Rouge, Malvern",54.2848,-0.409034
7,M3B,North York,Don Mills North,43.7708,-79.413300
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.6913,-79.327821
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.6542,-79.380812
