In [1]:
# The code was removed by Watson Studio for sharing.

# Segmenting and Clustering Neighborhoods in Toronto

## Part 1: Scraping and Cleaning

### Scraping the wikipage

Let's start by importing the libraries necessary for web scraping.

In [2]:
# Importing libraries
import pandas as pd
import requests # for making standard html requests

Next up: scraping the page using pd.read_html()

In [3]:
# Request data target URL
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df = pd.read_html(url, header = 0)

In [4]:
# Assign the table to the variable df
df = df[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data Cleaning

Not interested in Boroughs that are not assigned. Clear those out.

In [5]:
# Borough should not be 'Not assigned'
print('Count "Not assigned" in Borough: {}'.format(list(df['Borough'] == "Not assigned").count(True)))
df = df.iloc[list(df['Borough'] != "Not assigned")].reset_index().drop(['index'], axis = 1)
df.head()

Count "Not assigned" in Borough: 77


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Checking if there are any Neighbourhoods with the value 'Not assigned'.

In [6]:
#There are no cells with the value 'Not assigned'
print('Count "Not assigned" in Neighbourhood: {}'.format(list(df['Neighbourhood'] == "Not assigned").count(True)))

Count "Not assigned" in Neighbourhood: 0


Checking for duplicates. If the returned dictionary is empty, there are none found.

In [7]:
def getDuplicatesWithCount(listOfElems):
    ''' Get frequency count of duplicate elements in the given list '''
    dictOfElems = dict()
    # Iterate over each element in list
    for elem in listOfElems:
        # If element exists in dict then increment its value else add it in dict
        if elem in dictOfElems:
            dictOfElems[elem] += 1
        else:
            dictOfElems[elem] = 1    
 
    # Filter key-value pairs in dictionary. Keep pairs whose value is greater than 1 i.e. only duplicate elements from list.
    dictOfElems = { key:value for key, value in dictOfElems.items() if value > 1}
    # Returns a dict of duplicate elements and thier frequency count
    return dictOfElems

In [8]:
# There are no duplicates.
dictOfElems = getDuplicatesWithCount(list(df['Postal Code']))
dictOfElems

{}

Checking shape.

In [9]:
print('Number of rows: {}'.format(df.shape[0]))

Number of rows: 103


## Part 2: assigning geographical coordinates

In [10]:
# The code was removed by Watson Studio for sharing.

In [11]:
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

In [15]:
list_lat = []
list_long = []
geolocator = Nominatim(user_agent="foursquare_agent")
location = None
i = 0

for postal_code in df['Postal Code']:
    location = None
    address = '{}, Toronto, Ontario'.format(postal_code)
    geolocator = Nominatim(user_agent="foursquare_agent")
    while(location is None and i < 10):
        location = geolocator.geocode(address)
        i = i + 1
    if location != None:
        list_lat.append(location.latitude)
        list_long.append(location.longitude)
    else:
        list_lat.append(None)
        list_long.append(None)

In [16]:
print(list_lat)
print(list_long)

[43.6534817, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]
[-79.3839347, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, N

Only received the first coordinates. None of the others.

### Geocoder

Let's try out the Geocoder package and the geocoder.google function. First let's install geocoder

In [21]:
#!pip install geocoder

In [22]:
# Importing geocoder
import geocoder

The code performs as follows:

1. Initiliazation of values:
    1. Lists that store our values.
    1. i is to count the number of times we have tried a certain postal code. It is to prevent an infinite loop.
    1. lat_lng_coords temporarily stores the coordinates of a single postal code
    1. g stores the API call
1. A for loop will go over every postal code in the data frame
    1. The initial values will be reset
    1. The address will be formatted
    1. The address will be called in a while loop, using i as a safety
    1. If the API has found the coordinates, they will be appended to their respective lists. If not, None values will be appended.

In [None]:
list_lat = []
list_long = []
i = 0
lat_lng_coords = None
g = None

for postal_code in df['Postal Code']:
    lat_lng_coords = None
    g = None
    i = 0
    address = '{}, Toronto, Ontario'.format(postal_code)
    while(lat_lng_coords is None and i < 2):
        g = geocoder.google(address)
        lat_lng_coords = g.latlng
        i = i + 1
    if lat_lng_coords != None:
        list_lat.append(lat_lng_coords[0])
        list_long.append(lat_lng_coords[1])
    else:
        list_lat.append(None)
        list_long.append(None)
    print(address)

In [None]:
print(list_lat)
print(list_long)

Didn't obtain any data either. In that case, let us proceed using the provided data.

### Data Merge using provided data

Since we can't obtain the information with Geocoder either. We will continue using the data provided by the instructor.

In [24]:
# Code if we can't get any coordinates with Geocoder. Project is the project called from a hidden cell using project_lib.
geospatial_coordinates = project.get_file("Geospatial_Coordinates.csv")
geospatial_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Now that we have acquired the latitude and longitude data. Let's merge the data into a single dataframe.

In [25]:
df_full = df
df_full = df_full.merge(right = geospatial_coordinates, on = 'Postal Code')
df_full.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
