#                       Capstone Project for DataScience
##       The segmenting and clustering of the neighborhoods in 
##                               the City of Toronto, Canada
##                                              Part2
##            Combined the base DataFrame formed by scrpaing BeatifuSoup
##                       with their latitude and longitude information.
###                                         Krishna Yellamilli 06/30/2020

Let's start off by importing the necessary basic libraries to get compile the web info

In [1]:
import numpy as np # For mult-dimensional arrays usage

import pandas as pd # For DataFrames usage

import requests # Enough for handling web requests with REST APIs
import urllib.request # Needed for free https sites including WikiPedia
from urllib.request import urlopen # Needed for free https sites including WikiPedia

import json #For handling json file
from pandas.io.json import json_normalize # to transform json into pd.DataFrame

from bs4 import BeautifulSoup # for processing html docs

import geocoder # for getting the geolocation information from a post code
from geopy.geocoders import Nominatim # for getting the geolocation from an address

import matplotlib.cm as cm # for color maps
import matplotlib.colors as colors # for converting numbers and color arguments to RGB and RGBA schemes

from sklearn.cluster import KMeans # for clustering KMeans module

# Ensured FourSquare APIs are available

import folium # for maps

print("Importing all necessary modules/libraries completed.")

Importing all necessary modules/libraries completed.


Let's remove any limitations that hinder our interactive work just in case

In [2]:
pd.set_option('display.max_columns', None) # Removing display limitattion on columns
pd.set_option('display.max_rows', None) # Removing display limitattion on rows

Let's start get rolling by getting the info from the Wikipedia on Toronto's neighborhood info using the provided web page 

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')
#print(soup.prettify()) for debug purpose

Let's prepare the DataFrame infrastcture starting with the table column headers

In [4]:
#initializing the columns
columns = []

for tag in soup.find_all('th'): # Looping through the table column headers
    if (tag.string): # To avoid Null tag.srings
        columns.append(tag.string.rstrip())
columns

['Postal Code', 'Borough', 'Neighborhood']

Let's gather the table data in the order of post code, burough, and grouped neighborhoods

In [16]:
pc_hood = {} # There can be more than one neighborhoold in a post code
pc_brh = {} # The burough associated with the post code
i = 0 # initializing to keep track of the table rows with data
for tag in soup.find_all('td'): # Looping through the table data
    i_mod = i % 3
    if i_mod == 0:
        post_code = tag.string
    elif i_mod == 1:
        brh = tag.string
    elif i_mod == 2:
        if post_code and (brh.rstrip().upper() != 'NOT ASSIGNED'): # ignoring entries with a borough "Not Assigned"
            brh=brh.rstrip()
            nbrhood = tag.string.rstrip()
            post_code = post_code.rstrip()
            if nbrhood.upper() == 'NOT ASSIGNED': nbrhood = brh
            if post_code in pc_hood.keys():
                pc_hood[post_code] = pc_hood[post_code]+", " + nbrhood
            else: pc_hood[post_code] = nbrhood
            if post_code not in pc_brh.keys():
                pc_brh[post_code] = brh
            else: # Checking to see if a post code ends up in two different buroughs
                print("ERROR: post code {} is already associated with {} burough".format(post_code, pc_brh[post_code]))
                print("       post code {} is now being associated with {} burough".format(post_code, brh))
    i += 1 # preparing to index the next table entry
    
if (i % 3 != 0): #Error checking for conistency of processing 3 columns per row in table
    print("The table columns is not a multiple of 3")
    print("{}".format(i))

pc_lists =[] # Initializing the list
#for key in sorted(pc_brh.keys()): # Populating the list with processed table rows in the post code order for debug
for key in pc_brh.keys(): # Populating the list with processed table rows in the keys order
    pc_lists.append([key, pc_brh[key], pc_hood[key]])

T_DF=pd.DataFrame(pc_lists, columns=columns) # Initializing the dataframe
T_DF

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Checking the size of the Toronot post code DataFrame size
T_DF.shape

(103, 3)

Gathering the latitude and longitudes for the post codes

In [7]:
#Turning this code to markdown as 20 times LIMIT 
pc_lat = {} # for storing postal code's lattitude
pc_lng = {} # for storing postal code's longitude
pcs = 0
for pc in pc_brh.keys():
    lat_lng = None # Initial state
    i = 0 # Iteratio monitoring
    LIMIT = 20 # limit setting for the number of iterations
    while (lat_lng is None):
        g = geocoder.google('{} Toronto, Ontario, Canada'.format(pc)) # Using google module
        lat_lng = g.latlng
        if i > LIMIT: 
            print("Exceeded {} tries for the postal code {} look up with idex of {}.".format(i+1, pc, pcs))
            break
        else: i += 1
    if (lat_lng is not None):
        pc_lat[pc] = lat_lng[0]
        pc_lng[pc] = lat_lng[1]
    pcs += 1
for pc in pc_lat.keys():
    print ("{} {} {} {} {}".format(key, pc_brh[pc], pc_hood[pc], pc_lat[pc], pc_lng[pc]))
        

Exceeded 22 tries for the postal code M3A look up with idex of 0.
Exceeded 22 tries for the postal code M4A look up with idex of 1.
Exceeded 22 tries for the postal code M5A look up with idex of 2.
Exceeded 22 tries for the postal code M6A look up with idex of 3.
Exceeded 22 tries for the postal code M7A look up with idex of 4.
Exceeded 22 tries for the postal code M9A look up with idex of 5.
Exceeded 22 tries for the postal code M1B look up with idex of 6.
Exceeded 22 tries for the postal code M3B look up with idex of 7.
Exceeded 22 tries for the postal code M4B look up with idex of 8.
Exceeded 22 tries for the postal code M5B look up with idex of 9.
Exceeded 22 tries for the postal code M6B look up with idex of 10.
Exceeded 22 tries for the postal code M9B look up with idex of 11.
Exceeded 22 tries for the postal code M1C look up with idex of 12.
Exceeded 22 tries for the postal code M3C look up with idex of 13.
Exceeded 22 tries for the postal code M4C look up with idex of 14.
Excee

Status code Unknown from https://maps.googleapis.com/maps/api/geocode/json: ERROR - HTTPSConnectionPool(host='maps.googleapis.com', port=443): Read timed out. (read timeout=5.0)


Exceeded 22 tries for the postal code M2R look up with idex of 72.
Exceeded 22 tries for the postal code M4R look up with idex of 73.
Exceeded 22 tries for the postal code M5R look up with idex of 74.
Exceeded 22 tries for the postal code M6R look up with idex of 75.
Exceeded 22 tries for the postal code M7R look up with idex of 76.
Exceeded 22 tries for the postal code M9R look up with idex of 77.
Exceeded 22 tries for the postal code M1S look up with idex of 78.
Exceeded 22 tries for the postal code M4S look up with idex of 79.
Exceeded 22 tries for the postal code M5S look up with idex of 80.
Exceeded 22 tries for the postal code M6S look up with idex of 81.
Exceeded 22 tries for the postal code M1T look up with idex of 82.
Exceeded 22 tries for the postal code M4T look up with idex of 83.
Exceeded 22 tries for the postal code M5T look up with idex of 84.
Exceeded 22 tries for the postal code M1V look up with idex of 85.
Exceeded 22 tries for the postal code M4V look up with idex of

Multiple tries did not work with Google at different times on different days; hence used the provided csv data.

In [13]:
pc_lat_lng = pd.read_csv("C:\\Users\\krish\\DataScience\\projects\\capstone\\Geospatial_Coordinates.csv", header=0)
pc_lat_lng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [17]:
# Merging both DFs
T_DF=T_DF.join(pc_lat_lng.set_index('Postal Code'), on='Postal Code')
T_DF

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [18]:
T_DF.shape

(103, 5)