This notebook is part of the IBM Data Science Capstone on Coursera. The goal is to segment and cluster the neighborhoods of Toronto. I do this by scraping data about the city's boroughs from Wikipedia and then using the Foursquare API to explore places within those boroughs for the cluster analysis.

In [143]:
# webscraping
from urllib.request import urlopen as ureq

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup as soup

# for gathering latitude and longitude
from geopy.geocoders import Nominatim

# for progress bar (current latitude, longitude processing is kind of slow)
from tqdm import tqdm

import numpy
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes
import folium

In [144]:
# Wikipedia page listing the Canadian postal codes that start with "M";
# this is the page that gets downloaded and scraped below
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [145]:
# open connection, grab the raw page bytes, and close the connection again.
# The context manager closes the connection even if the read raises, so the
# open / read / close steps cannot get out of sync.
with ureq(url) as uClient:
    html = uClient.read()

In [148]:
# parse html
# Use the BeautifulSoup class directly: the name `soup` is rebound to the
# parsed document here, so calling the `soup` import alias would only work the
# first time this cell runs (a second run would call the parsed document
# itself instead of the parser).
soup = BeautifulSoup(html, "html.parser")

In [149]:
# Walk every cell of the postal-code table and collect one record per assigned
# postal code into df_dict (parallel lists: PostalCode, Borough, Neighborhoods).

# special cases (makes lat, long processing smoother): postal codes whose
# borough is replaced by a fixed value instead of the scraped one
BOROUGH_OVERRIDES = {
    "M4J": "East York",
    "M7R": "Mississauga",
    "M5W": "Downtown Toronto",
    "M7Y": "East Toronto",
    "M9W": "Etobicoke",
}

df_dict = {"PostalCode": [], "Borough": [], "Neighborhoods": []}
for tr in soup.find_all('tr'):
    left_main_table = False
    for td in tr.find_all('td'):
        # make sure we are still in the main table: every cell there has a
        # bold postal code starting with 'M' (startswith also copes with an
        # empty <b> tag, which indexing [0] would not)
        if td.b is None or not td.b.text.startswith('M'):
            left_main_table = True
            break

        # cells without a <span> carry no borough information
        if td.span is None:
            continue

        postal_code = td.b.text
        # the span text looks like "Borough(hood / hood)(...)": the piece in
        # front of the first "(" is the borough, the rest are neighborhoods
        spl = td.span.text.strip(")").split("(")
        borough = spl[0]

        # special case: this cell has a second <b> tag, which is used as the
        # single neighborhood entry
        if borough == "Queen's Park\n":
            bold_tags = td.find_all('b')
            df_dict["PostalCode"].append(bold_tags[0].text)
            df_dict["Borough"].append("Queen's Park")
            df_dict["Neighborhoods"].append([bold_tags[1].text])

        # make sure postal code is assigned
        elif borough != 'Not assigned':
            df_dict["PostalCode"].append(postal_code)
            df_dict["Borough"].append(BOROUGH_OVERRIDES.get(postal_code, borough))
            neighborhoods = []
            # loop because some cells have multiple parentheses
            for part in spl[1:]:
                # "," and "/" both separate neighborhood names
                hoods = part.replace(")", " ").replace(",", "/").split("/")
                neighborhoods.extend(hood.strip() for hood in hoods)
            df_dict["Neighborhoods"].append(neighborhoods)
    # abort mission once we leave the main table
    if left_main_table:
        break

In [150]:
# Assemble the scraped records into a table with a fixed column order
column_order = ["PostalCode", "Borough", "Neighborhoods"]
df = pd.DataFrame(df_dict, columns=column_order)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M3A,North York,[Parkwoods]
1,M4A,North York,[Victoria Village]
2,M5A,Downtown Toronto,"[Regent Park, Harbourfront]"
3,M6A,North York,"[Lawrence Manor, Lawrence Heights]"
4,M7A,Queen's Park,[Ontario Provincial Government]


In [151]:
# (rows, columns) of the scraped table: one row per postal code that was kept
df.shape

(103, 3)

In [152]:
# Geocode every postal-code row: try each neighborhood together with its
# borough, fall back to "<borough>, Toronto", and store the coordinates of the
# first match in the new Latitude / Longitude columns.
lats = []
lons = []
geolocator = Nominatim(user_agent="GeocodeEarth")
with tqdm(total=len(df)) as pbar:
    for index, row in df.iterrows():
        # reset for every row so that a row without neighborhoods can never
        # silently reuse the previous row's result
        location = None
        # handle special case
        if row['Borough'] == "Queen's Park":
            location = geolocator.geocode("Queen's Park, Toronto")
        else:
            # geocode() returns None when nothing matches; take the first hit
            for neighborhood in row['Neighborhoods']:
                location = geolocator.geocode(row['Borough'] + ', ' + neighborhood)
                if location is not None:
                    break
            if location is None:
                location = geolocator.geocode(row['Borough'] + ', Toronto')
        # fail with a message that names the row instead of an AttributeError
        # on None further down
        if location is None:
            raise ValueError(
                "could not geocode {} ({})".format(row['PostalCode'], row['Borough'])
            )
        lats.append(location.latitude)
        lons.append(location.longitude)
        pbar.update(1)

df["Latitude"] = lats
df["Longitude"] = lons
df.head(10)

100%|██████████| 103/103 [01:17<00:00,  1.22it/s]


Unnamed: 0,PostalCode,Borough,Neighborhoods,Latitude,Longitude
0,M3A,North York,[Parkwoods],43.761224,-79.323986
1,M4A,North York,[Victoria Village],43.732658,-79.311189
2,M5A,Downtown Toronto,"[Regent Park, Harbourfront]",43.661752,-79.35684
3,M6A,North York,"[Lawrence Manor, Lawrence Heights]",43.722079,-79.437507
4,M7A,Queen's Park,[Ontario Provincial Government],43.65998,-79.390369
5,M9A,Etobicoke,[Islington Avenue],43.714904,-79.554973
6,M1B,Scarborough,"[Malvern, Rouge]",43.809196,-79.221701
7,M3B,North York,[Don Mills North],43.737178,-79.343451
8,M4B,East York,"[Parkview Hill, Woodbine Gardens]",43.712078,-79.302567
9,M5B,Downtown Toronto,"[Garden District, Ryerson]",43.652722,-79.376828


In [157]:
# Centre of the map: the coordinates the geocoder returns for "Toronto"
location = geolocator.geocode("Toronto")
lat = location.latitude
lon = location.longitude

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[lat, lon], zoom_start=11)

# add markers to map
# The loop uses its own names for the marker coordinates so that `lat` / `lon`
# keep holding the city centre after this cell has run.
for marker_lat, marker_lon, borough, neighborhood in zip(
        df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhoods']):
    # label with the first neighborhood; rows without any neighborhood are
    # labelled with the borough alone instead of raising IndexError
    if neighborhood:
        label = '{}, {}'.format(neighborhood[0].strip(), borough)
    else:
        label = borough
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto