# Coursera Applied Data Science Capstone / Week 3 / PART I

Importing Libraries

In [54]:
# import libraries

from io import StringIO

import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

Now for the scraping

In [19]:
# reading the webpage data
# The URL carries an explicit `oldid`, i.e. it requests one specific revision of the page.
httpurl='https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=954731857'
# A timeout keeps the cell from hanging forever if the server never answers.
downloadeddata = requests.get(httpurl, timeout=30)
# Stop here on an HTTP error instead of trying to parse an error page below.
downloadeddata.raise_for_status()

dataset = downloadeddata.text
hotsoup = BeautifulSoup(dataset,'html.parser')
# find() returns the first <table> element of the page.
boroughtable=hotsoup.find('table')
#develop dataframe
# read_html expects a path/URL or a file-like object; handing it a literal
# HTML string is deprecated in recent pandas, so wrap the markup in StringIO.
downloaded_table = pd.read_html(StringIO(str(boroughtable)))[0]
downloaded_table.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


Processing and preparing dataframe

In [20]:
# Drop the first *row* (index label 0: M1A / "Not assigned").
# DataFrame.drop works on row labels by default, so the original `drop(0)` never
# removed a column. Reassigning with errors='ignore' keeps the cell safe to
# re-run: the in-place version raised KeyError once label 0 was already gone.
downloaded_table = downloaded_table.drop(index=0, errors='ignore')
#Rename those columns
downloaded_table.columns = ['PostalCode','Borough','Neighborhood']
downloaded_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights


Getting rid of "not assigned" boroughs (cleanin' up)

In [21]:
# Keep only the rows whose Borough text does not contain "Not assigned".
# reset_index() without drop=True keeps the old row labels in an 'index' column.
has_borough = downloaded_table['Borough'].str.contains("Not assigned").eq(False)
downloaded_tableII = downloaded_table.loc[has_borough].reset_index()
downloaded_tableII.head()

Unnamed: 0,index,PostalCode,Borough,Neighborhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Regent Park / Harbourfront
3,5,M6A,North York,Lawrence Manor / Lawrence Heights
4,6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


If several rows share the same PostalCode and Borough, combine their neighborhoods into a single comma-separated row

In [28]:
# One output row per (PostalCode, Borough) pair; the neighborhoods of each
# pair are joined into a single ', '-separated string.
neighborhoods_by_code = downloaded_tableII.groupby(['PostalCode', 'Borough'])['Neighborhood']
downloaded_tableIII = neighborhoods_by_code.apply(lambda names: ', '.join(names)).reset_index()
downloaded_tableIII.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


How much data is in here?

In [43]:
# Shape (rows, columns) of the grouped table.
# NOTE(review): the saved output (103, 5) and the execution count In [43]
# suggest this cell was last run after the merge cell (In [41]). The groupby
# result shown above has only three columns (PostalCode, Borough,
# Neighborhood), so a top-to-bottom run would report 3 columns here.
downloaded_tableIII.shape

(103, 5)

# PART II with geospatial coordinates

Getting the data from the geospatial csv

In [37]:
# Load the CSV of per-postal-code latitude/longitude values into a dataframe.
url = 'https://cocl.us/Geospatial_data/Geospatial_Coordinates.csv'
GeoFrame = pd.read_csv(filepath_or_buffer=url)
GeoFrame.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Cleaning up and renaming the columns

In [39]:
# Rename 'Postal Code' to 'PostalCode' so the key column has the same name
# as in the neighbourhood table.
GeoFrame = GeoFrame.rename({'Postal Code': 'PostalCode'}, axis='columns')
GeoFrame.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [None]:
# Next: merge the two tables on their shared PostalCode column

In [41]:
# Attach Latitude/Longitude to every postal code by joining on PostalCode.
# Only the three base columns go into the merge, so re-running this cell does
# not merge the coordinates a second time (which would yield Latitude_x /
# Latitude_y columns).
merged_with_geo = pd.merge(
    downloaded_tableIII[['PostalCode', 'Borough', 'Neighborhood']],
    GeoFrame,
    on='PostalCode',
)
# pd.merge defaults to an inner join, so a postal code missing from GeoFrame
# would be dropped silently and a duplicated one would add rows. Check the row
# count before overwriting the table.
assert len(merged_with_geo) == len(downloaded_tableIII), (
    f'merge changed the row count: {len(downloaded_tableIII)} -> {len(merged_with_geo)}'
)
downloaded_tableIII = merged_with_geo
downloaded_tableIII.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


How much data is in here?  We have to check that it still has the same number of rows as before, to make sure the merge went all right

In [42]:
# Shape (rows, columns) of the merged table; the row count is what gets
# compared against the table from before the merge.
downloaded_tableIII.shape

(103, 5)

Still same number of rows -- still in business

<b> This is Part III </b>

In [50]:
# Restrict the data to boroughs whose name contains 'Toronto'.

in_toronto = downloaded_tableIII['Borough'].str.contains('Toronto')
FinalSetPre = downloaded_tableIII.loc[in_toronto]
# Renumber the remaining rows from 0; the old row labels are discarded.
FinalSet = FinalSetPre.reset_index(drop=True)
FinalSet.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [51]:
# Make a label marker for the final map, based on Borough so it will be easily recognizable
# Series.map with an explicit dict replaces the list-based replace(): swapping
# strings for ints through replace() relies on implicit object -> int
# downcasting, which recent pandas versions deprecate.
# Note: a borough that is not in the dict becomes NaN here (replace() would
# have left the borough string in place).
borough_labels = {
    'Downtown Toronto': 1,
    'Central Toronto': 2,
    'West Toronto': 3,
    'East Toronto': 4,
}
FinalSet['Label'] = FinalSet['Borough'].map(borough_labels)
FinalSet.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Label
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,4
1,M4K,East Toronto,The Danforth West / Riverdale,43.679557,-79.352188,4
2,M4L,East Toronto,India Bazaar / The Beaches West,43.668999,-79.315572,4
3,M4M,East Toronto,Studio District,43.659526,-79.340923,4
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,2


In [52]:
# Where is Toronto?  We need this to create a base of cluster around toronto
address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message here instead of an AttributeError on the attribute access below.
if location is None:
    raise RuntimeError(f'Could not geocode {address!r}; cannot centre the map.')
latitude = location.latitude
longitude = location.longitude
print(f'The geograpical coordinate of Toronto are {latitude}, {longitude}.')

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [59]:
# finally we create the visualization
# One colour per distinct label. The labels are used as 1-based positions into
# `rainbow` below, so they are expected to be the integers 1..kclusters.
kclusters=len(FinalSet.Label.unique())

# create map
MapOfToronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: kclusters evenly spaced rainbow colours as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# Borough is iterated together with the coordinates so each popup shows the
# borough of its own marker; the previous code stringified the whole Borough
# column into every popup.
for lat, lon, borough, cluster in zip(FinalSet['Latitude'], FinalSet['Longitude'],
                                      FinalSet['Borough'], FinalSet['Label']):
    label = folium.Popup(str(borough) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(MapOfToronto)

In [60]:
# Bare last expression: the notebook displays the map as this cell's output.
MapOfToronto

All done.  Pretty cool vis.