### Applied DS Capstone - My Submission - Part III - Segmenting and Clustering Neighborhoods in Toronto

Importing pandas library

In [1]:
import pandas as pd

Reading the HTML tables from the Wikipedia page that lists Canadian postal codes beginning with M

In [2]:
# Wikipedia page listing the Canadian postal codes that begin with "M".
path = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html hands back a list with one DataFrame per table found on the page.
df_Wikipedia = pd.read_html(io=path)
type(df_Wikipedia)

list

In [3]:
# The first table parsed from the page is the postal code / borough / neighbourhood listing.
TNeighborhood = df_Wikipedia[0]
TNeighborhood.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Give the three scraped columns the names used in the following cells
# (assigned in place on the existing DataFrame).
TNeighborhood.columns = ["PostalCode", "Borough", "Neighborhood"]
print(type(TNeighborhood))
TNeighborhood.head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Eliminating "Not assigned" Boroughs and merging Neighborhoods with the same PostalCode

In [5]:
# Keep only the rows whose borough is assigned. Filtering by value (instead of
# also dropping index label 0) keeps this cell safe to re-run: after the index
# is reset below, label 0 refers to a valid postal code, not a "Not assigned" row.
TNeighborhood = TNeighborhood[TNeighborhood['Borough'] != 'Not assigned']

# Collapse rows that share the same PostalCode/Borough into a single row whose
# Neighborhood is the comma-joined list of the group's neighborhoods.
# sort=False preserves the order in which the postal codes first appear.
TNeighborhood = (
    TNeighborhood
    .groupby(['PostalCode', 'Borough'], sort=False)['Neighborhood']
    .agg(','.join)
    .reset_index()
)
TNeighborhood

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Checking if a "Not assigned" Neighborhood is still present in final TNeighborhood dataframe

In [6]:
# Boolean mask of rows whose neighborhood is still the "Not assigned" placeholder;
# summing the mask counts them.
df_NANeighborhood = TNeighborhood['Neighborhood'].eq("Not assigned")
df_NANeighborhood.sum()

0

In [7]:
# Dimensions of the cleaned table as (rows, columns).
TNeighborhood.shape

(103, 3)

Retrieving the Geographical Coordinates of the Postal Codes

In [8]:
# CSV holding one latitude/longitude pair per postal code.
path2 = "http://cocl.us/Geospatial_data"
df_Geo = pd.read_csv(filepath_or_buffer=path2)
print(type(df_Geo))
df_Geo.head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Changing the "Postal Code" column name to "PostalCode" so it matches the neighborhood dataframe

In [9]:
# Align the key column name with the one used in TNeighborhood.
df_Geo = df_Geo.rename({"Postal Code": "PostalCode"}, axis="columns")
df_Geo.head(5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Structuring the final dataframe

In [10]:
# Attach the coordinates to each postal code by joining the two dataframes on
# their shared 'PostalCode' column. The join is by key value, not row position,
# so the differing row order of the two tables does not matter. With an inner
# join, postal codes missing from either table are left out of the result.
TNeighborhood_final = pd.merge(TNeighborhood, df_Geo, on='PostalCode', how='inner')
TNeighborhood_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


__Geopy library is used to get Toronto's latitude and longitude values__

In order to define an instance of the geocoder, a user_agent named "t_explorer" is passed to it.

In [11]:
import folium
import geopy
# Converting an address into latitude and longitude
from geopy.geocoders import Nominatim 

# Geocode the city name; the resulting point is used to centre the maps below.
address = 'Toronto, CA'
geolocator = Nominatim(user_agent="t_explorer")
location = geolocator.geocode(address)
# geocode() gives back None when the service finds no match; stop here with a
# clear message rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Creating a map of Toronto using latitude and longitude and plotting all boroughs and neighborhoods

In [12]:
# Base map centred on the geocoded Toronto coordinates.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of the merged table; each popup reads
# "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
marker_rows = TNeighborhood_final[marker_columns].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto

Simplifying the above map: only the neighborhoods of __Boroughs that contain the word "Toronto"__ will be displayed, slicing the original dataframe.

In [13]:
# Rows whose borough name contains the word "Toronto".
is_toronto_borough = TNeighborhood_final['Borough'].str.contains('Toronto')

# Keep those rows, renumber them from 0 and discard the 'PostalCode' column.
df_OnlyToronto = (
    TNeighborhood_final[is_toronto_borough]
    .reset_index(drop=True)
    .drop(columns=['PostalCode'])
)
df_OnlyToronto

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,Downtown Toronto,St. James Town,43.651494,-79.375418
4,East Toronto,The Beaches,43.676357,-79.293031
5,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,Downtown Toronto,Christie,43.669542,-79.422564
8,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [14]:
# Base map centred on the geocoded Toronto coordinates.
map_OnlyToronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of the Toronto-only table; each popup reads
# "<neighborhood>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
marker_rows = df_OnlyToronto[marker_columns].itertuples(index=False, name=None)
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_OnlyToronto)

map_OnlyToronto