In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import folium

In [2]:
# Print a greeting for the capstone project course.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


PART 1

In [3]:
# Scrape the first table on the Wikipedia postal-code page and rename its columns.
Df = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)[0]
Df.columns = ["PostalCode", "Borough", "Neighborhood"]
# Remove the row labelled 0 (presumably the table's header row parsed as
# data -- TODO confirm with the installed pandas version).
Df = Df.drop(index=0)

In [4]:
# Keep only the rows whose borough is something other than "Not assigned".
condition = Df['Borough'].ne('Not assigned')
Df = Df.loc[condition]

In [5]:
# If a row has a borough but a "Not assigned" neighborhood, the neighborhood
# becomes the same as the borough (e.g. Queen's Park appears in both columns).
#
# The write goes through a single .loc call on Df. Writing to
# Df.iloc[i]['Neighborhood'] instead is chained indexing: it can modify a
# temporary copy of the row and leave Df itself unchanged.
unassigned = Df['Neighborhood'] == "Not assigned"
Df.loc[unassigned, 'Neighborhood'] = Df.loc[unassigned, 'Borough']

In [6]:
# A postal code can cover several neighborhoods (M5A, for example, is listed
# for both Harbourfront and Regent Park). Collapse such rows into one row per
# postal code, with the neighborhood names joined by ", ".
#
# Steps: build the joined names per postal code, keep the first row of each
# postal code, then swap the joined column in for the original one.
Df = (
    Df.assign(
        new_column=lambda frame: frame.groupby(['PostalCode'])['Neighborhood']
        .transform(lambda names: ', '.join(names))
    )
    .drop_duplicates(subset="PostalCode")
    .drop(columns="Neighborhood")
    .rename(columns={"new_column": "Neighborhood"})
)

In [7]:
# Show the dimensions of the cleaned dataframe as a (rows, columns) tuple
# via the .shape attribute.
Df.shape

(103, 3)

In [8]:
# Preview the first rows of the cleaned table (one row per postal code).
Df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Harbourfront, Regent Park"
7,M6A,North York,"Lawrence Heights, Lawrence Manor"
9,M7A,Queen's Park,Queen's Park


PART 2

In [9]:
# Read the postal-code coordinates and combine them with the cleaned table
# into ONE dataframe.
Geodata = pd.read_csv("Geospatial_Coordinates.csv")
# Rename the columns so the join key is called "PostalCode", as in Df.
Geodata.columns = ["PostalCode", "Latitude", "Longitude"]
# Join on the postal code (pandas' default inner join).
Df_geo = Df.merge(Geodata, on="PostalCode")



In [10]:
# Result: preview the first rows of the merged table, which now carries
# Latitude and Longitude next to each postal code.
Df_geo.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


PART 3

In [11]:
# Filter for boroughs that contain the word "Toronto".
#
# The substring test is vectorized over the whole column:
#   * regex=False matches "Toronto" literally (case-sensitive),
#   * na=False treats a missing borough as "no match" instead of raising.
# Boolean indexing keeps the original row labels of Df_geo.
in_toronto = Df_geo['Borough'].str.contains("Toronto", regex=False, na=False)
Df_geo_T = Df_geo[in_toronto]


In [12]:
# Preview the first rows kept by the "Toronto" borough filter.
Df_geo_T.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [13]:
# Latitude and longitude values of Toronto (source = latlong.net); the map
# cell below uses them as the map's starting location.
Latitude_toronto = 43.651070
# NOTE: the variable name is misspelled ("Longitutde"); it is kept as-is
# because the map cell reads it under exactly this name.
Longitutde_toronto = -79.347015

To use the Foursquare API correctly, our dataframe needs to be adjusted so that each neighborhood has its own row. We will therefore reverse the following step from Part 1:

"More than one neighborhood can exist in one postal code area. For example, in the table on the 
Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: 
Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods 
separated with a comma as shown in row 11 in the above table."


In [14]:
# Rebuild the Toronto table with one row per neighborhood: read the data,
# filter out "Not assigned" boroughs, fill "Not assigned" neighborhoods,
# merge the geo data and keep only Toronto boroughs (as in Parts 1-3), but
# WITHOUT combining the neighborhoods that share a postal code.
#
# NOTE: this cell rebinds Df, condition and Geodata from the earlier parts.

# --- Read the postal-code table -------------------------------------------
Df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]
Df.columns = ["PostalCode", "Borough", "Neighborhood"]
# Remove the row labelled 0 (presumably the table's header row parsed as
# data -- TODO confirm with the installed pandas version).
Df = Df.drop(index=0)

# --- Drop postal codes without a borough ----------------------------------
condition = Df['Borough'] != 'Not assigned'
Df = Df[condition]

# --- A "Not assigned" neighborhood takes its borough's name ---------------
# Use a single .loc assignment: writing to Df.iloc[i]['Neighborhood'] is
# chained indexing and can modify a temporary copy instead of Df.
unassigned = Df['Neighborhood'] == "Not assigned"
Df.loc[unassigned, 'Neighborhood'] = Df.loc[unassigned, 'Borough']

# --- Attach coordinates (inner join on the postal code) -------------------
Geodata = pd.read_csv("Geospatial_Coordinates.csv")
Geodata.columns = ["PostalCode", "Latitude", "Longitude"]
Df_geo_new = pd.merge(Df, Geodata, on = 'PostalCode')

# --- Keep boroughs whose name contains "Toronto" --------------------------
# Vectorized literal, case-sensitive match; a missing borough counts as
# "no match". Boolean indexing keeps the row labels of Df_geo_new.
in_toronto = Df_geo_new['Borough'].str.contains("Toronto", regex=False, na=False)
Df_geo_new_T = Df_geo_new[in_toronto]

In [15]:
# Preview the Toronto-only table built above, where neighborhoods sharing a
# postal code were not combined.
Df_geo_new_T.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
13,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937
14,M5B,Downtown Toronto,Garden District,43.657162,-79.378937
27,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418


In [16]:
# Build the base map, centred on the Toronto latitude/longitude values.
map_toronto = folium.Map(
    location=[Latitude_toronto, Longitutde_toronto],
    zoom_start=10,
)

# Appearance shared by every marker on the map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Add one circle marker per row, with a "neighborhood, borough" popup.
for row in Df_geo_new_T.itertuples(index=False):
    popup_text = '{}, {}'.format(row.Neighborhood, row.Borough)
    folium.CircleMarker(
        [row.Latitude, row.Longitude],
        popup=folium.Popup(popup_text, parse_html=True),
        **marker_style
    ).add_to(map_toronto)

# Map of Toronto with neighborhoods in boroughs that contain the word "Toronto"
map_toronto