# Toronto neighborhood webscrape

## Section 1

In [1]:
# Third party imports
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

In [2]:
# Retrieve the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get(
	url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
	timeout=30,
)
print(response.status_code)
# Stop here with a clear HTTP error instead of trying to parse an error page later on.
response.raise_for_status()

200


In [3]:
# Parse the downloaded page, then print its main heading to confirm it is the expected article
soup = BeautifulSoup(markup=response.content, features='html.parser')
title = soup.find(id='firstHeading')
print(title.string)

List of postal codes of Canada: M


In [4]:
# Walk every <td> of the first table on the page and collect one record per assigned
# postal code (code, borough, comma-separated neighbourhoods), ready for a dataframe.
table_contents = []
table = soup.find('table')
for td in table.find_all('td'):
    # Skip cells that lack the expected <p>/<span> children instead of crashing on them
    if td.span is None or td.p is None:
        continue

    span_text = td.span.text
    if span_text == 'Not assigned':
        continue

    # The span text reads "Borough(Neighbourhood / Neighbourhood ...)"
    parts = span_text.split('(')
    # Tolerate a span with no "(...)" part: the neighbourhood is then left empty
    neighborhood = parts[1] if len(parts) > 1 else ''
    neighborhood = neighborhood.strip(')').replace(' /', ',').replace(')', ' ').strip(' ')

    table_contents.append({
        # Only the first three characters of the <p> text are kept as the code
        'PostalCode': td.p.text[:3],
        'Borough': parts[0],
        'Neighborhood': neighborhood,
    })

In [5]:
# Build the dataframe from the scraped records
df = pd.DataFrame(table_contents)

# Some borough cells come out of the scrape with extra text fused onto the name;
# map those exact strings to short, readable labels.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

In [6]:
# Sanity check: (rows, columns) of the scraped table
df.shape

(103, 3)

In [7]:
# Report the number of distinct borough labels and the number of rows in the table
borough_count = len(df['Borough'].unique())
row_count = df.shape[0]
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count)
print(summary)

The dataframe has 15 boroughs and 103 neighborhoods.


## Section 2

In [8]:
import geocoder

Using the google geocode option was returning an error with <REQUEST DENIED>. I therefore switched to arcgis. 

In [9]:
# Geocode every postal code with ArcGIS. Records are accumulated in a plain list and
# turned into a dataframe once at the end, so `LatLng_df` is only ever a DataFrame.
latlng_records = []
failed_codes = []
for code in df['PostalCode']:
    latlng = geocoder.arcgis(f'{code}, Toronto, Ontario').latlng
    if not latlng:
        # A failed lookup yields no coordinates; note the code rather than crash on indexing
        failed_codes.append(code)
        continue
    latlng_records.append({
        'PostalCode': code,
        'Latitude': round(latlng[0], 5),
        'Longitude': round(latlng[1], 5),
    })

if failed_codes:
    # Make missing postal codes visible instead of dropping them silently
    print(f'No coordinates found for: {failed_codes}')

# Explicit columns keep the frame well-formed even if every lookup failed
LatLng_df = pd.DataFrame(latlng_records, columns=['PostalCode', 'Latitude', 'Longitude'])

In [10]:
# Attach the coordinates to each postal-code row by joining on the shared PostalCode column
toronto_df = df.merge(LatLng_df, on="PostalCode")
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.75245,-79.32991
1,M4A,North York,Victoria Village,43.73057,-79.31306
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65512,-79.36264
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72327,-79.45042
4,M7A,Queen's Park,Ontario Provincial Government,43.66253,-79.39188


## Section 3

### Import libraries and load credentials

In [11]:
import folium
from sklearn.cluster import KMeans
from dotenv import dotenv_values

# Read the key/value pairs from the local '.env' file so credentials stay out of the notebook
config = dotenv_values('.env')

# Client id and secret; both are sent with the Foursquare API requests further down.
# A missing key raises KeyError here.
four_id = config['CLIENT_ID']
four_sc = config['CLIENT_SECRET']

In [12]:
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping-like (for example a dict or a pandas Series)
        Must hold the category list under the key 'categories' or, failing
        that, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [13]:
# Overview map of Toronto with one small blue marker per postal-code area
map_Toronto = folium.Map(location=[43.7432, -79.3832], zoom_start=11)

# Shared appearance of every area marker
area_marker_style = dict(
    radius=3,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

for _idx, area in toronto_df.iterrows():
    folium.CircleMarker(
        [area.Latitude, area.Longitude],
        popup=folium.Popup(f'{area.Neighborhood} - {area.Borough}'),
        **area_marker_style,
    ).add_to(map_Toronto)
map_Toronto


### What are the boroughs of Toronto and how many neighbourhoods are in each one?

In [14]:
# Number of postal-code rows per borough label, largest first
toronto_df['Borough'].value_counts()

North York                24
Downtown Toronto          17
Scarborough               17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East Toronto               4
East York                  4
East York/East Toronto     1
East Toronto Business      1
Etobicoke Northwest        1
Mississauga                1
Downtown Toronto Stn A     1
Queen's Park               1
Name: Borough, dtype: int64

### North York contains the most entries (24), so let's focus on that

In [15]:
# Keep only the rows whose borough is exactly 'North York'
is_north_york = toronto_df['Borough'] == 'North York'
york_df = toronto_df.loc[is_north_york]

### Calculate the centre of the borough.

In [16]:
# Bounding box of the North York postal-code coordinates
yorkLatMin, yorkLatMax = york_df['Latitude'].min(), york_df['Latitude'].max()
yorkLngMin, yorkLngMax = york_df['Longitude'].min(), york_df['Longitude'].max()

# The midpoint of the bounding box is used as the centre of the borough
yorkLat = (yorkLatMax + yorkLatMin) / 2
yorkLng = (yorkLngMax + yorkLngMin) / 2


### Let's bring back the top 100 results for locations within a 5 km radius of the centre of the borough.

In [17]:
# Ask the Foursquare "explore" endpoint for venues around the centre of North York.
# The query is passed through `params`, so requests builds and encodes the URL itself.
url = 'https://api.foursquare.com/v2/venues/explore'
params = {
    'client_id': four_id,
    'client_secret': four_sc,
    'v': 20120609,
    # Centre the search on the North York borough
    'll': f'{yorkLat},{yorkLng}',
    # Search radius in metres (5 km)
    'radius': 5000,
    # Return at most 100 results
    'limit': 100,
}

explore_response = requests.get(url, params=params, timeout=30)
# Surface HTTP errors (bad credentials, quota, ...) here rather than as a KeyError later
explore_response.raise_for_status()
results = explore_response.json()


In [18]:
# explore JSON to return details for individual places
# results['response']['groups'][0]['items'][0]

In [19]:
# Flatten the nested venue records of the first result group into a flat table
venue_items = results['response']['groups'][0]['items']
york_paces_df = pd.DataFrame(pd.json_normalize(venue_items))

In [20]:
# explore the dataframe 
# list(york_paces_df.columns)

In [21]:
# Extract the columns of interest from the initial dataframe.
# `.copy()` makes the result an independent dataframe, so the column assignment done in a
# later cell does not write into a slice of the original (SettingWithCopyWarning).
venue_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
york_paces_df = york_paces_df[venue_columns].copy()

In [22]:
def get_category_type(row):
    """Return the name of the first category listed for a venue.

    NOTE: this repeats the definition given in an earlier cell of this notebook;
    it is kept here so the cell still runs on its own.

    Parameters
    ----------
    row : mapping-like (for example a dict or a pandas Series)
        Must hold the category list under the key 'categories' or, failing
        that, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors must surface.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [23]:
# Collapse each venue's category list to the name of its first category
first_category = york_paces_df.apply(get_category_type, axis=1)
york_paces_df['venue.categories'] = first_category

# Give the columns short, readable names
column_names = {
    'venue.name': 'Name',
    'venue.categories': 'Category',
    'venue.location.lat': 'Latitude',
    'venue.location.lng': 'Longitude',
}
york_paces_df = york_paces_df.rename(columns=column_names)

In [24]:
# york_paces_df['Category'].value_counts()

In [25]:
# Preview the first venues with their renamed columns
york_paces_df.head()

Unnamed: 0,Name,Category,Latitude,Longitude
0,Earl Bales Park,Park,43.753043,-79.436228
1,Best for Bride,Bridal Shop,43.755789,-79.437834
2,Kinka Izakaya,Japanese Restaurant,43.760161,-79.409827
3,Escape Games Canada,Escape Room,43.753565,-79.464434
4,TD Canada Trust,Bank,43.756232,-79.439025


In [26]:
# Map centred on North York: large blue markers for its postal-code areas,
# smaller red markers for the venues listed in york_paces_df
map_NorthYork = folium.Map(location=[yorkLat, yorkLng], zoom_start=13)

area_style = dict(
    radius=10, color='blue', fill=True, fill_color='#3186cc',
    fill_opacity=0.7, parse_html=False,
)
venue_style = dict(
    radius=5, color='red', fill=True, fill_color='red',
    fill_opacity=0.7, parse_html=False,
)

for _idx, area in york_df.iterrows():
    folium.CircleMarker(
        [area.Latitude, area.Longitude],
        popup=folium.Popup(f'{area.Neighborhood} - {area.Borough}'),
        **area_style,
    ).add_to(map_NorthYork)

for _idx, venue in york_paces_df.iterrows():
    folium.CircleMarker(
        [venue.Latitude, venue.Longitude],
        popup=folium.Popup(f'{venue.Name} - {venue.Category}'),
        **venue_style,
    ).add_to(map_NorthYork)

In [27]:
# Render the North York map built in the previous cell
map_NorthYork

### Let's try breaking down and segmenting the boroughs of Toronto.
### First we will isolate only those with 'Toronto' in their name:

In [28]:
# Boolean mask selecting the borough labels that contain 'Toronto'
has_toronto = toronto_df['Borough'].str.contains('Toronto')
toronto_only_df = toronto_df.loc[has_toronto]

In [29]:
# (rows, columns) left after keeping only the 'Toronto' boroughs
toronto_only_df.shape

(39, 5)

In [30]:
# Mean coordinate of the selected areas, used as the map centre
toroLat = toronto_only_df['Latitude'].mean()
toroLng = toronto_only_df['Longitude'].mean()

In [31]:
# Map of the 'Toronto' boroughs only, one blue marker per postal-code area
map_TorontoOnly = folium.Map(location=[toroLat, toroLng], zoom_start=13)

area_style = dict(
    radius=10,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

for _idx, area in toronto_only_df.iterrows():
    folium.CircleMarker(
        [area.Latitude, area.Longitude],
        popup=folium.Popup(f'{area.Neighborhood} - {area.Borough}'),
        **area_style,
    ).add_to(map_TorontoOnly)
map_TorontoOnly

In [32]:
### Again we are going to investigate the surrounding areas, pulling in the top 100 locations within a 750 m radius (a circle 1.5 km across) of each postal-code area.

In [33]:
# Query Foursquare around every selected postal-code area and gather the venues found.
# Per-area frames are collected in a list and concatenated once at the end, which avoids
# re-copying the growing dataframe on every iteration.
url = 'https://api.foursquare.com/v2/venues/explore'
venue_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
column_names = {
    'venue.name': 'Name',
    'venue.categories': 'Category',
    'venue.location.lat': 'Latitude',
    'venue.location.lng': 'Longitude',
}

venue_frames = []
for _idx, area in toronto_only_df.iterrows():
    params = {
        'client_id': four_id,
        'client_secret': four_sc,
        'v': 20120609,
        # Centre the search on this postal-code area
        'll': f'{area.Latitude},{area.Longitude}',
        # Search radius in metres
        'radius': 750,
        # Return at most 100 results
        'limit': 100,
    }
    explore_response = requests.get(url, params=params, timeout=30)
    # Surface HTTP errors here rather than as a KeyError on the JSON below
    explore_response.raise_for_status()
    results = explore_response.json()

    items = results['response']['groups'][0]['items']
    if not items:
        # No venues in range: the column selection below would fail on an empty frame
        continue

    # `.copy()` so the category assignment does not write into a slice
    holder = pd.json_normalize(items)[venue_columns].copy()
    holder['venue.categories'] = holder.apply(get_category_type, axis=1)
    venue_frames.append(holder.rename(columns=column_names))

if venue_frames:
    toronto_places_df = pd.concat(venue_frames)
else:
    # Nothing was returned at all: keep a well-formed, empty result
    toronto_places_df = pd.DataFrame(columns=list(column_names.values()))

### We don't want to count places more than once, something that will no doubt happen in more central Toronto, so we will drop the duplicates.

In [34]:
# (rows, columns) before removing duplicates
toronto_places_df.shape

(2522, 4)

In [35]:
# Drop rows that are identical in every column, i.e. the same venue returned by several searches
toronto_places_df = toronto_places_df.drop_duplicates()

In [36]:
# (rows, columns) after removing duplicates
toronto_places_df.shape

(1696, 4)

### Now let's put these on our map.

In [37]:
# Rebuild the Toronto-only map: venues first (small red), then the postal-code areas
# (large blue) so the area markers are added last
map_TorontoOnly = folium.Map(location=[toroLat, toroLng], zoom_start=13)

venue_style = dict(
    radius=5, color='red', fill=True, fill_color='red',
    fill_opacity=0.7, parse_html=False,
)
area_style = dict(
    radius=10, color='blue', fill=True, fill_color='#3186cc',
    fill_opacity=0.7, parse_html=False,
)

for _idx, venue in toronto_places_df.iterrows():
    folium.CircleMarker(
        [venue.Latitude, venue.Longitude],
        popup=folium.Popup(f'{venue.Name} - {venue.Category}'),
        **venue_style,
    ).add_to(map_TorontoOnly)

for _idx, area in toronto_only_df.iterrows():
    folium.CircleMarker(
        [area.Latitude, area.Longitude],
        popup=folium.Popup(f'{area.Neighborhood} - {area.Borough}'),
        **area_style,
    ).add_to(map_TorontoOnly)

map_TorontoOnly

### Let's try clustering these locations...

The KMeans class has many parameters that can be used, but we will use these three:

<ul>
    <li> <strong>init</strong>: Initialization method of the centroids. </li>
    <ul>
        <li> Value will be: "k-means++". k-means++ selects initial cluster centers for <em>k</em>-means clustering in a smart way to speed up convergence.</li>
    </ul>
    <li> <strong>n_clusters</strong>: The number of clusters to form as well as the number of centroids to generate. </li>
    <ul> <li> Value will be: 4 (since we have 4 centers)</li> </ul>
    <li> <strong>n_init</strong>: Number of times the <em>k</em>-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia. </li>
    <ul> <li> Value will be: 12 </li> </ul>
</ul>

Initialize KMeans with these parameters, where the output parameter is called **k_means**.

In [38]:
# random_state fixes the random centroid seeding, so the clusters (and hence the marker
# colours on the map further down) are the same on every run of the notebook.
k_means = KMeans(init="k-means++", n_clusters=4, n_init=12, random_state=42)

Using the Latitude and Longitude values of the venues:

In [39]:
# Feature matrix: one row per venue, columns are latitude and longitude
X = toronto_places_df[['Latitude', 'Longitude']].to_numpy()

In [40]:
# Fit the clustering model on the venue coordinates
k_means.fit(X)

KMeans(n_clusters=4, n_init=12)

Now let's grab the labels for each point in the model using KMeans **.labels_** attribute and save it as **k_means_labels**.
We will also get the coordinates of the cluster centers using KMeans **.cluster_centers_** and save it as **k_means_cluster_centers**.



In [41]:
# Cluster index assigned to each row of X
k_means_labels = k_means.labels_
# Coordinates of the fitted cluster centres (latitude, longitude — same column order as X)
k_means_cluster_centers = k_means.cluster_centers_


### Apply the cluster labels back to the places df.

In [42]:
# The labels follow the row order of X, which was built from this dataframe
toronto_places_df['kluster'] = k_means_labels

In [43]:
### Replot the map of Toronto with the new district clusters. 

In [44]:
# Map of the venues coloured by cluster, with the postal-code areas (blue) and the
# cluster centres (black rings) added on top
map_TorontoKluster = folium.Map(location=[toroLat, toroLng], zoom_start=13)

# One colour per cluster index
kluster_colours = ['red', 'purple', 'cyan', 'yellow']

for _idx, venue in toronto_places_df.iterrows():
    folium.CircleMarker(
        [venue.Latitude, venue.Longitude],
        popup=folium.Popup(f'{venue.Name} - {venue.Category}'),
        radius=5,
        color=kluster_colours[venue.kluster],
        fill=True,
        fill_opacity=0.7,
        parse_html=False,
    ).add_to(map_TorontoKluster)

area_style = dict(
    radius=10, color='blue', fill=True, fill_color='#3186cc',
    fill_opacity=0.7, parse_html=False,
)
for _idx, area in toronto_only_df.iterrows():
    folium.CircleMarker(
        [area.Latitude, area.Longitude],
        popup=folium.Popup(f'{area.Neighborhood} - {area.Borough}'),
        **area_style,
    ).add_to(map_TorontoKluster)

for centre_lat, centre_lng in k_means_cluster_centers:
    folium.CircleMarker(
        [centre_lat, centre_lng],
        radius=10,
        color='black',
    ).add_to(map_TorontoKluster)

map_TorontoKluster