# Segmenting and Clustering Neighborhoods in Toronto Assignment

Scraping data from a Wikipedia page and loading it into a dataframe to be used for analysis and clustering 

In [1]:
# Libraries 
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geocoder --yes
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... 
  - anaconda/win-64::ca-certificates-2020.1.1-0, anaconda/win-64::openssl-1.1.1d-he774522_4
  - anaconda/win-64::openssl-1.1.1d-he774522_4, defaults/win-64::ca-certificates-2020.1.1-0
  - anaconda/win-64::ca-certificates-2020.1.1-0, defaults/win-64::openssl-1.1.1d-he774522_4
  - defaults/win-64::ca-certificates-2020.1.1-0, defaults/win-64::openssl-1.1.1d-he774522_4done

# All requested packages already installed.



usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab
usage: conda-script.py [-h] [-V] command ...
conda-script.py: error: unrecognized arguments: # uncomment this line if you haven't completed the Foursquare API lab


Libraries imported.


# Question 1 - Scraping Data from Wikipedia to a Dataframe

Load the data from the Wikipedia html link using the Pandas library

In [2]:
# Parse every HTML table on the Wikipedia page; read_html returns a list of DataFrames
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
toronto_data = pd.read_html(io=url)

Initialize the dataframe after reading the data and set the columns 

In [3]:
# Take the first table parsed from the page as the working dataframe
df_temp = toronto_data[0]

# Keep only the three columns used in the rest of the notebook
wanted_columns = ["Postal Code", "Borough", "Neighborhood"]
df_toronto = df_temp[wanted_columns]

Drop the rows whose borough is "Not assigned" from the dataframe and reset the index

In [4]:
# Keep only the rows that have an assigned borough, then renumber the rows from 0
has_borough = df_toronto["Borough"] != "Not assigned"
df_toronto = df_toronto[has_borough].reset_index(drop=True)

Check whether any remaining rows have a neighbourhood of "Not assigned" and filter them out. There are none once the "Not assigned" boroughs have been dropped, so the dataframe shape does not change.

In [5]:
# Filter out rows that have a borough but whose neighbourhood is "Not assigned".
# The index is reset afterwards (as in the borough filter) so that row labels stay
# contiguous even if this filter ever removes rows.
has_neighborhood = df_toronto["Neighborhood"] != "Not assigned"
df_toronto = df_toronto[has_neighborhood].reset_index(drop=True)

# Neighbourhoods that share a postal code are left as the single string read from
# the table; they are not split into a list.

Print the dataframe shape, as well as the top 5 rows to see how our dataframe is looking

In [6]:
# Report the (rows, columns) size, then preview the first five rows
frame_shape = df_toronto.shape
print(frame_shape)
df_toronto.head(5)

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Question 2 - Add columns for the location's latitude and longitude

Initialize the coordinate lists and load the geospatial coordinates dataset

In [7]:
# Start with empty lists for the latitude and longitude values
lat_list, long_list = [], []

# Load the geospatial dataset and index it by postal code
df_geo = pd.read_csv("Geospatial_Coordinates.csv").set_index("Postal Code")

df_geo.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


Go through each postal code in our df_toronto dataset and find its corresponding latitude and longitude

In [8]:
# Look up the latitude and longitude of every postal code in df_toronto, in row order.
# The lists are rebuilt from scratch on every run, so re-executing this cell does not
# keep appending to (and lengthening) lists left over from a previous run.
# Columns are selected by label rather than by position, because positional
# `series[0]` access on a label-indexed Series is deprecated in recent pandas.
postal_codes = df_toronto['Postal Code'].tolist()
coordinates = df_geo.loc[postal_codes, ['Latitude', 'Longitude']]  # KeyError if a code is missing
lat_list = coordinates['Latitude'].tolist()
long_list = coordinates['Longitude'].tolist()
len(lat_list)

103

Add our location data to our dataframe

In [9]:
# Attach the looked-up coordinates as two new columns, then preview the result
for column_name, column_values in (("Latitude", lat_list), ("Longitude", long_list)):
    df_toronto[column_name] = column_values
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Question 3 - Explore and Cluster the Neighborhoods in Toronto

In [10]:
# Display the borough recorded for each postal-code row
df_toronto.loc[:, 'Borough']

0            North York
1            North York
2      Downtown Toronto
3            North York
4      Downtown Toronto
5             Etobicoke
6           Scarborough
7            North York
8             East York
9      Downtown Toronto
10           North York
11            Etobicoke
12          Scarborough
13           North York
14            East York
15     Downtown Toronto
16                 York
17            Etobicoke
18          Scarborough
19         East Toronto
20     Downtown Toronto
21                 York
22          Scarborough
23            East York
24     Downtown Toronto
25     Downtown Toronto
26          Scarborough
27           North York
28           North York
29            East York
30     Downtown Toronto
31         West Toronto
32          Scarborough
33           North York
34           North York
35            East York
36     Downtown Toronto
37         West Toronto
38          Scarborough
39           North York
40           North York
41         East 