In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
# Lift pandas' display limits: None means no cap on shown columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Question 1. Transform the data in the table on the Wikipedia page into the above pandas dataframe

In [34]:
# Fetch the Wikipedia page that lists the "M" postal codes.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  # never hang the kernel on a stalled connection
)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
page = response.text

soup = BeautifulSoup(page, 'html.parser')  # parse the raw HTML
table = soup.find_all('table')[0]  # take the first table on the page

# header=0 makes the first table row the header explicitly, so the result no
# longer depends on whether the installed pandas infers the header by itself
# (which decided whether dropping row 0 / renaming columns 0,1,2 did anything).
df = pd.read_html(str(table), header=0)[0]
# Fixed column names used by every later cell; raises if the table does not
# have exactly three columns.
df.columns = ['Postcode', 'Borough', 'Neighbourhood']

# Mark unassigned boroughs as missing. The result is assigned back because an
# in-place replace on the `df['Borough']` selection is not guaranteed to
# propagate to `df`.
df['Borough'] = df['Borough'].replace({'Not assigned': None})

# Drop rows with missing values and renumber the remaining rows from 0.
df_clean = df.dropna().reset_index(drop=True)

# Sorted array of the distinct postcodes. The grouping below does not need it;
# it is retained in case other cells reference it.
postcode = np.sort(df_clean['Postcode'].unique())

# Collapse df_clean to one row per postcode (sorted by postcode):
#  * a neighbourhood recorded as 'Not assigned' falls back to the row's borough
#    name -- applied per row, so it works wherever the unassigned entry appears
#    among a postcode's rows, not only when it is the first one;
#  * the distinct boroughs / neighbourhoods of a postcode are joined with ', '
#    in order of first appearance.
df_group = (
    df_clean
    .assign(
        Neighbourhood=lambda frame: frame['Neighbourhood'].where(
            frame['Neighbourhood'] != 'Not assigned', frame['Borough']
        )
    )
    .groupby('Postcode', sort=True)[['Borough', 'Neighbourhood']]
    .agg(lambda names: ', '.join(names.unique()))
    .reset_index()
)
df_group.shape

(103, 3)

In [117]:
# Preview the first five grouped rows.
df_group.head(n=5)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Question 2. Get the latitude and the longitude coordinates of each neighborhood

In [10]:
import geocoder # import geocoder
print('geocoder imported!')

geocoder imported!


In [51]:
# Start the coordinates table from the three descriptive columns of df_group.
# The Latitude/Longitude columns are not created here; the assignments that
# would add them are left disabled:
#df_latlng['Latitude'] = None
#df_latlng['Longitude'] = None
df_latlng = df_group.loc[:, ['Postcode', 'Borough', 'Neighbourhood']]

df_latlng.head(n=5)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [116]:
# Sample postal code for a single lookup (alternative: df_latlng['Postcode'][0]).
# lat_lng_coords starts as None, meaning no coordinates have been retrieved yet.
postal_code, lat_lng_coords = 'M5G', None

# The geocoder-based retry loop below is kept for reference but is disabled.
# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

In [118]:
# Read the postal-code coordinates CSV and rename its key column so it matches
# the 'Postcode' column used by the other frames.
latlng_file = 'http://cocl.us/Geospatial_data'
latlng = pd.read_csv(latlng_file).rename(columns={'Postal Code': 'Postcode'})
latlng.head(n=5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [120]:
# Attach each postcode's coordinates to its borough/neighbourhood row.
# Inner join on 'Postcode' (the merge default): a postcode missing from either
# frame does not appear in the result.
df_latlng = df_group.merge(latlng, how='inner', on='Postcode')
print(df_latlng.shape)
df_latlng

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Question 3. Explore and cluster the neighborhoods in Toronto

In [121]:
# Keep only the postcodes whose borough name contains 'Toronto'.
is_toronto_borough = df_latlng['Borough'].str.contains('Toronto')
df_Toronto = df_latlng.loc[is_toronto_borough]
print(df_Toronto.shape)
df_Toronto.head(n=5)

(38, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [122]:
# Find the latitude and longitude of Toronto (used later to centre the map).
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a clear
    # message instead of an AttributeError on `.latitude` below.
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [123]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two feature columns by name
# (instead of dropping the others with a positional axis argument, which newer
# pandas rejects) also keeps this cell re-runnable after 'Cluster Labels' has
# been added to df_Toronto.
df_Toronto_clustering = df_Toronto[['Latitude', 'Longitude']]

# run k-means clustering; n_init is pinned so the number of initialisations
# does not depend on the installed scikit-learn's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_Toronto_clustering)

# check cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([3, 3, 3, 3, 1, 1, 1, 1, 1, 1], dtype=int32)

In [124]:
# add clustering labels as the first column
# Rebind df_Toronto to a frame without any earlier 'Cluster Labels' column
# first: insert() raises ValueError if the column already exists, which made
# this cell fail when run a second time.
df_Toronto = df_Toronto.drop(columns='Cluster Labels', errors='ignore')
df_Toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_Toronto.head()

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
37,3,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,3,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,3,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,3,M4M,East Toronto,Studio District,43.659526,-79.340923
44,1,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [125]:
# create map centred on the geocoded Toronto coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, evenly spaced
# along the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per postcode, coloured by its cluster
for lat, lon, poi, cluster in zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Neighbourhood'], df_Toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels are 0-based, so index the palette directly; `cluster-1`
    # only worked because index -1 wraps around to the last colour.
    marker_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters