# Segmenting and Clustering Neighborhoods in Toronto #

## Import Dependencies ##

In [122]:
#%% Import Dependencies
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

import wget #download files

print('Libraries imported.')

Libraries imported.


## Get Postal Codes Data from Wikipedia into DataFrame ##

In [123]:
#%% Get Postal Codes Data
df0 = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
df1=df0[0].copy()
#df1.head()

## Group Neighbourhoods by Borough ##

In [124]:
#%% Process Postal Codes Data
column_names=df1.iloc[0]
df1.columns=column_names
df1.drop(df1.index[0], inplace=True)

df1=df1[df1['Borough']!='Not assigned']
df1['Neighbourhood'][df1['Neighbourhood']=='Not assigned']=df1['Borough'][df1['Neighbourhood']=='Not assigned']

df2=df1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: "%s" % ', '.join(x)).to_frame()
df2.reset_index(inplace=True)

#df2.shape

# Question 1: Postal Codes by Borough #

In [125]:
#Check Postal Code Data
print(df2.shape)
df2.head(11)

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## Download Toronto Neighbourhood Geo Data##

In [126]:
#%% Download Geo Data
url='https://cocl.us/Geospatial_data'
filename=wget.download(url)
filename

  0% [                                                            ]    0 / 2891100% [............................................................] 2891 / 2891

'Geospatial_Coordinates (2).csv'

# Question 2: Postal Codes & Geo Data by Borough #
## Merge Download Toronto Postal Code and Geo Data##

In [127]:
#Merge Postal Code and Geo Data

df3=pd.read_csv(filename)
df2=pd.concat([df2,df3], axis=1)
df2.drop(['Postal Code'], axis=1, inplace=True)
print(df2.shape)
df2.head(11)

(103, 5)


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Explode Neighbourhood Column into rows ##

In [128]:
#%% Neighbourhood Info

df4 = pd.DataFrame(df2.Neighbourhood.str.split(', ').tolist()).stack().to_frame()
df4.reset_index(inplace=True)
df4[1]=df2['Borough'].iloc[df4['level_0']].values
df4[2]=df2['Latitude'].iloc[df4['level_0']].values
df4[3]=df2['Longitude'].iloc[df4['level_0']].values

df4.drop(['level_0','level_1'], axis=1, inplace=True)
df4.columns=['Neighbourhood','Borough','Latitude','Longitude']
df4.head()

Unnamed: 0,Neighbourhood,Borough,Latitude,Longitude
0,Rouge,Scarborough,43.806686,-79.194353
1,Malvern,Scarborough,43.806686,-79.194353
2,Highland Creek,Scarborough,43.784535,-79.160497
3,Rouge Hill,Scarborough,43.784535,-79.160497
4,Port Union,Scarborough,43.784535,-79.160497


## Limit Analysis to Downtown Toronto only ##

In [129]:
df4=df4[df4['Borough']=='Downtown Toronto']
df4.head()
df4.shape

(37, 4)

## Get Toronto Coordinates ##

In [130]:
#%% Toronto Coordinates
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Create Toronto Test Map ##

In [131]:
#%% Test Map

# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

## Save Foursquare Login Info ##

In [132]:
#Foursquare Login Info
CLIENT_ID = 'EH5NVS3H0UK5L4J43OQX4U5VSVIGPSGEOL5CAES4EEPABSBO' # your Foursquare ID
CLIENT_SECRET = '1L1UWLPCTNWWSEKDAVDLNPDI043MFYXR4WZA3PIO051KXO0V' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EH5NVS3H0UK5L4J43OQX4U5VSVIGPSGEOL5CAES4EEPABSBO
CLIENT_SECRET:1L1UWLPCTNWWSEKDAVDLNPDI043MFYXR4WZA3PIO051KXO0V


## Define Function to get venues for each neighbourhood from Foursquare, Run Function ##

In [133]:
#Function to get venues in each neighbourhood
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [134]:
LIMIT=100
radius=500
toronto_venues = getNearbyVenues(names=df4['Neighbourhood'],
                                   latitudes=df4['Latitude'],
                                   longitudes=df4['Longitude']
                                  )

Rosedale
Cabbagetown
St. James Town
Church and Wellesley
Harbourfront
Ryerson
Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide
King
Richmond
Harbourfront East
Toronto Islands
Union Station
Design Exchange
Toronto Dominion Centre
Commerce Court
Victoria Hotel
Harbord
University of Toronto
Chinatown
Grange Park
Kensington Market
CN Tower
Bathurst Quay
Island airport
Harbourfront West
King and Spadina
Railway Lands
South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place
Underground city
Christie
Queen's Park


In [135]:
print(toronto_venues.shape)
toronto_venues.head()

(2437, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,Cabbagetown,43.667967,-79.367675,Butter Chicken Factory,43.667072,-79.369184,Indian Restaurant


## Group Venues to get List of Neighbourhoods ##
### Ensure no errors if a neighbourhood has no venues ###

In [178]:
df5=toronto_venues.groupby('Neighbourhood').mean()
#df5

In [179]:
df5.drop(['Venue Latitude','Venue Longitude'], axis=1, inplace=True)
df5.reset_index(inplace=True)
#print(df5.shape)
#df5

## Perform One Hot Encoding for each Venue Category, Get Average per Neighbourhood ##

In [154]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add Neighbourhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move Neighbourhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

#toronto_onehot.head()

In [155]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
#print(toronto_grouped.shape)
#toronto_grouped

## Top 10 venues for each Neighbourhood ##

In [156]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [158]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

#neighborhoods_venues_sorted.head()

## K-Means Clustering on Average Dataset ##

In [143]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 0, 1, 0, 1, 8, 7, 5, 1, 3])

## Add Cluster Labels to list of neighbourhoods##

In [180]:
# add clustering labels
df5.insert(0, 'Cluster Labels', kmeans.labels_)
df5.columns=['Cluster Labels','Neighbourhood0','Latitude','Longitude']
df5=pd.concat([df5,neighborhoods_venues_sorted], axis=1)


In [181]:
df5.drop(df5.columns[1], axis=1, inplace=True)
df5.head(10)

Unnamed: 0,Cluster Labels,Latitude,Longitude,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,3,43.650571,-79.384568,Adelaide,Coffee Shop,Steakhouse,Thai Restaurant,Café,Bar,Sushi Restaurant,Burger Joint,Cosmetics Shop,Bakery,Asian Restaurant
1,0,43.628947,-79.39442,Bathurst Quay,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
2,1,43.644771,-79.373306,Berczy Park,Coffee Shop,Cheese Shop,Beer Bar,Seafood Restaurant,Farmers Market,Bakery,Steakhouse,Cocktail Bar,Café,French Restaurant
3,0,43.628947,-79.39442,CN Tower,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
4,1,43.667967,-79.367675,Cabbagetown,Coffee Shop,Pizza Place,Restaurant,Pub,Café,Italian Restaurant,Bakery,Chinese Restaurant,Butcher,Plaza
5,8,43.657952,-79.387383,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Sandwich Place,Juice Bar,Burger Joint,Ice Cream Shop,Japanese Restaurant,Bakery,Department Store
6,7,43.653206,-79.400049,Chinatown,Bar,Café,Chinese Restaurant,Coffee Shop,Vietnamese Restaurant,Dumpling Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Dessert Shop,Burger Joint
7,5,43.669542,-79.422564,Christie,Grocery Store,Café,Park,Athletics & Sports,Gas Station,Italian Restaurant,Diner,Nightclub,Candy Store,Restaurant
8,1,43.66586,-79.38316,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Fast Food Restaurant,Gym,Pizza Place,Pub,Men's Store
9,3,43.648199,-79.379817,Commerce Court,Coffee Shop,Café,Hotel,Restaurant,Gym,Italian Restaurant,Deli / Bodega,Seafood Restaurant,Gastropub,Bakery


# Question 3: Downtown Toronto Clustered Neighbourhoods #
## Create cluster map of neighbourhoods in Folium and Display Map##

In [102]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df5['Latitude'], df5['Longitude'], df5['Neighbourhood'], df5['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Examine 3 Clusters #

In [183]:
df5.loc[df5['Cluster Labels'] == 0, df5.columns[[3] + list(range(4, df5.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Bathurst Quay,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
3,CN Tower,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
17,Harbourfront West,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
18,Island airport,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
21,King and Spadina,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
23,Railway Lands,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden
27,South Niagara,Airport Lounge,Airport Terminal,Harbor / Marina,Coffee Shop,Boutique,Boat or Ferry,Rental Car Location,Bar,Plane,Sculpture Garden


In [185]:
df5.loc[df5['Cluster Labels'] == 1, df5.columns[[3] + list(range(4, df5.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Berczy Park,Coffee Shop,Cheese Shop,Beer Bar,Seafood Restaurant,Farmers Market,Bakery,Steakhouse,Cocktail Bar,Café,French Restaurant
4,Cabbagetown,Coffee Shop,Pizza Place,Restaurant,Pub,Café,Italian Restaurant,Bakery,Chinese Restaurant,Butcher,Plaza
8,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant,Fast Food Restaurant,Gym,Pizza Place,Pub,Men's Store
12,Garden District,Coffee Shop,Clothing Store,Japanese Restaurant,Café,Cosmetics Shop,Bakery,Electronics Store,Lingerie Store,Italian Restaurant,Middle Eastern Restaurant
26,Ryerson,Coffee Shop,Clothing Store,Japanese Restaurant,Café,Cosmetics Shop,Bakery,Electronics Store,Lingerie Store,Italian Restaurant,Middle Eastern Restaurant
28,St. James Town,Coffee Shop,Café,Restaurant,Bakery,Italian Restaurant,Breakfast Spot,Beer Bar,Thai Restaurant,Hotel,Clothing Store
29,Stn A PO Boxes 25 The Esplanade,Coffee Shop,Café,Hotel,Japanese Restaurant,Beer Bar,Restaurant,Seafood Restaurant,Italian Restaurant,Farmers Market,Art Gallery


In [184]:
df5.loc[df5['Cluster Labels'] == 2, df5.columns[[3] + list(range(4, df5.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
25,Rosedale,Park,Playground,Trail,Yoga Studio,Deli / Bodega,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Discount Store


In [186]:
df5.loc[df5['Cluster Labels'] == 3, df5.columns[[3] + list(range(4, df5.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Adelaide,Coffee Shop,Steakhouse,Thai Restaurant,Café,Bar,Sushi Restaurant,Burger Joint,Cosmetics Shop,Bakery,Asian Restaurant
9,Commerce Court,Coffee Shop,Café,Hotel,Restaurant,Gym,Italian Restaurant,Deli / Bodega,Seafood Restaurant,Gastropub,Bakery
10,Design Exchange,Coffee Shop,Café,Hotel,Restaurant,Italian Restaurant,Seafood Restaurant,Bar,Deli / Bodega,Gastropub,Steakhouse
11,First Canadian Place,Coffee Shop,Café,Restaurant,Steakhouse,American Restaurant,Burger Joint,Seafood Restaurant,Bar,Bakery,Deli / Bodega
20,King,Coffee Shop,Steakhouse,Thai Restaurant,Café,Bar,Sushi Restaurant,Burger Joint,Cosmetics Shop,Bakery,Asian Restaurant
24,Richmond,Coffee Shop,Steakhouse,Thai Restaurant,Café,Bar,Sushi Restaurant,Burger Joint,Cosmetics Shop,Bakery,Asian Restaurant
30,Toronto Dominion Centre,Coffee Shop,Café,Hotel,Restaurant,Italian Restaurant,Seafood Restaurant,Bar,Deli / Bodega,Gastropub,Steakhouse
32,Underground city,Coffee Shop,Café,Restaurant,Steakhouse,American Restaurant,Burger Joint,Seafood Restaurant,Bar,Bakery,Deli / Bodega
35,Victoria Hotel,Coffee Shop,Café,Hotel,Restaurant,Gym,Italian Restaurant,Deli / Bodega,Seafood Restaurant,Gastropub,Bakery
