# Segmenting and Clustering Neighborhoods in Toronto

### Import Necessary Libraries:

In [1]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
import requests
from sklearn.cluster import KMeans

### Get Neighbourhood Data:

In [2]:
# Parse every HTML table found at the Wikipedia URL below and keep the first
# one as the postal-code table.
html = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(io=html)
postal_codes = tables[0]

### Clean Data:

In [3]:
# Remove rows whose borough is the placeholder 'Not assigned'.
postal_codes.drop(postal_codes[postal_codes['Borough'] == 'Not assigned'].index, inplace=True)

# Where the neighbourhood is the placeholder, fall back to the row's borough.
# Assigning through .loc writes to the frame itself and aligns on the row
# index, instead of relying on an in-place Series.replace with a Series value.
unnamed = postal_codes['Neighbourhood'] == 'Not assigned'
postal_codes.loc[unnamed, 'Neighbourhood'] = postal_codes.loc[unnamed, 'Borough']

In [4]:
# Show the (rows, columns) shape of the cleaned postal-code table.
print(postal_codes.shape)

(103, 3)


### Get Geographical Coordinates:

In [5]:
# Load the coordinates table as CSV. The cells below read its 'Postal Code',
# 'Latitude' and 'Longitude' columns.
geo_cor = pd.read_csv("https://cocl.us/Geospatial_data")

### Add Latitudes and Longitudes to Postal Code Data:

In [6]:
# Index the coordinate table by postal code once, then look up each
# coordinate column for every row of the postal-code table.
coords_by_code = geo_cor.set_index('Postal Code')
for column in ('Latitude', 'Longitude'):
    postal_codes[column] = postal_codes['Postal Code'].map(coords_by_code[column])

### Visualize Toronto and Its Neighbourhoods:

In [7]:
# Get geographical coordinates of Toronto.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="to_explorer")
# An explicit timeout gives the geocoding service more time to answer.
location = geolocator.geocode(address, timeout=10)
if location is None:
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# Create map of Toronto using latitude and longitude values.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per postal-code row, labelled with its neighbourhood.
for lat, lng, label in zip(postal_codes['Latitude'],
                           postal_codes['Longitude'],
                           postal_codes['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# Last expression of the cell: renders the map in the notebook.
map_toronto

### Define Foursquare Credentials and Version:

In [8]:
import os

# Foursquare API credentials. A value set in the environment takes precedence,
# so secrets do not have to live in the notebook.
# NOTE(review): the literal fallbacks below are credentials committed to
# source; rotate them and drop the fallbacks once the environment variables
# FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are configured.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID',
                           'GJAFN4V5WYS5VV3HSOD5OFKRMWP1TD5MOULNEIQHYTIIWIBR')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET',
                               'KUI3QBF3KDAW33UMVKMVCTVMZ2KV5EH511D00FBUUCZELUEC')
VERSION = '20180605'  # sent as the `v` query parameter
limit = 100  # sent as the `limit` query parameter by getNearbyVenues
# NOTE: getNearbyVenues has its own `radius=500` default and does not read
# this module-level value.
radius = 500

### Create Function to Get Venues Near Neighbourhoods:

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint for each location.

    The three iterables are walked in lockstep; each name is printed before
    its request is made. The request uses the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``limit`` values.

    Args:
        names: Iterable of location names, copied into the 'Neighborhood'
            column of the result.
        latitudes: Iterable of latitudes, one per name.
        longitudes: Iterable of longitudes, one per name.
        radius: Value sent as the ``radius`` query parameter.

    Returns:
        A pandas DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        If no venues are returned (or no locations are given) the frame is
        empty but still has these columns.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }

        # A timeout keeps one stalled request from hanging the whole loop.
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params,
                                timeout=30)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # Keep only the relevant information for each nearby venue.
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            # A venue can come back without any category; record None then.
            category = categories[0]['name'] if categories else None
            rows.append((name,
                         lat,
                         lng,
                         venue['name'],
                         venue['location']['lat'],
                         venue['location']['lng'],
                         category))

    # Passing the column names to the constructor also works for zero rows,
    # where assigning `.columns` afterwards raises a length-mismatch error.
    return pd.DataFrame(rows, columns=columns)

### Analyze Toronto Neighbourhoods:

In [10]:
# Collect the venues around every postal-code row (default search radius).
toronto_venues = getNearbyVenues(
    postal_codes['Neighbourhood'],
    postal_codes['Latitude'],
    postal_codes['Longitude'],
)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [11]:
# One hot encoding: one indicator column per venue category.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category may itself be called "Neighborhood". Rename that indicator
# column so the neighbourhood names added below do not overwrite it.
toronto_onehot = toronto_onehot.rename(
    columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# Add the neighbourhood name as the first column.
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# Group rows by neighborhood, taking the mean of each indicator column, i.e.
# the frequency of occurrence of each category within the neighbourhood.
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

### Cluster Neighbourhoods:

#### Run *k*-means to cluster the neighbourhoods into 5 clusters:

In [12]:
# Number of clusters to form.
kclusters = 5

# Use only the category-frequency columns as features. `columns=` is used
# because recent pandas no longer accepts the axis as a positional argument.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# Fixed random_state so repeated runs give the same clustering.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

### Function to Get Most Common Venues in a Neighbourhood (Descending Order):

In [13]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of *row*.

    The first entry of the row is skipped; the remaining entries are ranked
    from largest to smallest value and the labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### Get The Top Ten Venues For Each Neighbourhood:

In [14]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Create columns according to number of top venues.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Explicit bounds check instead of catching every exception.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Create a new dataframe with one row per grouped neighbourhood.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# Fill the ranking columns row by row with the most frequent categories.
for ind in range(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(
        toronto_grouped.iloc[ind, :], num_top_venues)

### Merge Clusters With Top Ten Venues:

In [15]:
# Put the k-means label of each neighbourhood in the first column.
neighborhoods_venues_sorted.insert(loc=0, column='Cluster Labels', value=kmeans.labels_)

# Attach the cluster label and top venues to the postal-code rows, matching
# the 'Neighbourhood' column against the neighbourhood names of the venue table.
venues_by_neighbourhood = neighborhoods_venues_sorted.set_index('Neighborhood')
toronto_merged = postal_codes.join(venues_by_neighbourhood, on='Neighbourhood')

# Rows with any missing value (e.g. no matching venue data) are discarded,
# after which the labels can be stored as integers.
toronto_merged.dropna(inplace=True)
toronto_merged = toronto_merged.astype({'Cluster Labels': 'int32'})

### Visualize Clusters:

In [16]:
# Create map centred on the coordinates obtained for Toronto.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# Colour scheme: one evenly spaced colour of the rainbow colormap per
# cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per row, coloured by its cluster label.
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],
                                  toronto_merged['Longitude'],
                                  toronto_merged['Neighbourhood'],
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    # Index `cluster - 1` is kept as-is: label 0 wraps around to the last
    # colour, so every cluster still gets its own distinct colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the map in the notebook.
map_clusters