### Week 3 - Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import os

import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

import folium
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim

#### Scrape the list of Canadian postal codes starting with "M" from its Wikipedia page

In [2]:
# Wikipedia page that lists the Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout = 30)
# Stop here on an HTTP error instead of parsing an error page in later cells.
response.raise_for_status()

In [3]:
# Parse the downloaded HTML and keep the first <table> element found in it.
soup = BeautifulSoup(markup = response.text, features = 'html.parser')
table = soup.find(name = 'table')

In [4]:
# Turn every table cell that has a borough assigned into one record
# with the keys 'Postalcode', 'Borough' and 'Neighborhood'.
content_table = []

for row in table.find_all('td'):
    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    # The postal code is the first three characters of the bold text.
    postal_code = row.b.text[:3]
    # Text before the first '(' is the borough; the bracketed part that
    # follows holds the neighbourhood names.
    pieces = span_text.split('(')
    neighborhood = pieces[1].strip(')')
    neighborhood = neighborhood.replace(' /', ',').replace(')', ' ')
    content_table.append({
        'Postalcode': postal_code,
        'Borough': pieces[0],
        'Neighborhood': neighborhood.strip(' '),
    })

In [5]:
# Build a DataFrame from the scraped records (one row per dict in content_table).
df = pd.DataFrame(content_table)
# Display the first rows as the cell output.
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [6]:
# Some scraped borough strings have extra text fused onto the borough name;
# map each of those exact strings to a short, readable label.
df['Borough'] = df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                      'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                      'EtobicokeNorthwest':'Etobicoke Northwest',
                                      'East YorkEast Toronto':'East York/East Toronto',
                                      'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

In [7]:
# Report the (rows, columns) shape of the scraped table.
print('Shape of the dataframe {}'.format(df.shape))

Shape of the dataframe (103, 3)


#### Order the dataframe to match the "Geospatial_Coordinates.csv" file

In [8]:
# Sort by postal code (ascending) and renumber the rows from zero.
order_df = df.sort_values(by = 'Postalcode').reset_index(drop = True)
order_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Load the coordinates file and give it a fresh 0..n-1 index.
coordinates_df = pd.read_csv('Geospatial_Coordinates.csv').reset_index(drop = True)

In [10]:
# Attach the coordinates by matching postal codes instead of gluing the two
# frames together by row position: a positional concat silently pairs the
# wrong coordinates with a postal code whenever the two sources differ in
# row order or row count. A left merge keeps every row (and the order) of
# order_df; a postal code missing from the CSV gets NaN coordinates.
Toronto_df = order_df.merge(coordinates_df,
                            how = 'left',
                            left_on = 'Postalcode',
                            right_on = 'Postal Code')
# The CSV key column duplicates 'Postalcode' after the merge.
Toronto_df = Toronto_df.drop(['Postal Code'], axis = 1)
Toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### Narrow my clustering task by looking at the overall count for each Borough

In [11]:
# Count the non-null entries of every column per borough.
Toronto_df.groupby('Borough').count()

Unnamed: 0_level_0,Postalcode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,17,17,17,17
Downtown Toronto Stn A,1,1,1,1
East Toronto,4,4,4,4
East Toronto Business,1,1,1,1
East York,4,4,4,4
East York/East Toronto,1,1,1,1
Etobicoke,11,11,11,11
Etobivoke Northwest,1,1,1,1
Mississauga,1,1,1,1


#### Downtown Toronto was chosen for the clustering task

In [12]:
# Keep only the rows whose borough is exactly 'Downtown Toronto' and renumber them.
is_downtown = Toronto_df['Borough'] == 'Downtown Toronto'
downtown_df = Toronto_df.loc[is_downtown].reset_index(drop = True)
downtown_df.head(17)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"St. James Town, Cabbagetown",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
5,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
6,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
7,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752


In [13]:
# Look up the coordinates used to centre the maps below.
geolocator = Nominatim(user_agent = 'on_explorer')
location = geolocator.geocode('Downtown Toronto, ON')
# geocode() returns None when the query has no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Geocoding returned no result for "Downtown Toronto, ON"')
latitude = location.latitude
longitude = location.longitude
print('Coordinates for Downtown Toronto are the following: {} / {}'.format(latitude, longitude))

Coordinates for Downtown Toronto are the following: 43.6563221 / -79.3809161


In [21]:
# Base map centred on the geocoded Downtown Toronto coordinates.
map_downtown = folium.Map(location = [latitude, longitude], zoom_start = 13)

# Add one blue circle marker per neighbourhood; its popup shows the neighbourhood name.
marker_rows = zip(downtown_df['Latitude'], downtown_df['Longitude'], downtown_df['Neighborhood'])
for lat, lng, neighborhood_name in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius = 4,
        popup = folium.Popup(neighborhood_name, parse_html = True),
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False)
    marker.add_to(map_downtown)

map_downtown

#### Initiate Foursquare to obtain nearby venues

In [None]:
# Foursquare credentials are read from the environment so that the secret is
# never stored in the notebook. Export FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be written in this cell has been
# published with the notebook and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20200615'  # sent as the `v` parameter of every request
LIMIT = 100           # sent as the `limit` parameter of every request

if not (CLIENT_ID and CLIENT_SECRET):
    print('Foursquare credentials are not set: define FOURSQUARE_CLIENT_ID and '
          'FOURSQUARE_CLIENT_SECRET in the environment before requesting venues.')

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius = 500, timeout = 30):
    """Query the Foursquare "explore" endpoint once per location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are consumed in parallel.
    radius : number, default 500
        Sent as the `radius` parameter of every request.
    timeout : number, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    venue_rows = []

    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build (and URL-encode) the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors before touching the payload
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params = params,
                                timeout = timeout)
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            # a venue without any category is kept, with a missing category value
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the column names here keeps the schema stable for an empty result
    return pd.DataFrame(venue_rows, columns = columns)

In [None]:
# Fetch the venues around every Downtown Toronto neighbourhood (default radius).
downtown_venues = getNearbyVenues(downtown_df['Neighborhood'],
                                  downtown_df['Latitude'],
                                  downtown_df['Longitude'])

In [None]:
# Show how many venue rows/columns came back, then preview the first five rows.
print(downtown_venues.shape)
downtown_venues.head(5)

In [None]:
# Number of distinct values in the 'Venue Category' column.
category_count = len(downtown_venues['Venue Category'].unique())
print('There are {} uniques categories'.format(category_count))

#### One hot encode the venues obtained for each neighborhood

In [None]:
# One indicator column per venue category; the column names are the bare category names.
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix = "", prefix_sep = "")

# If a venue category is itself named 'Neighborhood', its indicator column
# would clash with the label column added below, so move it out of the way.
# (rename is a no-op when no such category exists.)
downtown_onehot = downtown_onehot.rename(columns = {'Neighborhood': 'Neighborhood (category)'})

# Put the neighbourhood label in front of the indicator columns.
downtown_onehot.insert(0, 'Neighborhood', downtown_venues['Neighborhood'])
downtown_onehot.head(5)

In [None]:
# Average the indicator columns per neighbourhood: each value becomes the
# share of that neighbourhood's venue rows that fall in the category.
downtown_grouped = downtown_onehot.groupby('Neighborhood').mean().reset_index()
downtown_grouped

In [None]:
# Print, for every neighbourhood, its most frequent venue categories.
num_top_venues = 5

for hood in downtown_grouped['Neighborhood']:
    print("----"+hood+"----")
    # transpose the neighbourhood's row into a two-column (venue, freq) table
    hood_freqs = downtown_grouped.loc[downtown_grouped['Neighborhood'] == hood].T.reset_index()
    hood_freqs.columns = ['venue','freq']
    # drop the first line (the neighbourhood name) and round the frequencies
    hood_freqs = hood_freqs.iloc[1:].astype({'freq': float}).round({'freq': 2})
    ranked = hood_freqs.sort_values('freq', ascending = False).reset_index(drop = True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped before ranking; the remaining
    entries are ordered from largest to smallest value and the leading
    index labels are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

#### Discover the most common venues in each neighborhood

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create new columns based on the number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # The first three ranks take their own suffix; every later rank uses "th".
    # An explicit test replaces the former bare `except:`, which would also
    # have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One row per neighbourhood; the venue columns are filled in below.
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_grouped['Neighborhood']

# Fill every row with that neighbourhood's top categories, best first.
for ind in np.arange(downtown_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

#### Find the best number of clusters for Downtown Toronto

In [None]:
# Fit k-means for each candidate k and record the inertia (within-cluster
# sum of squared distances) to draw an elbow plot.
sum_of_squared_distances = []
kclusters = range(1,10)  # candidate values of k, not a single cluster count

# Clustering uses the category columns only. `axis` is passed by keyword
# because pandas 2 no longer accepts it as a positional argument.
downtown_grouped_clustering = downtown_grouped.drop('Neighborhood', axis = 1)

for k in kclusters:
    kmeans = KMeans(n_clusters = k, random_state = 0).fit(downtown_grouped_clustering)
    sum_of_squared_distances.append(kmeans.inertia_)

plt.plot(kclusters, sum_of_squared_distances, 'bx-')
plt.xlabel('Cluster number')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method for discovering optimal cluster n')
plt.show()

#### Cluster each neighborhood based on their venues

In [None]:
# Fit the final model with 4 clusters.
kmeans = KMeans(n_clusters = 4, random_state = 0).fit(downtown_grouped_clustering)

# DataFrame.insert raises if the column already exists, so drop a label
# column left behind by an earlier run of this cell; this makes the cell
# safe to re-run.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop('Cluster Labels', axis = 1)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the cluster label and the top venues to each Downtown Toronto row,
# matching on the neighbourhood name. Rows of downtown_df without a match
# are kept and get NaN in the joined columns.
downtown_merged = downtown_df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on = 'Neighborhood')
downtown_merged.head()

#### Create a map showing each neighborhood's clustering label

In [None]:
downtown_map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters: one colour per cluster of the fitted model.
# The count is taken from the model because `kclusters` is bound to
# range(1, 10) by the elbow-method cell, and np.arange()/range() need an integer.
n_clusters = kmeans.n_clusters
colors_array = cm.rainbow(np.linspace(0, 1, n_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# rows without a cluster label (NaN) cannot be coloured, so leave them off the map
clustered = downtown_merged.dropna(subset = ['Cluster Labels'])

# add markers to the map
for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Neighborhood'], clustered['Cluster Labels']):
    cluster = int(cluster)  # labels run from 0 to n_clusters - 1
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster],
        fill = True,
        fill_color = rainbow[cluster],
        fill_opacity = 0.7).add_to(downtown_map_clusters)

downtown_map_clusters

#### Examine what each cluster contains

In [None]:
def cluster_examination(num_cluster, dataframe):
    """Return the rows of `dataframe` whose 'Cluster Labels' equals `num_cluster`.

    Only the column at position 1 and the columns from position 5 onward
    are kept in the result.
    """
    in_cluster = dataframe['Cluster Labels'] == num_cluster
    kept_positions = [1, *range(5, dataframe.shape[1])]
    return dataframe.loc[in_cluster, dataframe.columns[kept_positions]]

In [None]:
# Rows of the merged table that carry cluster label 0.
downtown_cluster1 = cluster_examination(num_cluster = 0, dataframe = downtown_merged)
downtown_cluster1.head()