# Clustering and Segmenting Toronto Neighborhoods

In [22]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'lxml')

## Scrape wikipedia page for table Data

In [2]:
my_table = soup.find('table',{'class':'wikitable sortable'})

In [3]:
table_rows = my_table.findAll('tr')

In [4]:
header = ['PostalCode', 'Borough', 'Neighborhood']
header

['PostalCode', 'Borough', 'Neighborhood']

In [5]:
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text for tr in td]
    l.append(row)
neighborhoods = pd.DataFrame(l, columns=header)

In [6]:
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n


## Clean up data

In [7]:
# Remove newlines in Neighborhood column
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].str.replace('\n', '')
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,Not assigned
2,M2A\n,Not assigned\n,Not assigned
3,M3A\n,North York\n,Parkwoods
4,M4A\n,North York\n,Victoria Village


In [8]:
# Change all the unknowns to a common value
neighborhoods = neighborhoods.replace('Not assigned', None)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,
2,M2A\n,Not assigned\n,
3,M3A\n,North York\n,Parkwoods
4,M4A\n,North York\n,Victoria Village


In [9]:
# Assign all unknown Neighborhood's with the Borough Value
neighborhoods['Neighborhood'].fillna(neighborhoods['Borough'], inplace=True)
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods
4,M4A\n,North York\n,Victoria Village


In [10]:
# Drop all rows with unassigned Borough
neighborhoods.dropna(how='any', axis='index', inplace=True)
neighborhoods.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods
4,M4A\n,North York\n,Victoria Village
5,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront"


In [11]:
# Group by Postal Code
neighborhoods = neighborhoods.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(lambda x: ','.join(x.astype(str))).reset_index()
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn


## Output Shape & Rows

In [12]:
neighborhoods.shape

(180, 3)

In [13]:
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M1B\n,Scarborough\n,"Malvern, Rouge"
2,M1C\n,Scarborough\n,"Rouge Hill, Port Union, Highland Creek"
3,M1E\n,Scarborough\n,"Guildwood, Morningside, West Hill"
4,M1G\n,Scarborough\n,Woburn
...,...,...,...
175,M9V\n,Etobicoke\n,"South Steeles, Silverstone, Humbergate, Jamest..."
176,M9W\n,Etobicoke\n,"Northwest, West Humber - Clairville"
177,M9X\n,Not assigned\n,"The Kingsway, Montgomery Road, Old Mill North"
178,M9Y\n,Not assigned\n,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


# Get Postal Code Coordinates

### Coordinate Library is too unreliable, so code has been commented out

In [14]:
!pip install --upgrade geocoder
import geocoder

!pip install geopy
import geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json # library to handle JSON files

Requirement already up-to-date: geocoder in d:\apps\python38\lib\site-packages (1.38.1)


### Using a csv file with the coordinates for Toronto Postal Codes

In [15]:
coordinates = pd.read_csv('Geospatial_Coordinates.csv')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
# Merge coordinates with neighorhoods where the postalcodes match up
neighborhoods = pd.merge(coordinates, neighborhoods, how='left', left_on=['Postal Code'], right_on=['PostalCode'])

In [17]:
# Drop Duplicate Column
neighborhoods.drop(['PostalCode'], axis=1, inplace=True)

In [18]:
neighborhoods.head()

Unnamed: 0,Postal Code,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,,
1,M1C,43.784535,-79.160497,,
2,M1E,43.763573,-79.188711,,
3,M1G,43.770992,-79.216917,,
4,M1H,43.773136,-79.239476,,


## Lets map Toronto to see what the neighborhoods look like:

In [19]:
def generate_map(dataframe, longitude, latitude):
        map = folium.Map(location=[latitude, longitude], zoom_start=11)

        # add markers to map
        for lat, lng, borough, neighborhood in zip(dataframe['Latitude'], dataframe['Longitude'], dataframe['Borough'], dataframe['Neighborhood']):
            label = '{}, {}'.format(neighborhood, borough)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                popup=label,
                color='blue',
                fill=True,
                fill_color='#3186cc',
                fill_opacity=0.7,
                parse_html=False).add_to(map)  
            
        return map

In [20]:
latitude = 43.72
longitude = -79.347

# create map of New York using latitude and longitude values
map_neighborhoods = generate_map(neighborhoods, longitude, latitude)
map_neighborhoods

![Map of Toronto Neighborhoods](./images/toronto_neighborhoods.PNG)

## What if we just pick the boroughs with "Toronto" in it...

In [21]:
toronto_borough = neighborhoods[neighborhoods['Borough'].str.contains('Toronto')]
toronto_borough.reset_index(drop=True, inplace=True)
toronto_borough.head()

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [None]:
map_to = generate_map(toronto_borough, longitude, latitude)
map_to

![Map of Toronto Boroughs](./images/toronto_boroughs.PNG)

In [None]:
# @hidden_cell
# FOURSQUARE API CREDENTIALS
CLIENT_ID = '5N1I2ZBKG55NFLJQELOZI3XTLBUXXCF3FKTJQSZ2WXPZBWR5' # your Foursquare ID
CLIENT_SECRET = 'P5MPRQBUI4MRPYUYQHFHDT0CWEF3D3U1WTKBGKGTK0VXAIBU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

## Lets ask FourSquare for the venues around our first Borough

In [None]:
neighborhood_latitude = toronto_borough.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_borough.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_borough.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

In [None]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

In [None]:
results = requests.get(url).json()
results

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

## Nice. Now we can use a function to get venues for all of our neighborhoods

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
toronto_venues = getNearbyVenues(names=toronto_borough['Neighborhood'],
                                   latitudes=toronto_borough['Latitude'],
                                   longitudes=toronto_borough['Longitude']
                                  )

In [None]:
print(toronto_venues.shape)
toronto_venues.head()

In [None]:
toronto_venues.groupby('Neighborhood').count()

In [None]:
print('There are {} unique categories.'.format(len(toronto_venues['Venue Category'].unique())))

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print(toronto_onehot.shape)
toronto_onehot.head()

## Lets get the most common venues per neighborhood

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print(toronto_grouped.shape)
toronto_grouped

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
import numpy as np

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

### Now that we have some venues for each neighborhood, lets try and cluster them

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_borough

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

![Map of Toronto Boroughs Clustered by Venue Type](./images/k-means_clustered_boroughs.PNG)

## Now we can see what the clusters represent...

### Cluster 1: Edge of Toronto Restaurants & Drinks

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

### Cluster 1: Downtown/Midtown Restuarants & Drinks

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

### Cluster 2: Student Ammenities

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

### Cluster 3: Nature & Scenery

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

### Cluster 4: Rosedale

In [None]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

## Save to CSV

In [None]:
# Save to CSV File
toronto_merged.to_csv("toronto_merged.csv")