# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto
**by Anton Dziavitsyn 2019**

## TASK 1 - download table from wikipedia, extract data as described, and create dataframe

In [1]:
#imports
import pandas as pd
from bs4 import BeautifulSoup
import requests

#download page
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

# extract table data
table_can_zipinfo = soup.find('table')
colvals = table_can_zipinfo.find_all('td')

elem_cnt = len(colvals)

data = [{
        'PostalCode': colvals[i].text.strip(),
        'Borough': colvals[i+1].text.strip(),
        'Neighborhood': colvals[i+2].text.strip()
    } for i in range(0, elem_cnt, 3)]

df_postcode = pd.DataFrame(data=data, columns = ['PostalCode', 'Borough', 'Neighborhood'])
# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df_postcode.drop(df_postcode[df_postcode['Borough'] == 'Not assigned'].index, inplace=True)
# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
# So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.
df_postcode.loc[df_postcode.Neighborhood == 'Not assigned', "Neighborhood"] = df_postcode.Borough
# More than one neighborhood can exist in one postal code area.
# For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.
# These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
df_postcode = df_postcode.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df_postcode.columns = ['PostalCode', 'Borough', 'Neighborhood']

print('shape of dataframe: {0}'.format(df_postcode.shape))
print('first 10 rows:')
df_postcode.head(10)

shape of dataframe: (103, 3)
first 10 rows:


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## TASK 2 - add geolocation data columns

In [2]:
# download csv with coordinates
df_coord = pd.read_csv('http://cocl.us/Geospatial_data')
df_coord.columns = ['PostalCode', 'Latitude', 'Longitude']
# join to our dataframe
df_postcode = pd.merge(df_postcode, df_coord, on=['PostalCode'], how='inner')[['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']].copy()

print('first 10 rows:')
df_postcode.head(10)

first 10 rows:


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## TASK 3 - K-Mean clustering & map visualisation

In [3]:
#imports
from geopy.geocoders import Nominatim
import folium

#get Toronto location
address = 'Toronto, Canada'
geolocator = Nominatim()
toronto_location = geolocator.geocode(address)
print('Toronto coordinates are {}, {}.'.format(toronto_location.latitude, toronto_location.longitude))

# create map of Toronto
map_toronto = folium.Map(location=[toronto_location.latitude, toronto_location.longitude], zoom_start=10)

# filter pestcodes dataframe by having 'Toronto' in Borough
df_toronto = df_postcode[df_postcode['Borough'].str.contains('Toronto')]
print('shape of filtered dataframe: {0}'.format(df_toronto.shape))

# create map of Toronto
map_toronto = folium.Map(location=[toronto_location.latitude, toronto_location.longitude], zoom_start=10)

# add df_toronto markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  

# show df_toronto on map
map_toronto

  import sys


Toronto coordinates are 43.653963, -79.387207.
shape of filtered dataframe: (38, 5)


In [4]:
CLIENT_ID = 'MI12F15LBNREN0KL03I1ZHQG1JTWJ533FKV3JQQ15VNH2HEU' # you may use your Foursquare ID here
CLIENT_SECRET = 'P4C0X4VLLQG55BJUB3EKAFNQAPFB5WDXMVGVD3QUUTPRVBOZ' # you may use your Foursquare Secret here
VERSION = '20180605' # Foursquare API version

# function to get nearby venues by coordinates
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            10)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# get Toronto's venues
toronto_venues = getNearbyVenues(names=df_toronto['Neighborhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude'])

print('shape of toronto_venues: {0}'.format(toronto_venues.shape))
print('first 10 rows:')
toronto_venues.head()

shape of toronto_venues: (344, 7)
first 10 rows:


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
3,The Beaches,43.676357,-79.293031,Beaches Fitness,43.680319,-79.290991,Gym / Fitness Center
4,The Beaches,43.676357,-79.293031,Dip 'n Sip,43.678897,-79.297745,Coffee Shop


In [5]:
print('Venue count by Neighborhood (top 5 rows):')
toronto_venues.groupby('Neighborhood').count()['Venue'].head()

Venue count by Neighborhood (top 5 rows):


Neighborhood
Adelaide, King, Richmond                                                                                      10
Berczy Park                                                                                                   10
Brockton, Exhibition Place, Parkdale Village                                                                  10
Business Reply Mail Processing Centre 969 Eastern                                                             10
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara    10
Name: Venue, dtype: int64

In [6]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# generate mean quantity od category by Neighborhood (frequency occurance)
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
print('venue frequency in Neighborhood shape: {0}'.format(toronto_grouped.shape))
print('venue frequency in Neighborhood (top 5 rows):')
toronto_grouped.head()

venue frequency in Neighborhood shape: (38, 119)
venue frequency in Neighborhood (top 5 rows):


Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,...,Steakhouse,Supermarket,Sushi Restaurant,Swim School,Taco Place,Tea Room,Tennis Court,Thai Restaurant,Trail,Vegetarian / Vegan Restaurant
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.1,0.1,0.1,0.2,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Let's find top 10 venues for each neighborhood.

In [7]:
import numpy as np

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Steakhouse,Seafood Restaurant,Greek Restaurant,Coffee Shop,Plaza,Concert Hall,Hotel,Speakeasy,Noodle House,Eastern European Restaurant
1,Berczy Park,Steakhouse,Cocktail Bar,Italian Restaurant,Concert Hall,Liquor Store,Farmers Market,Beer Bar,Park,Museum,Thai Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Furniture / Home Store,Gym,Caribbean Restaurant,Pet Store,Café,Breakfast Spot,Italian Restaurant,Bar,Fast Food Restaurant
3,Business Reply Mail Processing Centre 969 Eastern,Farmers Market,Fast Food Restaurant,Skate Park,Brewery,Comic Shop,Auto Workshop,Burrito Place,Restaurant,Pizza Place,Garden Center
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Terminal,Boutique,Airport,Airport Food Court,Airport Gate,Harbor / Marina,Plane,Cosmetics Shop,Dessert Shop


### Run k-means to cluster the neighborhood into 5 clusters.

In [8]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
print('top 10 clustering labels: {0}'.format(kmeans.labels_[0:10]))

top 10 clustering labels: [1 2 1 2 2 3 1 3 3 2]


In [9]:
# Merge Toronto initial data with clustering results
toronto_clustered = df_toronto.sort_values(by='Neighborhood')

# add clustering labels
toronto_clustered['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_clustered = toronto_clustered.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

print('Clustered data shape: {0}'.format(toronto_grouped.shape))
print('Clustered (top 5 rows):')
toronto_clustered.head()

Clustered data shape: (38, 119)
Clustered (top 5 rows):


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
58,M5H,Downtown Toronto,"Adelaide, King, Richmond",43.650571,-79.384568,1,Steakhouse,Seafood Restaurant,Greek Restaurant,Coffee Shop,Plaza,Concert Hall,Hotel,Speakeasy,Noodle House,Eastern European Restaurant
56,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,2,Steakhouse,Cocktail Bar,Italian Restaurant,Concert Hall,Liquor Store,Farmers Market,Beer Bar,Park,Museum,Thai Restaurant
78,M6K,West Toronto,"Brockton, Exhibition Place, Parkdale Village",43.636847,-79.428191,1,Coffee Shop,Furniture / Home Store,Gym,Caribbean Restaurant,Pet Store,Café,Breakfast Spot,Italian Restaurant,Bar,Fast Food Restaurant
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,2,Farmers Market,Fast Food Restaurant,Skate Park,Brewery,Comic Shop,Auto Workshop,Burrito Place,Restaurant,Pizza Place,Garden Center
68,M5V,Downtown Toronto,"CN Tower, Bathurst Quay, Island airport, Harbo...",43.628947,-79.39442,2,Airport Lounge,Airport Terminal,Boutique,Airport,Airport Food Court,Airport Gate,Harbor / Marina,Plane,Cosmetics Shop,Dessert Shop


In [10]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# Generate clustered map
map_clusters = folium.Map(location=[toronto_location.latitude, toronto_location.longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_clustered['Latitude'], toronto_clustered['Longitude'], toronto_clustered['Neighborhood'], toronto_clustered['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Thank You! Best regards, Anton Dziavitsyn 2019