In this notebook we apply a k-means procedure to Foursquare spatial data in order to cluster the neighborhoods (or postal codes) of Toronto. We also demonstrate screen scraping from Wikipedia using BeautifulSoup. 

some logistics:

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

# Toronto's Neighborhoods - screen scraping task
First we look for the different areas of Toronto, using the wikipedia url

In [4]:
# Scrape the Wikipedia table of Toronto ("M") postal codes into a DataFrame with
# the columns Postcode, Borough and Neighbourhood (one row per postal code).
wikipath = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(wikipath, timeout=30)
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(response.text, 'lxml')
my_table = soup.find('table', {'class': 'wikitable sortable'})  # get the table
if my_table is None:
    raise ValueError("Could not find the 'wikitable sortable' table at " + wikipath)

# Read the table row by row and cell by cell. Flattening the whole table to text
# and dropping empty lines would shift every following column if a cell were empty.
table_rows = [
    [cell.get_text(strip=True) for cell in table_row.find_all(['th', 'td'])]
    for table_row in my_table.find_all('tr')
]
table_rows = [cells for cells in table_rows if cells]  # skip rows without cells
column_names, records = table_rows[0], table_rows[1:]
# pandas raises here if a row does not have exactly one value per header column
df = pd.DataFrame(records, columns=column_names)

# ignore postal codes that are not assigned to any borough
df = df[df['Borough'] != 'Not assigned']

# if a cell has a borough but a "Not assigned" neighbourhood, the neighbourhood is the borough
unnamed = df['Neighbourhood'] == 'Not assigned'
df = df.assign(Neighbourhood=df['Neighbourhood'].mask(unnamed, df['Borough']))

# place neighbourhoods that share a postal code in the same row, comma separated.
# groupby sorts by Postcode and keeps the table order inside each group; the
# borough of the first row of each postal code is kept.
df = (
    df.groupby('Postcode', as_index=False)
      .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Geocoding / api 
Now we will find the coordinates of each postcode using a geocoder.
(Since the geocoder returned no matches, we fall back to a CSV file of coordinates.)

In [5]:
import geocoder # import geocoder
import string 

MAX_GEOCODE_ATTEMPTS = 10  # give up on a postal code after this many failed lookups

# Start from "unknown" coordinates; NaN (rather than 0) cannot be mistaken for a real location.
df['Latitude'] = np.nan
df['Longitude'] = np.nan

# Try to geocode every postal code whose borough name contains "Toronto".
for row_label, post_i, bor_i in zip(df.index, df['Postcode'], df['Borough']):
    if "Toronto" not in bor_i:
        continue
    lookfor_i = post_i + ", Toronto"
    print(lookfor_i)
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        lat_lng_coords = geocoder.google(lookfor_i).latlng
        # any empty result (None or an empty list) counts as a failed lookup
        if lat_lng_coords:
            # write the coordinates into this row only, not into the whole column
            df.loc[row_label, 'Latitude'] = lat_lng_coords[0]
            df.loc[row_label, 'Longitude'] = lat_lng_coords[1]
            break

#since the code above was not able to retrieve the coordinates, I am using a csv file

csv_coord_path = "http://cocl.us/Geospatial_data"
coord = pd.read_csv(csv_coord_path)
# Left join keeps every postal code from df; the coordinates come from the CSV,
# so only the three descriptive columns of df are carried over.
df2 = pd.merge(
    df[["Postcode", "Borough", "Neighbourhood"]],
    coord,
    left_on='Postcode',
    right_on='Postal Code',
    how='left',
)

df2.head()

M4E, Toronto
iterations:  10
M4K, Toronto
iterations:  10
M4L, Toronto
iterations:  10
M4M, Toronto
iterations:  10
M4N, Toronto
iterations:  10
M4P, Toronto
iterations:  10
M4R, Toronto
iterations:  10
M4S, Toronto
iterations:  10
M4T, Toronto
iterations:  10
M4V, Toronto
iterations:  10
M4W, Toronto
iterations:  10
M4X, Toronto
iterations:  10
M4Y, Toronto
iterations:  10
M5A, Toronto
iterations:  10
M5B, Toronto
iterations:  10
M5C, Toronto
iterations:  10
M5E, Toronto
iterations:  10
M5G, Toronto
iterations:  10
M5H, Toronto
iterations:  10
M5J, Toronto
iterations:  10
M5K, Toronto
iterations:  10
M5L, Toronto
iterations:  10
M5N, Toronto
iterations:  10
M5P, Toronto
iterations:  10
M5R, Toronto
iterations:  10
M5S, Toronto
iterations:  10
M5T, Toronto
iterations:  10
M5V, Toronto
iterations:  10
M5W, Toronto
iterations:  10
M5X, Toronto
iterations:  10
M6G, Toronto
iterations:  10
M6H, Toronto
iterations:  10
M6J, Toronto
iterations:  10
M6K, Toronto
iterations:  10
M6P, Toronto
i

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


# spatial data - venues from Foursquare 
I use the coordinates of the neighborhoods and the Foursquare API to find up to 50 venues within 1 km of each neighborhood. These will be used later for the unsupervised clustering process.

retrieving the data using the coordinates:

In [6]:
import requests
import pandas as pd
import os

# Foursquare credentials. Set the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET
# environment variables to use your own keys without writing them into the notebook.
# The fallback literals are the placeholder values that were already in this cell
# (changed from the real ones, so they will not authenticate).
CLIENT_ID = os.environ.get(
    'FOURSQUARE_CLIENT_ID',
    '1SXOLKYVC2R3TB32TT5Q2MNIHVXYVXYS0USOANDBQ22QQH40')  # Foursquare ID
CLIENT_SECRET = os.environ.get(
    'FOURSQUARE_CLIENT_SECRET',
    'FAQEUCOAJUXSU2FIMP4DKIYQXMF34XC0ROX0ZVGBMNVIXTGT')  # Foursquare Secret
VERSION = '20190618' # Foursquare API version

LIMIT = 50 # we use up to 50 venues for each neighborhood
RADIUS = 1000 # within a radius of 1 km (1000 m) from the center

def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=None, timeout=30):
    """Fetch the venues Foursquare recommends around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location; they are walked in parallel.
    radius : number, default 500
        Search radius passed to the Foursquare ``explore`` endpoint.
    limit : int, optional
        Maximum number of venues requested per location. Defaults to the
        notebook-level ``LIMIT`` constant.
    timeout : number, default 30
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (bad credentials, quota...).
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    if limit is None:
        limit = LIMIT

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=timeout)
        response.raise_for_status()

        # a location without recommendations may come back without groups/items
        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            # venues without any category get a missing value instead of an IndexError
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # naming the columns here keeps the result well-formed when venue_rows is empty
    return pd.DataFrame(venue_rows, columns=columns)
# Query Foursquare once per postal-code area of df2, searching RADIUS around its centre.
toronto_venues = getNearbyVenues(
    names=df2['Neighbourhood'],
    latitudes=df2['Latitude'],
    longitudes=df2['Longitude'],
    radius=RADIUS,
)




Rouge, Malvern
Port Union, Rouge Hill, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Golden Mile, Oakridge, Clairlea
Cliffcrest, Scarborough Village West, Cliffside
Cliffside West, Birch Cliff
Wexford Heights, Dorset Park, Scarborough Town Centre
Maryvale, Wexford
Agincourt
Sullivan, Clarks Corners, Tam O'Shanter
Milliken, Agincourt North, L'Amoreaux East, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Oriole, Henry Farm
Bayview Village
Silver Hills, York Mills
Willowdale, Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South, Flemingdon Park
Wilson Heights, Downsview North, Bathurst Manor
Northwood Park, York University
Downsview East, CFB Toronto
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

Now we can inspect the output.. 
Let's see what are the most common venues:

In [7]:
# Number of venues per category (most frequent first), as a one-column table.
Check_Venues = toronto_venues['Venue Category'].value_counts().to_frame()
Check_Venues.head(20)

Unnamed: 0,Venue Category
Coffee Shop,252
Café,143
Park,121
Pizza Place,118
Bakery,87
Italian Restaurant,85
Restaurant,73
Grocery Store,69
Sandwich Place,62
Fast Food Restaurant,61


We see that we've got some naming issues: Café and Coffee Shop are practically the same. Also, different types of restaurants are in different categories. This may be informative, but given the limitation of our sample size, I would rather focus on broader categories. Let me therefore group all the different restaurants together, and make a few additional changes (after manual inspection of all unique venue categories). 

In [8]:

def _broad_category(category):
    """Map one Foursquare venue category to the broader group used for clustering.

    Exact renames are applied first (Café -> Coffee Shop, Supermarket -> Grocery
    Store, Bar -> Pub). After that, any name containing "Restaurant" becomes
    'Restaurant', any name containing " Bar" becomes 'Pub', and any name
    containing "Gym", "Yoga" or "Dance" becomes 'Gym'. Everything else, including
    missing values, is returned unchanged.
    """
    if not isinstance(category, str):
        return category
    exact_renames = {'Café': 'Coffee Shop', 'Supermarket': 'Grocery Store', 'Bar': 'Pub'}
    category = exact_renames.get(category, category)
    if "Restaurant" in category:
        return 'Restaurant'
    if " Bar" in category:
        return 'Pub'
    if any(keyword in category for keyword in ("Gym", "Yoga", "Dance")):
        return 'Gym'
    return category


# Assign the mapped column back explicitly: an in-place replace on a column
# selected from the frame is not guaranteed to modify the frame itself.
toronto_venues['Venue Category'] = toronto_venues['Venue Category'].map(_broad_category)

# Recount the venues per (now broader) category; "type" repeats the category
# name as a regular column so it can be selected later.
Check_Venues = pd.DataFrame(toronto_venues['Venue Category'].value_counts())
Check_Venues["type"] = Check_Venues.index
Check_Venues.head(20)

Unnamed: 0,Venue Category,type
Restaurant,833,Restaurant
Coffee Shop,395,Coffee Shop
Pub,151,Pub
Park,121,Park
Pizza Place,118,Pizza Place
Gym,112,Gym
Grocery Store,96,Grocery Store
Bakery,87,Bakery
Sandwich Place,62,Sandwich Place
Pharmacy,55,Pharmacy


# Unsupervised Clustering using k-means 

For the clustering I chose to use the 7 most common venue categories (from Restaurant to Grocery Store). 
rows (neighbourhoods) will differ by the share of each of these venues.

let's build the dataframe required for the clustering

In [12]:
# Number of most common venue categories used as clustering features.
# The cell used to declare 8 but slice [0:USE_TOP-1], i.e. it always took 7
# (Restaurant ... Grocery Store); the constant now states what is actually used.
USE_TOP = 7

# Check_Venues is ordered by frequency, so its first rows are the most common categories
top_categories = Check_Venues["type"].iloc[:USE_TOP].tolist()
use_vars = top_categories + ['Neighborhood']

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# put the neighborhood name in the first column; a venue category that is itself
# called "Neighborhood" is dropped first so that it cannot collide with it
toronto_onehot = toronto_onehot.drop(columns='Neighborhood', errors='ignore')
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])

# share of each venue category among the venues of every neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

# keep the neighborhood name plus the selected categories; the copy makes
# toronto_reduced an independent frame that later cells can safely modify
toronto_reduced = toronto_grouped[['Neighborhood'] + top_categories].copy()

toronto_reduced.head()

Unnamed: 0,Neighborhood,Restaurant,Coffee Shop,Pub,Park,Pizza Place,Gym,Grocery Store
0,Agincourt,0.444444,0.022222,0.0,0.022222,0.044444,0.0,0.066667
1,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.055556,0.055556,0.0,0.055556,0.166667,0.0,0.166667
2,Bayview Village,0.307692,0.076923,0.0,0.076923,0.0,0.0,0.076923
3,Berczy Park,0.16,0.1,0.1,0.04,0.02,0.02,0.0
4,Business Reply Mail Processing Centre 969 Eastern,0.1875,0.0625,0.041667,0.104167,0.041667,0.041667,0.0


Now we can go ahead and cluster
(see Cluster Labels column below)


In [16]:
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes 
import folium # map rendering library

# set number of clusters
kclusters = 4

# Feature matrix: the venue-category shares only. 'Cluster Labels' is excluded as
# well so that re-running this cell does not cluster on its own previous output.
toronto_clustering = toronto_reduced.drop(columns=['Neighborhood', 'Cluster Labels'], errors='ignore')

# run k-means clustering; n_init is spelled out because its default differs
# between scikit-learn versions, and random_state makes the run reproducible
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_clustering)

# (re)place the labels in the first column; dropping a stale column first keeps the cell re-runnable
toronto_reduced = toronto_reduced.drop(columns='Cluster Labels', errors='ignore')
toronto_reduced.insert(0, 'Cluster Labels', kmeans.labels_)

df2 = df2.rename(columns={"Neighbourhood": "Neighborhood"})

# add the cluster label and category shares to the postal-code table (which holds
# latitude/longitude); areas missing from toronto_reduced get NaN in the new columns
toronto_merged = df2.join(toronto_reduced.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Yo


Unnamed: 0,Postcode,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,Restaurant,Coffee Shop,Pub,Park,Pizza Place,Gym,Grocery Store
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353,1.0,0.388889,0.111111,0.0,0.0,0.0,0.055556,0.0
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",M1C,43.784535,-79.160497,2.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711,2.0,0.130435,0.086957,0.043478,0.0,0.173913,0.0,0.043478
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917,0.0,0.375,0.25,0.0,0.125,0.0,0.0,0.0
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,0.0,0.266667,0.1,0.0,0.0,0.033333,0.033333,0.033333


# create a map 
visualize the clustering results using Folium 

In [17]:

map_clusters = folium.Map(location=[43.70, -79.42], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Remove the rows without a cluster label first, then make the labels integers.
# The cast cannot succeed while NaN values are still present, which is why the
# previous "cast, then drop" order silently left the labels as floats.
toronto_merged = (
    toronto_merged
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)

# add markers to the map, colored by cluster
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'],
                                  toronto_merged['Longitude'],
                                  toronto_merged['Neighborhood'],
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters



Thank you.
Be kind