# IBM Capstone W3, Clustering Toronto

#### Importing Required Packages

In [1]:

import pandas as pd
import numpy as np
import requests
import json
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
#conda install -c conda-forge folium=0.5.0
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

print("Hello Capstone Project Course W3!")

Hello Capstone Project Course W3!


#### Extracting Wikipedia Content

In [2]:
# Download the Wikipedia page that lists the postal codes starting with "M".
wiki_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
wiki_response = requests.get( wiki_url, timeout=30 )
# Fail here on an HTTP error instead of parsing an error page further down.
wiki_response.raise_for_status()
html_content = wiki_response.text
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and to avoid the "no parser specified" warning).
result = BeautifulSoup(html_content, "html.parser")


#### Initializing DataFrame

In [3]:
#Initial DataFrame
# Column names come from the <th> cells. str(cell)[4:-6] drops the leading
# "<th>" and the last six characters (presumably a newline plus "</th>").
table_header = result.find_all("th")
col_names = [ str(col)[4:-6] for col in table_header ]

#Extracting data
# The <td> cells are read in groups of three (one table row per group).
# Stopping at len - 2 guarantees that i+1 and i+2 always exist.
table_data = result.find_all("td")
numerator = range(0, len(table_data) - 2, 3 )
col1 = []; col2 = []; col3 = []
for i in numerator:
    first_cell = str(table_data[i])[4:-6]
    # Only rows whose first cell is shorter than 5 characters are kept.
    if len(first_cell) < 5 :
        col1.append( first_cell )
        col2.append( str(table_data[i+1])[4:-6] )
        col3.append( str(table_data[i+2])[4:-6] )

# Build the frame in one step rather than filling an empty frame through iloc.
data_df = pd.DataFrame( list(zip(col1, col2, col3)), columns = col_names[:3] )

#Eliminating "Not assigned" Neighbourhood
data_df = data_df[ data_df["Borough"] != "Not assigned" ].reset_index( drop=True )
# A neighbourhood that is "Not assigned" takes the name of its borough.
# .loc writes into data_df itself; chained indexing would only modify a copy.
unnamed = data_df["Neighbourhood"] == "Not assigned"
data_df.loc[ unnamed, "Neighbourhood" ] = data_df.loc[ unnamed, "Borough" ]
print("data_df shape: ", data_df.shape)


data_df shape:  (103, 3)


#### Finalizing Canada DataFrame

In [4]:
# CSV with one latitude/longitude pair per postal code.
geo_url = "http://cocl.us/Geospatial_data"
geo_coord = pd.read_csv(filepath_or_buffer=geo_url)
print("geo_coord shape: ", geo_coord.shape)

# Attach the coordinates to every row that shares a "Postal Code" value.
data_df = data_df.merge(geo_coord, on="Postal Code")
print("Final df shape:", data_df.shape)

geo_coord shape:  (103, 3)
Final df shape: (103, 5)


#### Forming Toronto DataFrame

In [5]:
#Identifying toronto sections
torontos = []
# One flag per row: True when the borough name contains "Toronto".
t_bool_list = [ "Toronto" in borough for borough in data_df["Borough"].to_list() ]

#DataFrame, toronto only
toronto_df = data_df.loc[t_bool_list].reset_index(drop=True)
print( "toronto_df shape: ", toronto_df.shape )
toronto_df.head()

toronto_df shape:  (39, 5)


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


#### FSQ API Credentials

#### Obtaining Venue Data

In [8]:
#Nearby venues data collection from FSQ API
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Collect the venues Foursquare reports around each location.

    One request is sent to the ``venues/explore`` endpoint per
    (name, latitude, longitude) triple. The module-level names CLIENT_ID,
    CLIENT_SECRET, VERSION and LIMIT are read at call time and must already
    be defined.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to query.
    radius : int, default 500
        Passed to the API as the ``radius`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but keeps these columns) when nothing is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    column_names = ['Neighbourhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        response.raise_for_status()

        groups = response.json()["response"]['groups']
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets a missing value instead of
                # aborting the whole collection with an IndexError.
                categories[0]['name'] if categories else None))

    # Passing the columns here keeps the schema even when no venue was found.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

# Query the venues around every Toronto neighbourhood (default radius).
toronto_venues = getNearbyVenues(
    toronto_df['Neighbourhood'],
    toronto_df['Latitude'],
    toronto_df['Longitude'],
)
print("toronto_venues shape: ", toronto_venues.shape)
toronto_venues.head()

toronto_venues shape:  (1600, 7)


Unnamed: 0,Neighbourhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


#### Grouping, Venue Frequencies

In [9]:
#One indicator column per venue category
venue_dummies = pd.get_dummies(
    toronto_venues[["Venue Category"]],
    prefix="",
    prefix_sep="",
)
venue_dummies["Neighbourhood"] = toronto_venues["Neighbourhood"]

#Move the last column to the front, keep the others in order
column_order = list(venue_dummies.columns)
column_order = column_order[-1:] + column_order[:-1]
toronto_d = venue_dummies[column_order]
print("Toronto data with dummies shape: ", toronto_d.shape)

#Mean of every indicator per neighbourhood = relative venue frequency
neighbourhood_groups = toronto_d.groupby("Neighbourhood")
toronto_grouped = neighbourhood_groups.mean().reset_index()
print("Toronto grouped df shape: ", toronto_grouped.shape)
toronto_grouped.head()

Toronto data with dummies shape:  (1600, 234)
Toronto grouped df shape:  (39, 234)


Unnamed: 0,Neighbourhood,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016393,0.0,0.0,0.016393,0.016393


#### Most Common Venues

In [10]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is dropped, the remaining values are sorted
    in descending order, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    labels = ranked.index.values
    return labels[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
# The first three ranks use 'st'/'nd'/'rd'; every later rank uses 'th'.
# An explicit bounds check replaces the bare except, which would also have
# hidden unrelated errors.
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
# Rank the categories of every neighbourhood first, then build the frame in
# one step instead of writing it row by row.
top_venue_rows = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venue_rows, columns=columns[1:], index=toronto_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', toronto_grouped['Neighbourhood'])

print("Venues sorted shape: ", neighborhoods_venues_sorted.shape)
neighborhoods_venues_sorted.head()

Venues sorted shape:  (39, 11)


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Restaurant,Beer Bar,Farmers Market,Seafood Restaurant,Cheese Shop,Japanese Restaurant,Park
1,"Brockton, Parkdale Village, Exhibition Place",Café,Performing Arts Venue,Breakfast Spot,Coffee Shop,Bakery,Stadium,Burrito Place,Restaurant,Climbing Gym,Pet Store
2,"Business reply mail Processing Centre, South C...",Light Rail Station,Skate Park,Garden Center,Auto Workshop,Burrito Place,Fast Food Restaurant,Farmers Market,Garden,Pizza Place,Brewery
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Sculpture Garden,Harbor / Marina,Rental Car Location,Plane,Coffee Shop,Boat or Ferry,Boutique
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Salad Place,Thai Restaurant,Bubble Tea Shop,Burger Joint,Yoga Studio,Ramen Restaurant


#### Clustering Neighbourhoods

In [11]:
#Clusters
n_cluster = 5
# Cluster on the venue-frequency columns only (the name column is dropped).
cluster_features = toronto_grouped.drop( columns="Neighbourhood" )
# n_init is set explicitly so the result does not depend on the default of
# the installed scikit-learn version.
kmeans = KMeans( n_clusters=n_cluster, random_state=0, n_init=10 ).fit( cluster_features )

# DataFrame.insert raises if the column already exists, so drop a label
# column left over from a previous run; this keeps the cell re-runnable.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop( columns='Cluster Labels' )
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the label and the top venues to each postal-code row by neighbourhood name.
toronto_ = toronto_df.join( neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood' )
toronto_.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,1,Coffee Shop,Park,Bakery,Breakfast Spot,Café,Pub,Theater,Yoga Studio,Shoe Store,Brewery
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Yoga Studio,Diner,Restaurant,Park,Mexican Restaurant,Japanese Restaurant,Italian Restaurant,Fried Chicken Joint
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,1,Clothing Store,Coffee Shop,Cosmetics Shop,Bubble Tea Shop,Café,Middle Eastern Restaurant,Japanese Restaurant,Italian Restaurant,Lingerie Store,Pizza Place
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,Café,Coffee Shop,American Restaurant,Gastropub,Cocktail Bar,Gym,Restaurant,Moroccan Restaurant,Cosmetics Shop,Creperie
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Neighborhood,Trail,Health Food Store,Yoga Studio,Doner Restaurant,Discount Store,Distribution Center,Dog Run,Dumpling Restaurant


#### Map

In [12]:
#Map center
# Mean of the distinct coordinate values.
latitude = toronto_["Latitude"].unique().mean()
longitude = toronto_["Longitude"].unique().mean()

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, n_cluster))
rainbow = [colors.rgb2hex(i) for i in colors_array]

#Map markers
# Rows whose neighbourhood did not match in the join have no cluster label
# and cannot be coloured, so they are left off the map.
labelled = toronto_.dropna(subset=['Cluster Labels'])
for lat, lon, poi, cluster in zip(labelled['Latitude'], labelled['Longitude'], labelled['Neighbourhood'], labelled['Cluster Labels']):
    # Labels start at 0, so they index the colour list directly.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters