# Segmenting and Clustering Neighborhoods in Toronto

In this notebook I cluster Toronto neighborhoods by the similarity of their local venues, using the Foursquare API, as part of a Coursera Capstone Project.

1. Installing and importing all necessary libraries

In [1]:
!pip install bs4
!pip install pgeocode
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files

#!pip install geopy
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import pgeocode #converting zipcodes to latitude and longitude

import requests # library to handle requests
from pandas import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!pip install folium
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1272 sha256=64ed0e9843b70caa853fc72f293558f5ecd0d2f0af3ab9368e5d53fd93e76500
  Stored in directory: /tmp/wsuser/.cache/pip/wheels/0a/9e/ba/20e5bbc1afef3a491f0b3bb74d508f99403aabe76eda2167ca
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1
Collecting pgeocode
  Downloading pgeocode-0.3.0-py3-none-any.whl (8.5 kB)
Installing collected packages: pgeocode
Successfully installed pgeocode-0.3.0
Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 5.4 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


## 2. Fetching neighborhoods in Toronto from Wiki

In [2]:
# Wikipedia page listing the Toronto ("M") postal codes with their boroughs and neighborhoods
data_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Download the page; fail loudly on HTTP errors or a hanging connection instead of
# silently parsing an error page further down.
response = requests.get(data_url, timeout=30)
response.raise_for_status()
data = response.text

soup = BeautifulSoup(data, "html5lib")  # parse the HTML
tables = soup.find_all('table')  # every <table> element on the page
len(tables)

3

## 3. Getting the right table from the soup

In [3]:
# Index of the FIRST table whose markup mentions "Toronto"; None when there is no such table.
table_index = next(
    (index for index, table in enumerate(tables) if "Toronto" in str(table)),
    None,
)
if table_index is None:
    # Without this check the following cells would fail with an unrelated-looking error.
    raise ValueError('No table containing "Toronto" was found on the page')
print(table_index)

0


## 4. Convert Data to Dataframe

In [4]:
# Collect one record per table row and build the DataFrame once at the end
# (appending to a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas 2.x).
records = []

for row in tables[table_index].tbody.find_all("tr"):
    # Stripped text of every <td> cell in this row
    cells = [cell.text.strip() for cell in row.find_all("td")]
    if len(cells) < 3:
        # Header rows (only <th> cells), empty rows and malformed short rows
        continue

    postal_code, borough, neighborhood = cells[:3]
    if borough == "Not assigned":
        # Postal codes without a borough are ignored
        continue
    if neighborhood == "Not assigned":
        # A borough without a named neighborhood uses the borough name instead
        neighborhood = borough

    records.append({"PostalCode": postal_code, "Borough": borough, "Neighborhood": neighborhood})

toronto_data = pd.DataFrame(records, columns=["PostalCode", "Borough", "Neighborhood"])

toronto_data

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## 5. Check if any "Not assigned" made it through: 

In [5]:
# Rows whose borough is still the "Not assigned" placeholder (expected: none)
toronto_data.loc[toronto_data["Borough"].eq("Not assigned")]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [6]:
# Rows whose neighborhood is still the "Not assigned" placeholder (expected: none)
toronto_data.loc[toronto_data["Neighborhood"].eq("Not assigned")]

Unnamed: 0,PostalCode,Borough,Neighborhood


## Answer 1
Check the shape of the generated Dataframe

In [7]:
# Dimensions of the cleaned table as a (rows, columns) tuple
toronto_data.shape

(103, 3)

In [8]:
# Postal-code geocoder for Canada
nomi = pgeocode.Nominatim('ca')

# Look up each postal code and write its coordinates back onto the same row.
# `latitude` / `longitude` keep the values of the last row processed.
for index, postal_code in toronto_data['PostalCode'].items():
    location = nomi.query_postal_code(str(postal_code))
    longitude, latitude = location.longitude, location.latitude
    toronto_data.at[index, 'Longitude'] = longitude
    toronto_data.at[index, 'Latitude'] = latitude

## Check for any NaN

In [9]:
# Rows for which the geocoder returned no longitude
toronto_data.loc[toronto_data["Longitude"].isna()]

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,,


Since pgeocode could not resolve this postal code, I will instead insert its coordinates manually from the CSV.

In [10]:
 # M7R,43.6369656,-79.615819 is the data to add 
# Select the row by its postal code rather than by a hard-coded row label, and
# store the coordinates as floats so both columns stay numeric.
is_m7r = toronto_data['PostalCode'] == 'M7R'
toronto_data.loc[is_m7r, ['Longitude', 'Latitude']] = [-79.615819, 43.6369656]

toronto_data[toronto_data["Longitude"].isnull()] #Check again for NAN

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude


## Creating the Map containing the neighborhoods

In [11]:
# Centre the map on the mean of all neighborhood coordinates. This replaces the
# `latitude` / `longitude` variables left over from the geocoding loop, which only
# held the coordinates of the last postal code processed.
map_center = [
    toronto_data['Latitude'].astype(float).mean(),
    toronto_data['Longitude'].astype(float).mean(),
]
map_toronto = folium.Map(location=map_center, zoom_start=10)

# One circle marker per postal-code area, labelled "<neighborhood>, <borough>"
for lat, lng, borough, neighborhood in zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Borough'],
    toronto_data['Neighborhood'],
):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

## Now create a DataFrame containing all venues within a 500 m radius of each neighborhood

In [38]:
import os
import warnings

# Foursquare credentials are read from the environment so that they are never
# stored in (or published with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here should be treated as
# leaked and revoked in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected.'
    )

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint once per location and collect the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each location to query; they are zipped together.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame has
        these columns even when no venue was returned at all.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers with an error status (e.g. invalid credentials).

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings and
    prints each location name as a progress indicator.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting
        # the credentials into the URL by hand.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(explore_url, params=query, timeout=30)
        # Surface API errors here rather than as a KeyError on the JSON below
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets None instead of raising IndexError
                categories[0]['name'] if categories else None))

    # Passing `columns=` keeps the schema intact even when there are no rows
    return pd.DataFrame(venue_rows, columns=venue_columns)

# Venues around every neighborhood (default search radius)
toronto_venues = getNearbyVenues(
    toronto_data['Neighborhood'],
    toronto_data['Latitude'],
    toronto_data['Longitude'],
)

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [47]:
# Preview the first five venue rows
toronto_venues.head(5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.7545,-79.33,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.7545,-79.33,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.7545,-79.33,Yorkmills Wellness & Spa,43.7568,-79.325346,Spa
3,Victoria Village,43.7276,-79.3148,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.7276,-79.3148,Portugril,43.725819,-79.312785,Portuguese Restaurant


## Creating a DataFrame with numeric data for each venue (solved with dummy variables)

In [17]:

# One indicator column per venue category (column name = category name)
venue_dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighborhood of each venue ...
venue_dummies['Neighborhood'] = toronto_venues['Neighborhood']

# ... and make it the leading column, keeping the category columns in their order
category_columns = [name for name in venue_dummies.columns if name != 'Neighborhood']
toronto_onehot = venue_dummies[['Neighborhood'] + category_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Group by neighborhood and calculate the mean of every venue category to get a neighborhood 'score' based on nearby venues

In [19]:
# Mean of every indicator column per neighborhood, i.e. the share of each venue
# category among that neighborhood's venues; 'Neighborhood' stays a regular column.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create a DataFrame containing the 10 most common venues for each neighborhood

In [26]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped, the remaining entries are sorted by
    value in descending order, and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st Most Common Venue', '2nd ...', '3rd ...', then 'Nth ...'.
# The suffix is chosen with an explicit bounds check instead of a bare `except:`,
# which would also have hidden unrelated errors.
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Top categories of every neighborhood, in the row order of toronto_grouped
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[position, :], num_top_venues)
    for position in range(toronto_grouped.shape[0])
]

# Build the frame in one go rather than filling an all-NaN frame row by row
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Latin American Restaurant,Breakfast Spot,Badminton Court,Skating Rink,Newsagent,Yoga Studio,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant
1,"Alderwood, Long Branch",Pizza Place,Gym,Sandwich Place,Coffee Shop,Pub,Athletics & Sports,Convenience Store,Dance Studio,Donut Shop,Field
2,"Bathurst Manor, Wilson Heights, Downsview North",Pizza Place,Fried Chicken Joint,Mediterranean Restaurant,Middle Eastern Restaurant,Coffee Shop,Deli / Bodega,Eastern European Restaurant,Electronics Store,Escape Room,Ethiopian Restaurant
3,Bayview Village,Dog Run,Flower Shop,Gas Station,Park,Trail,Escape Room,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
4,"Bedford Park, Lawrence Manor East",Coffee Shop,Sandwich Place,Italian Restaurant,Indian Restaurant,Fast Food Restaurant,Butcher,Liquor Store,Café,Restaurant,Sushi Restaurant


## Now we'll cluster the neighborhoods into 5 groups based on their similarity

In [27]:
# set number of clusters
kclusters = 5

# Feature matrix: every column of toronto_grouped except the neighborhood name
# (keyword form, because the positional `axis` argument was removed in pandas 2.0)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is spelled out so the result does not depend on
# the default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# add clustering labels as the first column; drop a previous copy first so the
# cell can be re-run without "column already exists" errors
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach cluster label and top venues to the postal-code table (which carries the
# latitude/longitude); neighborhoods missing from neighborhoods_venues_sorted get NaN
toronto_merged = toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,-79.33,43.7545,2.0,Spa,Food & Drink Shop,Park,Doner Restaurant,Filipino Restaurant,Field,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
1,M4A,North York,Victoria Village,-79.3148,43.7276,0.0,Hockey Arena,Intersection,Pizza Place,Park,Coffee Shop,Portuguese Restaurant,French Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",-79.3626,43.6555,0.0,Coffee Shop,Breakfast Spot,Yoga Studio,Theater,Gym / Fitness Center,Greek Restaurant,Italian Restaurant,Food Truck,Event Space,Electronics Store
3,M6A,North York,"Lawrence Manor, Lawrence Heights",-79.4504,43.7223,0.0,Clothing Store,Coffee Shop,Women's Store,Restaurant,Cosmetics Shop,Sandwich Place,Men's Store,Food Court,Jewelry Store,Juice Bar
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",-79.3889,43.6641,0.0,Sushi Restaurant,Italian Restaurant,Beer Bar,Burrito Place,Martial Arts School,Café,Escape Room,Mexican Restaurant,Chinese Restaurant,Ramen Restaurant


## Finally we create a map displaying all the different clusters

In [56]:
# Rows without a cluster label (NaN) would break the map below, so they are removed.
# They are selected by the NaN itself rather than by hard-coded row labels, and the
# result is reassigned instead of dropped in place, so the cell can be re-run safely.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])

# Confirm that no unlabeled rows are left (expected: empty table)
toronto_merged[toronto_merged["Cluster Labels"].isnull()]


Unnamed: 0,PostalCode,Borough,Neighborhood,Longitude,Latitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [60]:

# create map, centred on the mean of the plotted coordinates (instead of the
# `latitude` / `longitude` variables left over from the geocoding loop)
map_center = [
    toronto_merged['Latitude'].astype(float).mean(),
    toronto_merged['Longitude'].astype(float).mean(),
]
map_clusters = folium.Map(location=map_center, zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(
    toronto_merged['Latitude'],
    toronto_merged['Longitude'],
    toronto_merged['Neighborhood'],
    toronto_merged['Cluster Labels'].astype('int32'),
):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (indexing with cluster-1 made cluster 0 wrap around to the last color).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# DONE