<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>

Let's first prepare the Toronto data

In [2]:
!pip install lxml

import pandas as pd
import numpy as np
import lxml

# Read the postal-code table (codes starting with "M") from Wikipedia;
# pd.read_html returns a list of tables and the first one is used.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
data = pd.read_html(url, header=0)
df = data[0]

# rename the columns
df = df.rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})

# delete the rows with unassigned Borough
df = df[df['Borough'] != 'Not assigned']

# collapse to one row per (PostalCode, Borough), joining the neighborhood names with commas
df = df.groupby(['PostalCode', 'Borough']).agg({'Neighborhood': ','.join})

# The group keys (PostalCode, Borough) have become the index of the dataframe,
# so we reset it to get them back as regular columns for the next operations
df = df.reset_index()

# Next we replace the neighborhoods with unassigned values with the name of the borough as per assignment.
# A single .loc assignment is used instead of chained indexing (df[col][mask] = ...),
# which may write to a temporary copy and triggers SettingWithCopyWarning.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

# latitude / longitude per postal code
coords = pd.read_csv('https://cocl.us/Geospatial_data')
coords = coords.rename(columns={'Postal Code': 'PostalCode'})

# left join keeps every postal code from df, even if it has no coordinates
df1 = pd.merge(df, coords, on='PostalCode', how='left')
df1.head()

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 30.8MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Let's import the required libraries.

In [3]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift the display limits so DataFrames are shown with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    numpy-1.18.1               |   py36h95a1406_0         5.2 MB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        

Next, let's get the coordinates for Toronto.

In [4]:
# Resolve the city name to latitude / longitude with the Nominatim geocoder.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the attribute access below.
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


Now let's create a map of Toronto and add markers for each neighborhood.

In [5]:
# Base map positioned at (latitude, longitude).
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row of df1, with a "<neighborhood>, <borough>" popup.
marker_rows = zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighborhood'])
for lat, lng, borough, neighborhood in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one - kept as in the original; confirm it is needed.
        parse_html=False)
    marker.add_to(map_toronto)

# display the map
map_toronto

In [6]:
# @hidden_cell
import os
import warnings

# Credentials are read from the environment so they are never stored in the notebook.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
                  'Foursquare API calls will fail.')

In [7]:
# Keep only the rows whose Borough is Etobicoke and renumber them from 0.
is_etobicoke = df1['Borough'].eq('Etobicoke')
etobicoke_data = df1.loc[is_etobicoke].reset_index(drop=True)
etobicoke_data.head(7)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M8V,Etobicoke,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321
1,M8W,Etobicoke,"Alderwood,Long Branch",43.602414,-79.543484
2,M8X,Etobicoke,"The Kingsway,Montgomery Road,Old Mill North",43.653654,-79.506944
3,M8Y,Etobicoke,"Humber Bay,King's Mill Park,Kingsway Park Sout...",43.636258,-79.498509
4,M8Z,Etobicoke,"Kingsway Park South West,Mimico NW,The Queensw...",43.628841,-79.520999
5,M9B,Etobicoke,"Cloverdale,Islington,Martin Grove,Princess Gar...",43.650943,-79.554724
6,M9C,Etobicoke,"Bloordale Gardens,Eringate,Markland Wood,Old B...",43.643515,-79.577201


In [8]:
# Hard-coded reference point for Etobicoke.
# The "_scar" names are kept as-is because other cells read them.
address_scar = 'Etobicoke,Toronto'
latitude_scar, longitude_scar = 43.605647, -79.501321

coordinate_message = 'The geograpical coordinate of Etobicoke are {}, {}.'
print(coordinate_message.format(latitude_scar, longitude_scar))

The geograpical coordinate of Etobicoke are 43.605647, -79.501321.


In [9]:

# Map positioned at the Etobicoke reference point.
map_scarb = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=12)

# One circle marker per Etobicoke row; the popup shows the neighborhood name(s).
marker_rows = zip(etobicoke_data['Latitude'],
                  etobicoke_data['Longitude'],
                  etobicoke_data['Neighborhood'])
for lat, lng, neighborhood in marker_rows:
    popup = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7)
    marker.add_to(map_scarb)

# display the map
map_scarb

In [10]:
# Coordinates and name of the first row (label 0) of the Etobicoke data.
neighborhood_latitude = etobicoke_data.at[0, 'Latitude']
neighborhood_longitude = etobicoke_data.at[0, 'Longitude']
neighborhood_name = etobicoke_data.at[0, 'Neighborhood']

summary = 'Latitude and longitude values of "{}" are {}, {}.'
print(summary.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of "Humber Bay Shores,Mimico South,New Toronto" are 43.6056466, -79.50132070000001.


In [11]:
LIMIT = 100   # sent as the `limit` query parameter
radius = 500  # sent as the `radius` query parameter

# Foursquare "explore" request around the Etobicoke reference point.
url = (
    'https://api.foursquare.com/v2/venues/explore'
    '?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'
).format(CLIENT_ID, CLIENT_SECRET, latitude_scar, longitude_scar, VERSION, radius, LIMIT)

In [12]:
# Fetch the explore results. A timeout keeps the cell from hanging, and
# raise_for_status() reports an HTTP error here instead of as a KeyError
# when the payload is unpacked later.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [13]:
def get_category_type(row):
    """Return the name of the first category of a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must provide either a 'categories' or a 'venue.categories' entry
        holding a list of dicts, each with a 'name' key.

    Returns
    -------
    The 'name' of the first category, or None when the entry is empty or
    is not a list/tuple (e.g. a missing value).

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate.
        categories_list = row['venue.categories']

    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [14]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Venue items of the first group in the explore response.
venues = results['response']['groups'][0]['items']

# columns to keep from the flattened JSON
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# reduce each row's category list to the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of every column name
nearby_venues = nearby_venues.rename(columns=lambda col: col.rsplit('.', 1)[-1])

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,LCBO,Liquor Store,43.602281,-79.499302
1,New Toronto Fish & Chips,Restaurant,43.601849,-79.503281
2,Domino's Pizza,Pizza Place,43.601583,-79.500905
3,Delicia Bakery & Pastry,Bakery,43.601403,-79.503012
4,Lucky Dice Restaurant,Café,43.601392,-79.503056
5,Popeyes Louisiana Kitchen,Fried Chicken Joint,43.602069,-79.4994
6,McDonald's,Fast Food Restaurant,43.602464,-79.498859
7,Shoppers Drug Mart,Pharmacy,43.601677,-79.502239
8,Coffee Time,Coffee Shop,43.602284,-79.499857
9,Halibut House Fish and Chips Inc.,Seafood Restaurant,43.60196,-79.501147


In [15]:
# number of rows in nearby_venues == number of venues kept from the response
venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))

15 venues were returned by Foursquare.


In [16]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT when
    building each request URL, and prints every name as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one request is made per (name, lat, lng) triple.
    radius : optional
        Value sent as the `radius` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was returned.
        'Venue Category' is None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; surface HTTP failures instead of a KeyError below
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue may come back without any category
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor also works for zero rows
    return pd.DataFrame(rows, columns=column_names)

In [17]:
# Collect the venues around every Etobicoke row (radius left at the function's default).
etobicoke_venues = getNearbyVenues(
    etobicoke_data['Neighborhood'],
    etobicoke_data['Latitude'],
    etobicoke_data['Longitude'],
)

Humber Bay Shores,Mimico South,New Toronto
Alderwood,Long Branch
The Kingsway,Montgomery Road,Old Mill North
Humber Bay,King's Mill Park,Kingsway Park South East,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea
Kingsway Park South West,Mimico NW,The Queensway West,Royal York South West,South of Bloor
Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park
Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe
Westmount
Kingsview Village,Martin Grove Gardens,Richview Gardens,St. Phillips
Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown
Northwest


In [18]:
# preview the first five venue rows
etobicoke_venues.head(5)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,LCBO,43.602281,-79.499302,Liquor Store
1,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,New Toronto Fish & Chips,43.601849,-79.503281,Restaurant
2,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,Domino's Pizza,43.601583,-79.500905,Pizza Place
3,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,Delicia Bakery & Pastry,43.601403,-79.503012,Bakery
4,"Humber Bay Shores,Mimico South,New Toronto",43.605647,-79.501321,Lucky Dice Restaurant,43.601392,-79.503056,Café


In [19]:
# non-null count of every column, per neighborhood
etobicoke_venues.groupby(by='Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Albion Gardens,Beaumond Heights,Humbergate,Jamestown,Mount Olive,Silverstone,South Steeles,Thistletown",9,9,9,9,9,9
"Alderwood,Long Branch",11,11,11,11,11,11
"Bloordale Gardens,Eringate,Markland Wood,Old Burnhamthorpe",7,7,7,7,7,7
"Cloverdale,Islington,Martin Grove,Princess Gardens,West Deane Park",1,1,1,1,1,1
"Humber Bay Shores,Mimico South,New Toronto",15,15,15,15,15,15
"Humber Bay,King's Mill Park,Kingsway Park South East,Mimico NE,Old Mill South,The Queensway East,Royal York South East,Sunnylea",2,2,2,2,2,2
"Kingsview Village,Martin Grove Gardens,Richview Gardens,St. Phillips",4,4,4,4,4,4
"Kingsway Park South West,Mimico NW,The Queensway West,Royal York South West,South of Bloor",15,15,15,15,15,15
Northwest,2,2,2,2,2,2
"The Kingsway,Montgomery Road,Old Mill North",3,3,3,3,3,3


In [21]:
# number of distinct values in the 'Venue Category' column
unique_category_count = len(etobicoke_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(unique_category_count))

There are 44 uniques categories.


In [23]:
# one hot encoding
etob_onehot = pd.get_dummies(etobicoke_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
etob_onehot['Neighborhood'] = etobicoke_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [etob_onehot.columns[-1]] + list(etob_onehot.columns[:-1])
etob_onehot = etob_onehot[fixed_columns]

etob_grouped = etob_onehot.groupby('Neighborhood').mean().reset_index()
etob_grouped.head(7)

Unnamed: 0,Neighborhood,Athletics & Sports,Bakery,Baseball Field,Beer Store,Burger Joint,Burrito Place,Business Service,Café,Chinese Restaurant,Coffee Shop,Construction & Landscaping,Convenience Store,Cosmetics Shop,Dance Studio,Discount Store,Drugstore,Fast Food Restaurant,Flower Shop,Fried Chicken Joint,Golf Course,Grocery Store,Gym,Hardware Store,Intersection,Liquor Store,Mexican Restaurant,Mobile Phone Shop,Park,Pet Store,Pharmacy,Pizza Place,Pool,Pub,Rental Car Location,Restaurant,River,Sandwich Place,Seafood Restaurant,Skating Rink,Smoke Shop,Social Club,Supplement Shop,Thrift / Vintage Store,Wings Joint
0,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood,Long Branch",0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.181818,0.090909,0.090909,0.0,0.0,0.0,0.090909,0.0,0.090909,0.0,0.0,0.0,0.0,0.0
2,"Bloordale Gardens,Eringate,Markland Wood,Old B...",0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.142857,0.0,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cloverdale,Islington,Martin Grove,Princess Gar...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Humber Bay Shores,Mimico South,New Toronto",0.0,0.066667,0.0,0.0,0.0,0.0,0.066667,0.133333,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.066667,0.0,0.0,0.066667,0.0,0.0,0.066667,0.066667,0.0,0.0,0.066667,0.066667,0.066667,0.0,0.0,0.0,0.066667,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0
5,"Humber Bay,King's Mill Park,Kingsway Park Sout...",0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,"Kingsview Village,Martin Grove Gardens,Richvie...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of a row, skipping its first element.

    The first element of `row` is dropped, the remaining values are sorted in
    descending order, and the index labels of the first `num_top_venues`
    entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [25]:
num_top_venues = 10

# suffixes for ranks 1, 2 and 3; every other rank gets "th"
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # explicit bounds check instead of catching the IndexError with a bare except
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = etob_grouped['Neighborhood']

# fill each row with the top category labels of the matching etob_grouped row
for ind in np.arange(etob_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(etob_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",Grocery Store,Pharmacy,Fried Chicken Joint,Beer Store,Fast Food Restaurant,Sandwich Place,Coffee Shop,Pizza Place,Flower Shop,Drugstore
1,"Alderwood,Long Branch",Pizza Place,Gym,Pub,Coffee Shop,Dance Studio,Pharmacy,Pool,Athletics & Sports,Sandwich Place,Skating Rink
2,"Bloordale Gardens,Eringate,Markland Wood,Old B...",Cosmetics Shop,Beer Store,Liquor Store,Café,Coffee Shop,Convenience Store,Pizza Place,Wings Joint,Fried Chicken Joint,Flower Shop
3,"Cloverdale,Islington,Martin Grove,Princess Gar...",Golf Course,Wings Joint,Thrift / Vintage Store,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store,Dance Studio,Cosmetics Shop
4,"Humber Bay Shores,Mimico South,New Toronto",Café,Gym,Restaurant,Fast Food Restaurant,Liquor Store,Mexican Restaurant,Pet Store,Pharmacy,Coffee Shop,Pizza Place
5,"Humber Bay,King's Mill Park,Kingsway Park Sout...",Construction & Landscaping,Baseball Field,Convenience Store,Golf Course,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store,Dance Studio
6,"Kingsview Village,Martin Grove Gardens,Richvie...",Pizza Place,Sandwich Place,Mobile Phone Shop,Park,Wings Joint,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store,Dance Studio
7,"Kingsway Park South West,Mimico NW,The Queensw...",Wings Joint,Hardware Store,Bakery,Burger Joint,Burrito Place,Convenience Store,Discount Store,Fast Food Restaurant,Flower Shop,Grocery Store
8,Northwest,Drugstore,Rental Car Location,Wings Joint,Construction & Landscaping,Fried Chicken Joint,Flower Shop,Fast Food Restaurant,Discount Store,Dance Studio,Cosmetics Shop
9,"The Kingsway,Montgomery Road,Old Mill North",Smoke Shop,River,Park,Wings Joint,Construction & Landscaping,Flower Shop,Fast Food Restaurant,Drugstore,Discount Store,Dance Studio
