<h1 align=center><font size = 5>Segmenting and Clustering Neighborhoods in Toronto</font></h1>

In [1]:
import numpy as np

import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


import json 

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library



## Project description

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Download and Explore Dataset</a>

2. <a href="#item2">Clean Data</a>

3. <a href="#item2">Explore Neighbourhoods in Toronto</a>

4. <a href="#item3">Analyze Each Neighbourhood</a>

5. <a href="#item4">Cluster Neighbourhoods</a>

6.  <a href="#item5">Examine Clusters</a>    
</font>
</div>

## 1. Download and Explore Dataset

In [2]:
Data_health=pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_total_health_expenditure_per_capita')[5]
Data_health.head(190)

Unnamed: 0,Countries,2000,2005,2010,2015
0,Afghanistan,:,24,46,60
1,Albania,76,166,204,266
2,Algeria,61,100,228,292
3,Andorra,2051,4037,4596,4316
4,Angola,14,57,97,109
5,Antigua and Barbuda,381,490,673,657
6,Argentina,418,327,699,998
7,Armenia,41,116,169,366
8,Australia,1632,3004,4953,4934
9,Austria,2263,3676,4725,4536


In [3]:
Data_health.shape

(189, 5)

In [4]:

Data_Cost_Life=pd.read_html('https://www.numbeo.com/cost-of-living/rankings_by_country.jsp')[2]
Data_Cost_Life.head(140)

Unnamed: 0,Rank,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,,Cayman Islands,141.64,74.4,109.49,157.93,105.03,137.88
1,,Bermuda,138.22,87.62,114.03,124.87,150.4,79.87
2,,Switzerland,122.67,50.22,88.03,121.29,124.04,127.76
3,,Norway,104.49,37.26,72.34,93.94,115.59,98.0
4,,Us Virgin Islands,97.23,48.24,73.81,87.42,90.74,54.58
5,,Iceland,97.22,45.08,72.29,84.63,111.46,92.03
6,,Bahamas,85.96,35.4,61.79,68.84,87.2,56.21
7,,Japan,85.52,28.08,58.06,84.69,50.05,97.57
8,,Luxembourg,84.68,55.44,70.7,71.6,92.91,107.89
9,,Denmark,83.88,33.73,59.9,64.02,100.63,110.69


In [5]:
Data_Cost_Life.shape

(136, 8)

In [6]:
Data_Life_Expentancy=pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_life_expectancy')[1]
Data_Life_Expentancy.head(196)

Unnamed: 0_level_0,Rank,Country/Region,Life expectancy at birth (in years),Life expectancy at birth (in years),Life expectancy at birth (in years)
Unnamed: 0_level_1,Rank,Country/Region,Overall,Female,Male
0,1,Hong Kong,84.7,87.6,81.8
1,2,Japan,84.5,87.5,81.3
2,3,Switzerland,83.6,85.5,81.7
3,4,Singapore,83.5,85.6,81.3
4,5,Italy,83.4,85.4,81.1
5,5,Spain,83.4,86.1,80.7
6,7,Australia,83.3,85.3,81.3
7,8,Iceland,82.9,84.4,81.3
8,9,Israel,82.8,84.4,81.1
9,9,South Korea,82.8,85.8,79.7


In [7]:
Data_Life_Expentancy.shape

(196, 5)

In [8]:
Country_GeoData=pd.read_html('https://developers.google.com/public-data/docs/canonical/countries_csv')[0]
Country_GeoData.rename(columns={'name':'Country'}, inplace=True)
Country_GeoData.head(100)


Unnamed: 0,country,latitude,longitude,Country
0,AD,42.546245,1.601554,Andorra
1,AE,23.424076,53.847818,United Arab Emirates
2,AF,33.93911,67.709953,Afghanistan
3,AG,17.060816,-61.796428,Antigua and Barbuda
4,AI,18.220554,-63.068615,Anguilla
5,AL,41.153332,20.168331,Albania
6,AM,40.069099,45.038189,Armenia
7,AN,12.226079,-69.060087,Netherlands Antilles
8,AO,-11.202692,17.873887,Angola
9,AQ,-75.250973,-0.071389,Antarctica


## 2. Clean data

In [9]:
Data_health.reset_index(inplace=True)
Data_health.drop(columns=['index'], inplace=True)
Data_health.rename(columns={'Countries':'Country'}, inplace=True)
Data_health.replace(':', 0, inplace=True)
Data_health.loc[180, 'Country']='United States'
Data_health.head(200)

Unnamed: 0,Country,2000,2005,2010,2015
0,Afghanistan,0,24,46,60
1,Albania,76,166,204,266
2,Algeria,61,100,228,292
3,Andorra,2051,4037,4596,4316
4,Angola,14,57,97,109
5,Antigua and Barbuda,381,490,673,657
6,Argentina,418,327,699,998
7,Armenia,41,116,169,366
8,Australia,1632,3004,4953,4934
9,Austria,2263,3676,4725,4536


In [10]:
#Data_Cost_Life.drop(columns=['Rank'], inplace=True)
Data_Cost_Life.rename(columns={'Countries':'Country'}, inplace=True)
Data_Cost_Life.replace(':', 0, inplace=True)
Data_Cost_Life.head(100)

Unnamed: 0,Rank,Country,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,,Cayman Islands,141.64,74.4,109.49,157.93,105.03,137.88
1,,Bermuda,138.22,87.62,114.03,124.87,150.4,79.87
2,,Switzerland,122.67,50.22,88.03,121.29,124.04,127.76
3,,Norway,104.49,37.26,72.34,93.94,115.59,98.0
4,,Us Virgin Islands,97.23,48.24,73.81,87.42,90.74,54.58
5,,Iceland,97.22,45.08,72.29,84.63,111.46,92.03
6,,Bahamas,85.96,35.4,61.79,68.84,87.2,56.21
7,,Japan,85.52,28.08,58.06,84.69,50.05,97.57
8,,Luxembourg,84.68,55.44,70.7,71.6,92.91,107.89
9,,Denmark,83.88,33.73,59.9,64.02,100.63,110.69


In [11]:
Data_Life_Expentancy.columns = Data_Life_Expentancy.columns.droplevel()
Data_Life_Expentancy.replace(':', 0, inplace=True)
Data_Life_Expentancy.reset_index(0).reset_index(drop=True)
Data_Life_Expentancy.rename(columns={'Country/Region':'Country'}, inplace=True)
Data_Life_Expentancy.head(200)

Unnamed: 0,Rank,Country,Overall,Female,Male
0,1,Hong Kong,84.7,87.6,81.8
1,2,Japan,84.5,87.5,81.3
2,3,Switzerland,83.6,85.5,81.7
3,4,Singapore,83.5,85.6,81.3
4,5,Italy,83.4,85.4,81.1
5,5,Spain,83.4,86.1,80.7
6,7,Australia,83.3,85.3,81.3
7,8,Iceland,82.9,84.4,81.3
8,9,Israel,82.8,84.4,81.1
9,9,South Korea,82.8,85.8,79.7


In [12]:

Data_health=Data_health.join(Data_Life_Expentancy.set_index('Country'), on='Country', how='left', lsuffix='_left', rsuffix='_right')
Data_health=Data_health.join(Data_Cost_Life.set_index('Country'), on='Country', how='left', lsuffix='_left', rsuffix='_right')
Data_health=Data_health.join(Country_GeoData.set_index('Country'), on='Country', how='left', lsuffix='_left', rsuffix='_right')


In [13]:
Data_health.drop(columns=['Rank_left','Female', 'Male','Rank_right', 'Rent Index','Cost of Living Plus Rent Index', 'Groceries Index', 'Restaurant Price Index', 'Restaurant Price Index', 'Local Purchasing Power Index','country','2000','2005','2010'], inplace=True)
Data_health.head(110)

Unnamed: 0,Country,2015,Overall,Cost of Living Index,latitude,longitude
0,Afghanistan,60,64.5,24.51,33.93911,67.709953
1,Albania,266,78.5,36.63,41.153332,20.168331
2,Algeria,292,76.7,30.54,28.033886,1.659626
3,Andorra,4316,81.8,,42.546245,1.601554
4,Angola,109,60.8,,-11.202692,17.873887
5,Antigua and Barbuda,657,76.9,,17.060816,-61.796428
6,Argentina,998,76.5,38.72,-38.416097,-63.616672
7,Armenia,366,74.9,32.7,40.069099,45.038189
8,Australia,4934,83.3,73.39,-25.274398,133.775136
9,Austria,4536,81.4,72.15,47.516231,14.550072


In [14]:
Data_health.dropna(inplace=True)
Data_health.rename(columns={'2015':'Health Care Expenses','latitude':'Latitude','longitude':'Longitude'}, inplace=True)
Data_health.reset_index(inplace=True,drop=True)
Data_health.head(200)

Unnamed: 0,Country,Health Care Expenses,Overall,Cost of Living Index,Latitude,Longitude
0,Afghanistan,60,64.5,24.51,33.93911,67.709953
1,Albania,266,78.5,36.63,41.153332,20.168331
2,Algeria,292,76.7,30.54,28.033886,1.659626
3,Argentina,998,76.5,38.72,-38.416097,-63.616672
4,Armenia,366,74.9,32.7,40.069099,45.038189
5,Australia,4934,83.3,73.39,-25.274398,133.775136
6,Austria,4536,81.4,72.15,47.516231,14.550072
7,Azerbaijan,368,72.9,29.81,40.143105,47.576927
8,Bahamas,1685,73.8,85.96,25.03428,-77.39628
9,Bahrain,1190,77.2,58.36,25.930414,50.637772


In [15]:
Data_health.shape

(111, 6)

## 3. Explore neighbourhoods in Toronto

In [16]:
World_map = folium.Map(location=[0, 0], zoom_start=2)
color='blue'

# add markers to map
for lat, lng, country in zip(Data_health['Latitude'], Data_health['Longitude'] , Data_health['Country']):
    label = '{}'.format(country)
    label = folium.Popup(label, parse_html=True) 
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=color,
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(World_map)  
    
World_map

Map of Toronto with the Venues downloaded from Foursquare 

In [None]:
Toronto_data.loc[0, 'Neighbourhood']

In [None]:
neighborhood_latitude = Toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Toronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

## 4. Analize each Neighbourhood

In [18]:

LIMIT = 200 # limit of number of venues returned by Foursquare API

radius = 500 # define radius


url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url 

'https://api.foursquare.com/v2/venues/explore?&client_id=JMVF4CRSAGPKHIZBTLP1JASTXCWG2TXX2V54BMJ24X13HJZH&client_secret=3S4JISGI0RES14MOR25VSBKTP22E3U4BGVJXLY4DCCP25ZOY&v=20180605&ll=43.7532586,-79.3296565&radius=500&limit=200'

In [19]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5df616375fb726001bb4721a'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 2,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        'cc': 'CA',
        'c

In [20]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [21]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Variety Store,Food & Drink Shop,43.751974,-79.333114


In [22]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

2 venues were returned by Foursquare.


In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [24]:
Toronto_venues = getNearbyVenues(names=Toronto_data['Neighbourhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude']
                                  )




Parkwoods
Victoria Village
Harbourfront
Lawrence Heights, Lawrence Manor
Queen's Park
Queen's Park
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale


KeyError: 'groups'

In [25]:
print(Toronto_venues.shape)
Toronto_venues.head()

NameError: name 'Toronto_venues' is not defined

In [None]:
Toronto_venues.groupby('Neighbourhood').count()

In [None]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

In [None]:

Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighbourhood'] 

fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

In [None]:
Toronto_onehot.shape

In [None]:
Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()


In [None]:
Toronto_grouped.shape

In [None]:
num_top_venues = 5

for hood in Toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = Toronto_grouped[Toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

## 5. Cluster Neighbourhoods

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = Toronto_grouped['Neighbourhood']

for ind in np.arange(Toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

In [None]:
kclusters = 8

Toronto_grouped_clustering = Toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_merged = Toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_merged = Toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

Toronto_merged.head() # check the last columns!

Down below it is desplayed the map with the different designed clusters

In [None]:
# create map
map_clusters = folium.Map(location=[43.65, -79.35], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
Toronto_merged.dropna(inplace=True)
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighbourhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 6. Examine Clusters

After clustering between three and ten groups we have decided that the optimum is found at 4 different clusters

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 0, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 1, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 2, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 3, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 4, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 5, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 6, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 7, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 8, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]

In [None]:
Toronto_merged.loc[Toronto_merged['Cluster Labels'] == 9, Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]]